In [1]:
import sklearn.svm as sk
from sklearn.svm import LinearSVC, SVC
import sklearn.metrics as metrics
import sklearn.decomposition as decomp
import sklearn.preprocessing as preproc
import sklearn.pipeline as skp
import sklearn.model_selection as skmodel
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [6]:
def _prepare_features(frame):
    """
    Build the feature table handed to the pipeline.

    Drops the 'author' and 'lang' columns, then the first remaining column, and appends a
    1-based 'index' column. ``drop`` returns a new dataframe, so the caller's frame is untouched.
    :param frame: raw dataframe containing at least 'author' and 'lang' columns
    :return: the reduced dataframe
    """
    features = frame.drop(['author', 'lang'], axis=1)
    # presumably the first remaining column is the file name / identifier -- verify against the CSV layout
    features = features.drop(features.columns[0], axis=1)
    # NOTE(review): this 'index' column stays in the table given to the SVM, so the row order
    # is used as a feature. Kept as in the original; confirm that this is intended.
    features["index"] = range(1, len(features) + 1)
    return features


def _write_final_predictions(pipe, test, filenames, path="FINAL_PREDICTIONS.csv"):
    """
    Write the predictions of the fitted pipeline for ``test`` to ``path``.

    One '<class>_proba' column is written per class when the model exposes predict_proba
    (LinearSVC does not), followed by the decision-function values.
    :param pipe: fitted pipeline
    :param test: feature table of the test set
    :param filenames: identifiers written to the 'filename' column
    :param path: output CSV file
    """
    labels = list(pipe.classes_)
    columns = {'filename': filenames}

    # predict_proba columns follow the order of classes_, so each class gets its own column.
    if hasattr(pipe, "predict_proba"):
        probas = pipe.predict_proba(test)
        for pos, label in enumerate(labels):
            columns['{}_proba'.format(label)] = probas[:, pos]

    decs = pipe.decision_function(test)
    if decs.ndim == 1:
        # Binary case: a single signed score, negative for classes_[0] and positive for classes_[1].
        columns['{}(-)/{}(+)'.format(labels[0], labels[1])] = decs
    else:
        per_class = decs.shape[1] == len(labels)
        for pos in range(decs.shape[1]):
            name = '{}_decision'.format(labels[pos]) if per_class else 'decision_{}'.format(pos)
            columns[name] = decs[:, pos]

    pd.DataFrame(data=columns).to_csv(path)


def _export_coefficients(pipe, feature_names, path="coefficients.csv"):
    """
    Save the coefficients of the fitted model to ``path`` and plot them row by row.

    :param pipe: fitted pipeline whose 'model' step exposes coef_ (linear models only)
    :param feature_names: column labels of the training table
    :param path: output CSV file
    """
    coefs = pipe.named_steps['model'].coef_
    labels = list(pipe.classes_)

    if coefs.shape[0] == len(labels):
        # One row per class (multi-class one-vs-rest).
        row_labels = labels
    elif coefs.shape[0] == 1 and len(labels) == 2:
        # Two classes give a single row of weights; positive weights favour classes_[1].
        row_labels = [labels[1]]
    else:
        # Any other layout (e.g. one-vs-one SVC): rows cannot be matched to single classes here.
        row_labels = ['row_{}'.format(pos) for pos in range(coefs.shape[0])]

    pd.DataFrame(coefs, index=row_labels, columns=feature_names).to_csv(path)

    # TODO: make the number of top features optional?
    # plot_coefficients is not defined in this part of the notebook; it must be in scope at call time.
    for row, label in zip(coefs, row_labels):
        plot_coefficients(row, feature_names, label)


def train_svm(train, test, cross_validate=None, k=10, dim_reduc=None, norms=True, kernel="LinearSVC", final_pred=False, get_coefs=False):
    """
    Train an SVM on ``train`` and optionally cross-validate it, predict ``test`` and export coefficients.
    :param train: train data (pandas dataframe with 'author' and 'lang' columns)
    :param test: test data (same layout), or None when no test set is needed
    :param cross_validate: None, 'leave-one-out' or 'k-fold'
    :param k: number of splits for k-fold cross validation
    :param dim_reduc: accepted for compatibility; not used by this function
    :param norms: perform normalisations, i.e. z-scores and L2 (default True)
    :param kernel: "LinearSVC" for sklearn's LinearSVC, otherwise a kernel name passed to SVC
    :param final_pred: write the predictions for ``test`` to FINAL_PREDICTIONS.csv (requires ``test``)
    :param get_coefs: if true, writes coefficients.csv and plots the coefficients of the final model
    :return: ``(pipe, cv_results)`` when cross_validate is set, otherwise None. With cross validation the
    function also prints a classification report and writes confusion_matrix.csv and misattributions.csv.
    :raises ValueError: on an unknown cross_validate value, or final_pred without a test set
    """
    # Fail early on arguments that would otherwise only blow up half-way through the run.
    if cross_validate not in (None, 'leave-one-out', 'k-fold'):
        raise ValueError(
            "cross_validate must be None, 'leave-one-out' or 'k-fold', got {!r}".format(cross_validate))
    if final_pred and test is None:
        raise ValueError("final_pred=True requires a test set")

    print(".......... Formatting data ........")
    # Save the classes before the label column is removed
    classes = list(train.loc[:, 'author'])
    train = _prepare_features(train)

    preds_index = None
    if test is not None:
        # Identifiers come from the first column of the test set that was passed in
        # (the original read the notebook-level df_test here).
        preds_index = list(test.iloc[:, 0])
        test = _prepare_features(test)

    # CREATING PIPELINE
    print(".......... Creating pipeline according to user choices ........")
    estimators = []

    if norms:
        print(".......... using normalisations ........")
        # Z-scores, using the built-in scaler
        estimators.append(('scaler', preproc.StandardScaler()))
        # L2 normalisation
        # cf. https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
        estimators.append(('normalizer', preproc.Normalizer()))

    print(".......... choosing SVM ........")

    if kernel == "LinearSVC":
        # faster linear implementation (no predict_proba)
        estimators.append(('model', sk.LinearSVC()))
    else:
        # probability=True enables predict_proba on the fitted model
        estimators.append(('model', sk.SVC(kernel=kernel, probability=True)))

    print(".......... Creating pipeline with steps ........")
    print(estimators)
    pipe = skp.Pipeline(estimators)

    cv_results = None
    if cross_validate is not None:
        if cross_validate == 'leave-one-out':
            myCV = skmodel.LeaveOneOut()
        else:
            myCV = skmodel.KFold(n_splits=k)

        print(".......... "+ cross_validate +" cross validation will be performed ........")
        # The number printed here is the number of splits of the cross validation.
        print(".......... using " + str(myCV.get_n_splits(train)) + " samples ........")

        # Out-of-fold predictions for the report, then per-fold timings and scores.
        preds = skmodel.cross_val_predict(pipe, train, classes, cv=myCV, verbose=1, n_jobs=-1)
        cv_results = skmodel.cross_validate(pipe, train, classes, cv=myCV, verbose=1, n_jobs=-1)

        # Sorted so that the layout of the confusion matrix is the same on every run.
        unique_labels = sorted(set(classes))
        pd.DataFrame(metrics.confusion_matrix(classes, preds, labels=unique_labels),
                     index=['true:{:}'.format(x) for x in unique_labels],
                     columns=['pred:{:}'.format(x) for x in unique_labels]).to_csv("confusion_matrix.csv")

        print(metrics.classification_report(classes, preds))

        # writing misattributions
        pd.DataFrame([row for row in zip(list(train.index), list(classes), list(preds)) if row[1] != row[2]],
                     columns=["id", "True", "Pred"]
                     ).set_index('id').to_csv("misattributions.csv")

        # The cross-validation helpers fit clones, so the final model is trained here when needed.
        if final_pred or get_coefs:
            print(".......... Training final SVM with all train set ........")
            pipe.fit(train, classes)

    else:
        # Simple case: a single SVM trained on the whole train set.
        # No evaluation on the test set is performed in this mode.
        pipe.fit(train, classes)

    if final_pred:
        print(".......... Writing final predictions to FINAL_PREDICTIONS.csv ........")
        _write_final_predictions(pipe, test, preds_index)

    if get_coefs:
        _export_coefficients(pipe, train.columns)

    # Return contract kept from the original: a tuple with cross validation, None otherwise.
    if cross_validate is not None:
        return pipe, cv_results

In [7]:
# Load the training and test tables (paths are relative to the working directory).
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [6]:
# k-fold cross validation with the function's default k=10 and default LinearSVC model.
pipe_text_CV, scores_text_CV = train_svm(train=df_train, test=df_test, cross_validate='k-fold')

.......... Formatting data ........
.......... Creating pipeline according to user choices ........
.......... using normalisations ........
.......... choosing SVM ........
.......... Creating pipeline with steps ........
[('scaler', StandardScaler()), ('normalizer', Normalizer()), ('model', LinearSVC())]
.......... k-fold cross validation will be performed ........
.......... using 10 samples ........


[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.4s finished
[Parallel(n_jobs=-1)]: Using backend SequentialBackend with 1 concurrent workers.


              precision    recall  f1-score   support

          AU       0.96      0.96      0.96        46
         CAE       0.96      0.96      0.96        55

    accuracy                           0.96       101
   macro avg       0.96      0.96      0.96       101
weighted avg       0.96      0.96      0.96       101



[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.6s finished


In [174]:
# Per-fold fit time, score time and test score returned by the cross validation.
pd.DataFrame(data=scores_text_CV)

Unnamed: 0,fit_time,score_time,test_score
0,0.224275,0.077451,1.0
1,0.220805,0.082177,1.0
2,0.241964,0.075271,1.0
3,0.157832,0.038972,0.9
4,0.126612,0.054724,1.0
5,0.141717,0.051384,1.0
6,0.112758,0.038979,1.0
7,0.114959,0.038072,1.0
8,0.105856,0.040657,1.0
9,0.118132,0.069238,1.0


In [175]:
# Mean test score over the first three folds only (not over the whole cross validation).
first_three_scores = scores_text_CV['test_score'][:3]
sum(first_three_scores) / len(first_three_scores)

1.0

In [8]:
# Train an SVC with a linear kernel on the full train set and write FINAL_PREDICTIONS.csv.
train_svm(train=df_train, test=df_test, kernel="linear", final_pred=True)

.......... Formatting data ........
.......... Creating pipeline according to user choices ........
.......... using normalisations ........
.......... choosing SVM ........
.......... Creating pipeline with steps ........
[('scaler', StandardScaler()), ('normalizer', Normalizer()), ('model', SVC(kernel='linear', probability=True))]
.......... Writing final predictions to FINAL_PREDICTIONS.csv ........
