In [1]:
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from skimage.transform import pyramid_gaussian
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
# import matplotlib.pyplot as plt
from inspect import signature

# print(glob.glob("../ILIYAN Master Thesis/Dataset/*"))

%matplotlib inline
%precision 2

'%.2f'

In [2]:
import time
start_time = time.time()

In [3]:
df = pd.read_csv('dataset/word2vec/tfidf_stem_3.csv', index_col=0)
df_t1 = pd.read_csv('dataset/w2v_fs/df_top_1_stem_3.csv', index_col=0)
df_t1t2 = pd.read_csv('dataset/w2v_fs/df_top_1_and_2_stem_3.csv', index_col=0)

In [4]:
df_class = pd.read_csv('dataset/features_norm.csv', index_col=0)
df['Class'] = df_class['Class']
df_t1['Class'] = df_class['Class']
df_t1t2['Class'] = df_class['Class']

In [5]:
def draw_confusionmatrix(y_test, y_hat):
    plt.figure(figsize=(10,7))
    cm = confusion_matrix(y_test, y_hat)
    ax = sns.heatmap(cm, annot=True, fmt="d")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    acc = accuracy_score(y_test, y_hat)
    print(f"Sum Axis-1 as Classification accuracy: {acc}")
    print('\n')
    print(classification_report(y_test, y_hat))
    print('\n')

In [6]:
def gen_train_and_test(df, test_size=0.20, random_state=42):
    X = df.loc[:, df.columns != 'Class']
    y = df.Class
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    return X_train, X_test, y_train, y_test

In [7]:
def split_train_and_test(X, y, test_size=0.20, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)    
    return X_train, X_test, y_train, y_test

In [22]:
def start_model(df, model, cv=10):
    X = df.loc[:, df.columns != 'Class']
    y = df.Class
    print(" avg accuracy:"+str(np.average(cross_val_score(model, X, y, scoring='accuracy', cv=cv))))
    print("       avg f1:"+str(np.average(cross_val_score(model, X, y, scoring='f1', cv=cv))))
    print("avg precision:"+str(np.average(cross_val_score(model, X, y, scoring='precision', cv=cv))))
    print("   avg recall:"+str(np.average(cross_val_score(model, X, y, scoring='recall', cv=cv))))

In [9]:
def start_grid_cv(df, cv=10, n_jobs=-1):
    param_grid = {
                  'kernel':('linear', 'rbf'),
                  'C':(0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 1, 10),
                  'gamma': (0.001, 0.01, 0.1, 1,2,3,'auto'),
                  'decision_function_shape':('ovo','ovr'),
                  'shrinking':(True,False)
                 }
    X_train, X_test, y_train, y_test = gen_train_and_test(df)
    clf = GridSearchCV(estimator=svm.SVC(), param_grid=param_grid, cv=cv, n_jobs=n_jobs)
    clf.fit(X_train, y_train)
    
    print("avg accuracy:"+str(np.average(cross_val_score(clf, X_train, y_train, scoring='accuracy'))))
    print("avg f1:"+str(np.average(cross_val_score(clf, X_train, y_train, scoring='f1'))))
    print("avg precision:"+str(np.average(cross_val_score(clf, X_train, y_train, scoring='precision'))))
    print("avg recall:"+str(np.average(cross_val_score(clf, X_train, y_train, scoring='recall'))))
    
    best_params = clf.best_params_
    print(best_params)
    
    return best_params

### Using "df"

In [10]:
params = start_grid_cv(df, cv=10)

avg accuracy:0.549450549451
avg f1:0.709090909091
avg precision:0.549450549451
avg recall:1.0
{'C': 0.001, 'decision_function_shape': 'ovo', 'gamma': 0.001, 'kernel': 'linear', 'shrinking': True}


In [23]:
params = {'C': 0.001, 'decision_function_shape': 'ovo', 'gamma': 0.001, 'kernel': 'linear', 'shrinking': True}
svm_gscv=svm.SVC(kernel=params['kernel'], C=params['C'], decision_function_shape=params['decision_function_shape'], gamma=params['gamma'], shrinking=params['shrinking'])

start_model(df, model=svm_gscv)

 avg accuracy:0.56
       avg f1:0.716666666667
avg precision:0.56
   avg recall:1.0


### Using "df_t1"

In [13]:
params = start_grid_cv(df_t1, cv=10)

avg accuracy:0.824175824176
avg f1:0.857843137255
avg precision:0.784259259259
avg recall:0.958333333333
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}


In [24]:
params = {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}
svm_gscv=svm.SVC(kernel=params['kernel'], C=params['C'], decision_function_shape=params['decision_function_shape'], gamma=params['gamma'], shrinking=params['shrinking'])

start_model(df_t1, model=svm_gscv)

 avg accuracy:0.938333333333
       avg f1:0.951428571429
avg precision:0.916666666667
   avg recall:1.0


### Using "df_t1t2"

In [16]:
params = start_grid_cv(df_t1t2, cv=10)

avg accuracy:0.873626373626
avg f1:0.894444444444
avg precision:0.851851851852
avg recall:0.958333333333
{'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}


In [25]:
params = {'C': 10, 'decision_function_shape': 'ovo', 'gamma': 1, 'kernel': 'rbf', 'shrinking': True}
svm_gscv=svm.SVC(kernel=params['kernel'], C=params['C'], decision_function_shape=params['decision_function_shape'], gamma=params['gamma'], shrinking=params['shrinking'])

start_model(df_t1t2, model=svm_gscv)

 avg accuracy:0.918333333333
       avg f1:0.940714285714
avg precision:0.901666666667
   avg recall:1.0


In [19]:
elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

'00:18:54'