In [20]:
import numpy as np
import csv
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve,auc
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [12]:
def readData(num):
    """Load dataset variant ``num`` and build a class-balanced training set.

    Reads ``X_test_Orig_<num>.csv`` / ``y_test_<num>.csv`` and the ten training
    parts ``X_Orig_<num>_<k>.csv`` / ``Y_<num>_<k>.csv`` (k = 1..10) from
    ``Results/Datasets/<num>/Orig/``, stacks the parts row-wise, and then
    undersamples the ``-1`` rows so that they match the number of ``1`` rows.

    Parameters
    ----------
    num : int or str
        Dataset identifier; it is spliced into the directory and file names.

    Returns
    -------
    tuple
        ``(x_test, y_test, x_train, y_train)`` as NumPy arrays. The training
        rows are ordered as: the sampled ``-1`` rows first, then every ``1`` row.

    Notes
    -----
    * NumPy's global RNG is reseeded with 0 on every call so the subsample is
      reproducible; this also resets the random stream for later code.
    * The ``-1`` rows are drawn with ``np.random.choice`` using its default
      (with replacement), so the same row can appear more than once.
    * The label files are assumed to hold a single column.
    """
    folder='Results/Datasets/'+str(num)+'/Orig/'
    x_test=pd.read_csv(folder+'X_test_Orig_'+str(num)+'.csv').to_numpy()
    y_test=pd.read_csv(folder+'y_test_'+str(num)+'.csv').to_numpy()

    datas=[]
    labels=[]
    for i in range(10):
        suffix=str(num)+'_'+str(i+1)+'.csv'
        datas.append(pd.read_csv(folder+'X_Orig_'+suffix).to_numpy())
        labels.append(pd.read_csv(folder+'Y_'+suffix).to_numpy())

    # Stack all parts in one call instead of growing the arrays part by part.
    data=np.concatenate(datas,axis=0)
    label=np.concatenate(labels,axis=0)

    # Flatten to 1-D using the actual size rather than a hard-coded row count.
    label2=label.reshape(-1)

    np.random.seed(0)
    index_0=np.where(label2==-1)[0]
    index_1=np.where(label2==1)[0]

    # Draw as many -1 rows as there are 1 rows (previously a hard-coded 1609).
    mask1=np.random.choice(index_0,index_1.size)
    mask=np.concatenate((mask1,index_1))

    label_3=label[mask]
    data_3=data[mask]
    return (x_test,y_test,data_3,label_3)

# Base dataset

In [14]:
# Load dataset 0 and show the shape of each returned split.
x_test0, y_test0, x_train0, y_train0 = readData(0)
print(x_test0.shape, y_test0.shape, x_train0.shape, y_train0.shape)

(2562, 4) (2562, 1) (3218, 4) (3218, 1)


In [15]:
def normData(x_train,x_test,epsilon=1e-7):
    """Standardise both splits using statistics of the training split only.

    Parameters
    ----------
    x_train : array-like
        Data whose per-column mean and standard deviation are computed.
    x_test : array-like
        Data transformed with the same mean and standard deviation.
    epsilon : float, optional
        Added to the standard deviation so constant columns do not divide by 0.

    Returns
    -------
    tuple
        ``(x_train_scaled, x_test_scaled)``.
    """
    center = np.mean(x_train, axis=0)
    scale = np.std(x_train, axis=0) + epsilon
    train_scaled = (x_train - center) / scale
    test_scaled = (x_test - center) / scale
    return (train_scaled, test_scaled)

In [22]:
def svmCross(train_data,test_data):
    """Print 10-fold cross-validated accuracy of an RBF SVM over a fixed grid.

    Every combination of ``gamma`` in (0.1, 1, 10) and ``C`` in
    (1e2, 1e3, 1e4) is scored with ``cross_val_score`` (``cv=10``,
    ``scoring='accuracy'``) and the mean fold accuracy is printed.

    Parameters
    ----------
    train_data : array-like
        Feature matrix used for cross-validation.
    test_data : array-like
        Despite its name, this is the label array that goes with
        ``train_data``. A column vector of shape ``(n, 1)`` is accepted.

    Returns
    -------
    None
        Results are only printed.
    """
    gamma=[1e-1,1,1e1]
    C=[1e2,1e3,1e4]
    # scikit-learn expects 1-D targets; flatten explicitly instead of relying
    # on its implicit column-vector conversion.
    targets=np.ravel(test_data)
    for g in gamma:
        for c in C:
            clf=svm.SVC(kernel='rbf',C=c,gamma=g)
            scores=cross_val_score(clf,train_data,targets,cv=10,scoring='accuracy')
            print('C: ',c,'gamma',g,'average: ',scores.mean())

In [16]:
# Scale dataset 0; the statistics come from the training split.
x_train0_norm, x_test0_norm = normData(x_train0, x_test0)

In [23]:
# Cross-validated grid search over C and gamma on dataset 0.
svmCross(x_train0_norm, y_train0)

C:  100.0 gamma 0.1 average:  0.7162791149068323
C:  1000.0 gamma 0.1 average:  0.7541906055900621
C:  10000.0 gamma 0.1 average:  0.828167701863354
C:  100.0 gamma 1 average:  0.8738392857142857
C:  1000.0 gamma 1 average:  0.8816090838509318
C:  10000.0 gamma 1 average:  0.8872030279503106
C:  100.0 gamma 10.0 average:  0.8875155279503106
C:  1000.0 gamma 10.0 average:  0.8872069099378882
C:  10000.0 gamma 10.0 average:  0.8862732919254659


In [24]:
def testAccuracy(clf,test_x,test_y):
    pred=clf.predict(test_x)
    true=np.reshape(test_y,pred.shape)
    accuracy=np.sum(pred==true)/true.shape[0]
    print('test accuracy: ',accuracy)

In [34]:
def performance(clf,test_x,test_y):
    """Print accuracy, precision, recall, F1 and ROC-AUC of ``clf`` on a test set.

    Accuracy, precision, recall and F1 are computed from ``clf.predict``; the
    ROC curve and its area are computed from ``clf.decision_function``.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict`` and ``decision_function``.
    test_x : array-like
        Test features.
    test_y : array-like
        Ground-truth labels; a column vector is reshaped to match the
        predictions before any metric is computed.

    Returns
    -------
    None
        The metrics are only printed.
    """
    pred=clf.predict(test_x)
    # Use the flattened labels for every metric instead of the raw column vector.
    true=np.reshape(test_y,pred.shape)
    accuracy=metrics.accuracy_score(true,pred)
    precision=metrics.precision_score(true,pred)
    recall=metrics.recall_score(true,pred)
    f1=metrics.f1_score(true,pred)
    # Continuous decision scores are needed for the ROC curve; keep them
    # separate from the hard predictions.
    scores=clf.decision_function(test_x)
    fpr,tpr,_=metrics.roc_curve(true,scores)
    # Named roc_auc so the imported ``auc`` function is not shadowed.
    roc_auc=metrics.auc(fpr,tpr)
    print('accuracy: ',accuracy)
    print('precision: ',precision)
    print('recall: ',recall)
    print('f1: ',f1)
    print('auc',roc_auc)
    

In [35]:
# Chosen hyper-parameters: C=1000, gamma=10.
clf0 = svm.SVC(C=1000, gamma=10, kernel='rbf')
clf0.fit(x_train0_norm, y_train0)
performance(clf0, x_test0_norm, y_test0)

accuracy:  0.9352068696330992
precision:  0.8052256532066508
recall:  0.8014184397163121
f1:  0.8033175355450237
auc 0.8944365421193925


# BaseCampaign

In [37]:
# Load dataset 1 and show the shape of each returned split.
x_test1, y_test1, x_train1, y_train1 = readData(1)
print(x_test1.shape, y_test1.shape, x_train1.shape, y_train1.shape)

(2562, 46) (2562, 1) (3218, 46) (3218, 1)


In [38]:
# Scale dataset 1; the statistics come from the training split.
x_train1_norm, x_test1_norm = normData(x_train1, x_test1)

In [39]:
# Cross-validated grid search over C and gamma on dataset 1.
svmCross(x_train1_norm, y_train1)

C:  100.0 gamma 0.1 average:  0.9027445652173913
C:  1000.0 gamma 0.1 average:  0.9027445652173913
C:  10000.0 gamma 0.1 average:  0.902123447204969
C:  100.0 gamma 1 average:  0.902123447204969
C:  1000.0 gamma 1 average:  0.9015023291925466
C:  10000.0 gamma 1 average:  0.8993284161490683
C:  100.0 gamma 10.0 average:  0.9015023291925466
C:  1000.0 gamma 10.0 average:  0.898707298136646
C:  10000.0 gamma 10.0 average:  0.8962208850931678


In [40]:
# Chosen hyper-parameters: C=1000, gamma=0.1.
clf1 = svm.SVC(C=1000, gamma=0.1, kernel='rbf')
clf1.fit(x_train1_norm, y_train1)
performance(clf1, x_test1_norm, y_test1)

accuracy:  0.9504293520686963
precision:  0.8737373737373737
recall:  0.817966903073286
f1:  0.8449328449328448
auc 0.903910490419398


# BaseCampaignBin

In [41]:
# Load dataset 2 and show the shape of each returned split.
x_test2, y_test2, x_train2, y_train2 = readData(2)
print(x_test2.shape, y_test2.shape, x_train2.shape, y_train2.shape)

(2562, 48) (2562, 1) (3218, 48) (3218, 1)


In [42]:
# Scale dataset 2; the statistics come from the training split.
x_train2_norm, x_test2_norm = normData(x_train2, x_test2)

In [43]:
# Cross-validated grid search over C and gamma on dataset 2.
svmCross(x_train2_norm, y_train2)

C:  100.0 gamma 0.1 average:  0.9027445652173913
C:  1000.0 gamma 0.1 average:  0.9027445652173913
C:  10000.0 gamma 0.1 average:  0.9015023291925466
C:  100.0 gamma 1 average:  0.902123447204969
C:  1000.0 gamma 1 average:  0.9011917701863353
C:  10000.0 gamma 1 average:  0.8990178571428571
C:  100.0 gamma 10.0 average:  0.9008812111801243
C:  1000.0 gamma 10.0 average:  0.8983967391304347
C:  10000.0 gamma 10.0 average:  0.8962208850931678


In [45]:
# Chosen hyper-parameters: C=1000, gamma=1.
# NOTE(review): the grid search printed above scored C=1000/gamma=0.1 slightly
# higher (0.9027 vs 0.9012) - confirm that gamma=1 is the intended choice.
clf2 = svm.SVC(C=1000, gamma=1, kernel='rbf')
clf2.fit(x_train2_norm, y_train2)
performance(clf2, x_test2_norm, y_test2)

accuracy:  0.9480874316939891
precision:  0.8606965174129353
recall:  0.817966903073286
f1:  0.8387878787878787
auc 0.9003317871301518


# BaseBow

In [46]:
# Load dataset 3 and show the shape of each returned split.
x_test3, y_test3, x_train3, y_train3 = readData(3)
print(x_test3.shape, y_test3.shape, x_train3.shape, y_train3.shape)

(2562, 104) (2562, 1) (3218, 104) (3218, 1)


In [47]:
# Scale dataset 3; the statistics come from the training split.
x_train3_norm, x_test3_norm = normData(x_train3, x_test3)

In [48]:
# Cross-validated grid search over C and gamma on dataset 3.
svmCross(x_train3_norm, y_train3)

C:  100.0 gamma 0.1 average:  0.8918730590062113
C:  1000.0 gamma 0.1 average:  0.8915625
C:  10000.0 gamma 0.1 average:  0.8915625
C:  100.0 gamma 1 average:  0.8831657608695652
C:  1000.0 gamma 1 average:  0.8831657608695652
C:  10000.0 gamma 1 average:  0.8819235248447205
C:  100.0 gamma 10.0 average:  0.8834763198757765
C:  1000.0 gamma 10.0 average:  0.8813024068322981
C:  10000.0 gamma 10.0 average:  0.8819235248447205


In [49]:
# Chosen hyper-parameters: C=1000, gamma=0.1.
clf3 = svm.SVC(C=1000, gamma=0.1, kernel='rbf')
clf3.fit(x_train3_norm, y_train3)
performance(clf3, x_test3_norm, y_test3)

accuracy:  0.9449648711943794
precision:  0.8578680203045685
recall:  0.7990543735224587
f1:  0.8274173806609547
auc 0.8890651715246625


# BaseBowBin

In [50]:
# Load dataset 4 and show the shape of each returned split.
x_test4, y_test4, x_train4, y_train4 = readData(4)
print(x_test4.shape, y_test4.shape, x_train4.shape, y_train4.shape)

(2562, 106) (2562, 1) (3218, 106) (3218, 1)


In [51]:
# Scale dataset 4; the statistics come from the training split.
x_train4_norm, x_test4_norm = normData(x_train4, x_test4)

In [52]:
# Cross-validated grid search over C and gamma on dataset 4.
svmCross(x_train4_norm, y_train4)

C:  100.0 gamma 0.1 average:  0.89125
C:  1000.0 gamma 0.1 average:  0.89125
C:  10000.0 gamma 0.1 average:  0.8903202639751553
C:  100.0 gamma 1 average:  0.8822340838509317
C:  1000.0 gamma 1 average:  0.8816129658385095
C:  10000.0 gamma 1 average:  0.8803707298136647
C:  100.0 gamma 10.0 average:  0.8788179347826087
C:  1000.0 gamma 10.0 average:  0.8766440217391305
C:  10000.0 gamma 10.0 average:  0.8750912267080745


In [53]:
# Chosen hyper-parameters: C=1000, gamma=0.1.
clf4 = svm.SVC(C=1000, gamma=0.1, kernel='rbf')
clf4.fit(x_train4_norm, y_train4)
performance(clf4, x_test4_norm, y_test4)

accuracy:  0.9453551912568307
precision:  0.8618925831202046
recall:  0.7966903073286052
f1:  0.828009828009828
auc 0.8935512606695204
