In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils import shuffle
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, RFE, SelectFromModel
from sklearn.linear_model import LogisticRegression
pd.options.mode.chained_assignment = None
import warnings
warnings.filterwarnings('ignore')


In [2]:
plt.rcParams['figure.figsize'] = (24, 10)
plt.rcParams['font.size'] = 14

In [3]:
object_count = 30

In [4]:
# df = pd.read_table("./data/castle-or-lock.tsv")
df = pd.read_table("./data/SMS.tsv")
df

Unnamed: 0,class,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df.describe()

Unnamed: 0,class,text
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


In [6]:
df.text

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [7]:
vectorizer = TfidfVectorizer(min_df=3, token_pattern="([^\d\W]{4,})")
X = vectorizer.fit_transform(df.text)
vectorizer.get_feature_names_out()

array(['aathi', 'abiola', 'able', ..., 'yours', 'yourself', 'yummy'],
      dtype=object)

In [8]:
X

<5572x2106 sparse matrix of type '<class 'numpy.float64'>'
	with 35532 stored elements in Compressed Sparse Row format>

In [9]:
feature = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [10]:
feature

Unnamed: 0,aathi,abiola,able,about,abta,accept,access,accident,accidentally,accordingly,...,yest,yesterday,yetunde,yijue,yoga,yogasana,your,yours,yourself,yummy
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
feature.columns

Index(['aathi', 'abiola', 'able', 'about', 'abta', 'accept', 'access',
       'accident', 'accidentally', 'accordingly',
       ...
       'yest', 'yesterday', 'yetunde', 'yijue', 'yoga', 'yogasana', 'your',
       'yours', 'yourself', 'yummy'],
      dtype='object', length=2106)

In [12]:
feature.shape

(5572, 2106)

In [13]:
sel = VarianceThreshold(threshold=0.0003)
sel.fit_transform(feature)
feature = feature[feature.columns[sel.get_support(indices=True)]]
feature

Unnamed: 0,aathi,abiola,able,about,account,actually,address,advance,after,afternoon,...,wrong,xmas,yeah,year,years,yesterday,yijue,your,yours,yourself
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
feature.shape

(5572, 831)

In [15]:
target = df["class"]

In [16]:
X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.33, random_state=42)

In [17]:
X_train

Unnamed: 0,aathi,abiola,able,about,account,actually,address,advance,after,afternoon,...,wrong,xmas,yeah,year,years,yesterday,yijue,your,yours,yourself
3235,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5528,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def pearson(feature, target):
    tmp = feature.copy()
    tmp["class"] = target
    tmp["target"] = pd.factorize(tmp["class"])[0]
    corr = tmp.corr()
    return list(corr["target"].drop("target").apply(abs).sort_values(ascending=False)[:30].index)

In [19]:
feature_columns_pearson = pearson(feature, target)
feature_columns_pearson

['claim',
 'mobile',
 'prize',
 'free',
 'call',
 'stop',
 'guaranteed',
 'urgent',
 'service',
 'cash',
 'nokia',
 'reply',
 'pobox',
 'contact',
 'awarded',
 'tone',
 'text',
 'code',
 'draw',
 'rate',
 'customer',
 'landline',
 'from',
 'valid',
 'ringtone',
 'collection',
 'latest',
 'apply',
 'video',
 'line']

In [20]:
feature_pearson = feature[feature_columns_pearson]

In [57]:
def SVM_RFE(feature, target, k=object_count):
    params = feature.columns
    while len(params) != k:
        print(len(params))
        clf = SVC(kernel="linear")
        clf.fit(feature[params], target)
        pairs = sorted(list(zip(params, *np.absolute(clf.coef_))), key=lambda x: x[1], reverse=True)[:max(k, max(int(len(params) * 0.9), 5))]
        params = [e[0] for e in pairs]
    return list(params)

In [58]:
feature_columns_svm = SVM_RFE(feature, target)
feature_columns_svm

831
747
672
604
543
488
439
395
355
319
287
258
232
208
187
168
151
135
121
108
97
87
78
70
63
56
50
45
40
36
32


['mobile',
 'claim',
 'awarded',
 'service',
 'nokia',
 'video',
 'collection',
 'prize',
 'optout',
 'games',
 'http',
 'stop',
 'private',
 'ringtone',
 'weekly',
 'pobox',
 'apply',
 'delivery',
 'tone',
 'code',
 'reveal',
 'cash',
 'services',
 'tones',
 'voucher',
 'custcare',
 'rate',
 'award',
 'chat',
 'quiz']

In [59]:
feature_svm = feature[feature_columns_svm]

In [24]:
from tqdm import tqdm

def sfs(feature_train, feature_test, target_train, target_test, k=object_count):
    params = []
    q = set(feature_train.columns)
    for i in range(k):
        score = float("inf")
        add = None
        for j in tqdm(q):
            clf = DecisionTreeClassifier()
            current_features = feature_train[params + [j]]
            clf.fit(current_features, target_train)
            current_score = accuracy_score(target_test, clf.predict(feature_test[params + [j]]))
            if current_score < score:
                score = current_score
                add = j
        params.append(add)
        q.remove(add)

    return params

In [25]:
feature_columns_sfs = sfs(X_train, X_test, y_train, y_test)

100%|██████████| 831/831 [00:03<00:00, 233.16it/s]
100%|██████████| 830/830 [00:03<00:00, 227.46it/s]
100%|██████████| 829/829 [00:03<00:00, 231.56it/s]
100%|██████████| 828/828 [00:03<00:00, 214.84it/s]
100%|██████████| 827/827 [00:04<00:00, 200.44it/s]
100%|██████████| 826/826 [00:03<00:00, 206.66it/s]
100%|██████████| 825/825 [00:04<00:00, 196.76it/s]
100%|██████████| 824/824 [00:04<00:00, 181.57it/s]
100%|██████████| 823/823 [00:05<00:00, 160.30it/s]
100%|██████████| 822/822 [00:05<00:00, 152.87it/s]
100%|██████████| 821/821 [00:05<00:00, 136.88it/s]
100%|██████████| 820/820 [00:06<00:00, 125.42it/s]
100%|██████████| 819/819 [00:07<00:00, 116.90it/s]
100%|██████████| 818/818 [00:07<00:00, 113.59it/s]
100%|██████████| 817/817 [00:07<00:00, 109.30it/s]
100%|██████████| 816/816 [00:08<00:00, 95.82it/s] 
100%|██████████| 815/815 [00:08<00:00, 95.04it/s] 
100%|██████████| 814/814 [00:09<00:00, 85.50it/s]
100%|██████████| 813/813 [00:10<00:00, 79.45it/s]
100%|██████████| 812/812 [00:10<0

In [26]:
feature_columns_sfs

['want',
 'guys',
 'calls',
 'station',
 'trip',
 'girls',
 'time',
 'will',
 'last',
 'today',
 'well',
 'life',
 'true',
 'shopping',
 'money',
 'round',
 'know',
 'dont',
 'same',
 'crazy',
 'getting',
 'maybe',
 'into',
 'film',
 'wanna',
 'were',
 'happened',
 'half',
 'babe',
 'super']

In [27]:
X_train_sfs = X_train[feature_columns_sfs]
X_test_sfs = X_test[feature_columns_sfs]

In [28]:
selector = SelectFromModel(estimator=LogisticRegression()).fit(feature, target)
selector.estimator_.coef_

array([[-1.25246836e-01, -2.11906831e-01, -3.75232377e-01,
        -6.25888251e-01,  7.49276048e-01, -5.08039213e-01,
        -1.38056987e-01, -1.28996898e-01, -5.69383315e-01,
        -4.17050138e-01, -4.71012650e-01, -7.74406977e-01,
        -1.66697512e-01, -1.95947345e-01, -2.97285643e-01,
         3.60826269e-01, -8.97981007e-01, -5.55849557e-01,
        -1.52255330e-01, -6.40378315e-01, -9.70612018e-01,
        -4.35188898e-01, -3.08597706e-01,  1.02196136e+00,
        -2.34121416e-01, -2.62066653e-01, -9.02193370e-01,
         3.65311903e-01, -3.96518355e-01,  1.88851823e+00,
        -2.37225822e-01,  9.36548748e-01, -2.87837318e-01,
        -1.54740230e-01,  4.56023512e-01, -2.45592897e-01,
        -8.04479284e-02, -2.44133282e-01,  1.39401420e+00,
         1.78505224e+00, -1.09353380e-01,  8.81408755e-01,
        -1.14633963e-01,  1.75216297e+00,  2.28713678e+00,
        -1.56446624e-02, -4.11488953e-01,  5.33338579e-02,
        -1.76109444e-01,  2.74016731e-01,  3.64361288e-0

In [29]:
selector.get_support()

array([False, False, False,  True,  True, False, False, False,  True,
       False, False,  True, False, False, False, False,  True,  True,
       False,  True,  True, False, False,  True, False, False,  True,
       False, False,  True, False,  True, False, False, False, False,
       False, False,  True,  True, False,  True, False,  True,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
        True, False, False, False, False, False,  True,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False,  True, False,
       False, False,  True, False,  True, False, False,  True, False,
       False, False, False,  True, False, False,  True, False,  True,
        True, False, False, False, False, False,  True, False, False,
        True,  True, False,  True,  True, False,  True,  True, False,
       False,  True,

In [30]:
feature_from_model = feature[feature.columns[selector.get_support(indices=True)]]
feature_from_model

Unnamed: 0,about,account,after,aight,already,alright,also,always,answer,anything,...,wont,word,work,working,worth,xmas,yeah,years,your,yours
0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.56728,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
feature_from_model.columns[:30]

Index(['about', 'account', 'after', 'aight', 'already', 'alright', 'also',
       'always', 'answer', 'anything', 'apply', 'area', 'attempt', 'auction',
       'await', 'award', 'awarded', 'been', 'bill', 'bonus', 'book', 'call',
       'calls', 'camera', 'cant', 'cash', 'chance', 'charge', 'chat',
       'choose'],
      dtype='object')

In [32]:
sel_chi2 = SelectKBest(chi2, k=object_count)
sel_chi2.fit(feature, target)
feature_chi2 = feature[feature.columns[sel_chi2.get_support(indices=True)]]
feature_chi2

Unnamed: 0,apply,award,awarded,call,cash,claim,code,collection,contact,customer,...,reply,ringtone,service,stop,text,tone,tones,urgent,valid,video
0,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.251673,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.182704,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.000000,0.0,0.0,0.156881,0.0,0.233973,0.0,0.0,0.255654,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
5568,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
5569,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
5570,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [33]:
feature_chi2.columns

Index(['apply', 'award', 'awarded', 'call', 'cash', 'claim', 'code',
       'collection', 'contact', 'customer', 'draw', 'entry', 'free',
       'guaranteed', 'landline', 'mobile', 'nokia', 'pobox', 'prize', 'rate',
       'reply', 'ringtone', 'service', 'stop', 'text', 'tone', 'tones',
       'urgent', 'valid', 'video'],
      dtype='object')

In [35]:
selector_RFE = RFE(RandomForestClassifier(), n_features_to_select=30, step=0.1)
selector_RFE.fit(feature, target)
feature_forest_RFE = feature[feature.columns[selector_RFE.get_support(indices=True)]]
feature_forest_RFE

Unnamed: 0,awarded,call,cash,chat,claim,contact,cost,draw,free,from,...,reply,send,service,stop,text,that,tone,urgent,with,your
0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.171241,0.0,...,0.0,0.0,0.0,0.0,0.182704,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.156881,0.0,0.0,0.233973,0.255654,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
5568,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
5569,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.341614,0.0,0.0,0.0,0.0
5570,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.224395,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0


In [36]:
def tree(feature_train, feature_test, target_train):
    clf = DecisionTreeClassifier()
    clf.fit(feature_train, target_train)
    return clf.predict(feature_test)

def SVM(feature_train, feature_test, target_train):
    clf = SVC()
    clf.fit(feature_train, target_train)
    return clf.predict(feature_test)

def kneigh(feature_train, feature_test, target_train):
    clf = KNeighborsClassifier()
    clf.fit(feature_train, target_train)
    return clf.predict(feature_test)

In [40]:
def check_clf(feature_full_train, feature_full_test, feature_cut_train, feature_cut_test, target_train, target_test, label=""):
    print(label)
    print(f"Tree. Old: {accuracy_score(target_test, tree(feature_full_train, feature_full_test, target_train))}, New: {accuracy_score(target_test, tree(feature_cut_train, feature_cut_test, target_train))}")
    print(f"SVM. Old: {accuracy_score(target_test, SVM(feature_full_train, feature_full_test, target_train))}, New: {accuracy_score(target_test, SVM(feature_cut_train, feature_cut_test, target_train))}")
    print(f"KNeigh. Old: {accuracy_score(target_test, kneigh(feature_full_train, feature_full_test, target_train))}, New: {accuracy_score(target_test, kneigh(feature_cut_train, feature_cut_test, target_train))}")


In [60]:
x_train_from, x_test_from, y_train_from, y_test_from = train_test_split(feature_from_model, target, test_size=0.33, random_state=42)
x_train_chi2, x_test_chi2, y_train_chi2, y_test_chi2 = train_test_split(feature_chi2, target, test_size=0.33, random_state=42)
x_train_forest, x_test_forest, y_train_forest, y_test_forest = train_test_split(feature_forest_RFE, target, test_size=0.33, random_state=42)
x_train_svm, x_test_svm, y_train_svm, y_test_svm = train_test_split(feature_svm, target, test_size=0.33, random_state=42)
x_train_pearson, x_test_pearson, y_train_pearson, y_test_pearson = train_test_split(feature_pearson, target, test_size=0.33, random_state=42)


In [61]:
check_clf(X_train, X_test, X_train_sfs, X_test_sfs, y_train, y_test, "SFS")
check_clf(X_train, X_test, x_train_svm, x_test_svm, y_train_svm, y_test_svm, "SVM-RFE")
check_clf(X_train, X_test, x_train_pearson, x_test_pearson, y_train_pearson, y_test_pearson, "Pearson")
check_clf(X_train, X_test, x_train_from, x_test_from, y_train_from, y_test_from, "From model")
check_clf(X_train, X_test, x_train_chi2, x_test_chi2, y_train_chi2, y_test_chi2, "SelectKBest-Chi2")
check_clf(X_train, X_test, x_train_forest, x_test_forest, y_train_forest, y_test_forest, "Forest-RFE")

SFS
Tree. Old: 0.957041870581838, New: 0.8123980424143556
SVM. Old: 0.9787928221859706, New: 0.8613376835236541
KNeigh. Old: 0.9282218597063622, New: 0.8510059815116912
SVM-RFE
Tree. Old: 0.9575856443719413, New: 0.9651984774333877
SVM. Old: 0.9787928221859706, New: 0.967373572593801
KNeigh. Old: 0.9282218597063622, New: 0.9662860250135944
Pearson
Tree. Old: 0.9564980967917346, New: 0.9472539423599783
SVM. Old: 0.9787928221859706, New: 0.9564980967917346
KNeigh. Old: 0.9282218597063622, New: 0.9483414899401849
From model
Tree. Old: 0.9564980967917346, New: 0.9564980967917346
SVM. Old: 0.9787928221859706, New: 0.977705274605764
KNeigh. Old: 0.9282218597063622, New: 0.9363784665579119
SelectKBest-Chi2
Tree. Old: 0.9543230016313213, New: 0.945078847199565
SVM. Old: 0.9787928221859706, New: 0.9564980967917346
KNeigh. Old: 0.9282218597063622, New: 0.9494290375203915
Forest-RFE
Tree. Old: 0.9559543230016313, New: 0.9603045133224578
SVM. Old: 0.9787928221859706, New: 0.9619358346927678
KNeigh