In [141]:
import pandas as pd
import numpy as np

In [142]:
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [143]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, accuracy_score

In [144]:
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomTreesEmbedding

In [145]:
data = pd.read_csv('/Users/egorsolovev/Downloads/behavior.csv')

In [146]:
data['tag']

0        0
1        0
2        1
3        0
4        1
        ..
49995    0
49996    1
49997    0
49998    1
49999    1
Name: tag, Length: 50000, dtype: int64

In [147]:
data.head()

Unnamed: 0.1,Unnamed: 0,hkey_local_machine\\software\microsoft\windows\currentversion\uninstall\mobileoptionpack,hkey_classes_root\\clsid\{148bd52a-a2ab-11ce-b11f-00aa00530503},hkey_current_user\\software\microsoft\windows\currentversion\urlmon settings,hkey_local_machine\\system\currentcontrolset\services\tcpip\parameters\interfaces\ms tcp loopback interface,hkey_local_machine\\software\microsoft\net framework setup\ndp\v3.5,hkey_local_machine\\software\microsoft\windows nt\currentversion\winlogon,hkey_local_machine\\software\microsoft\windows\currentversion\internet settings\user agent,hkey_local_machine\\system\currentcontrolset\services\dnscache\parameters,hkey_current_user\\software\microsoft\windows\currentversion\explorer\mountpoints2\cpc\volume,...,c:\windows\system32\kernel32.dll,c:\documents and settings\username\application data\microsoft\cryptneturlcache\metadata\2bf68f4714092295550497dd56f57004,c:\windows\system32\shdocvw.dll,c:\windows\system32\psapi.dll,c:\windows\system32\cmd.exe,c:\windows\dll\mscorlib.pdb,\\.\vboxguest,unnamedfile,wdmaud.drv_file,tag
0,00005122106941E2A0B5A651249D71ADD11C561167F3F9...,0,0,0,0,0,2,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0000CF95A98B33F00F4CDCBD13017EAB7B9589BE79617D...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,00015A2AEE06230C7E12B8D1E79F345F90008F0DD3175B...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0001D0A0243A15D78F10DB87FD721E4F06F3B9892257FC...,0,0,0,0,0,4,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0001D6A56EBCF3C0DEF053DA01454BF23021DE7DF1FC48...,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [148]:
y = data['tag'].to_numpy()
X = data.drop(['tag', 'Unnamed: 0'], axis=1).to_numpy()

In [149]:
X.shape, y.shape

((50000, 853), (50000,))

In [150]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((40000, 853), (40000,), (10000, 853), (10000,))

# Стандартизация

In [151]:
scaler = StandardScaler()
X_train_trfm = scaler.fit_transform(X_train)
X_test_trfm = scaler.transform(X_test)

# Сужение пространства признаков

In [152]:
pca = PCA(0.90)
pca

PCA(n_components=0.9)

In [153]:
X_train_pca = pca.fit_transform(X_train_trfm)
X_test_pca = pca.transform(X_test_trfm)

# Просмотр особенно важных признаков

In [154]:
# number of components
n_pcs = pca.components_.shape[0]

# get the index of the most important feature on EACH component i.e. largest absolute value
# using LIST COMPREHENSION HERE
most_important = [np.abs(pca.components_[i]).argmax() for i in range(n_pcs)]

initial_feature_names = data.columns

# get the names
most_important_names = [initial_feature_names[most_important[i]] for i in range(n_pcs)]

# using LIST COMPREHENSION HERE AGAIN
dic = {'PC{}'.format(i+1): most_important_names[i] for i in range(n_pcs)}

# build the dataframe
df = pd.DataFrame(dic.items())

In [155]:
df

Unnamed: 0,0,1
0,PC1,FindWindowW
1,PC2,hkey_classes_root\\clsid\{aeb6717e-7e19-11d0-9...
2,PC3,c:\windows\system32\advapi32.dll
3,PC4,hkey_local_machine\\software\policies\microsof...
4,PC5,c:\program files\microsoft office\office10\win...
...,...,...
148,PC149,srclient
149,PC150,c:\windows\system32\uxtheme.dll
150,PC151,mprapi
151,PC152,mprapi


# Линейная классификация

In [156]:
ridge_clf = RidgeClassifier()
ridge_clf.fit(X_train_pca, y_train)

RidgeClassifier()

In [157]:
f1_score(y_test, ridge_clf.predict(X_test_pca))

0.7256123764503654

In [158]:
accuracy_score(y_test, ridge_clf.predict(X_test_pca))

0.7446

# Логистическая регрессия

In [159]:
lin_clf = LogisticRegression(max_iter=1e4, n_jobs=2)
lin_clf.fit(X_train_pca, y_train)

LogisticRegression(max_iter=10000.0, n_jobs=2)

In [160]:
f1_score(y_test, lin_clf.predict(X_test_pca))

0.7914493198322593

In [161]:
accuracy_score(y_test, lin_clf.predict(X_test_pca))

0.7961

# Решающее дерево

In [162]:
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train_pca, y_train)

DecisionTreeClassifier()

In [163]:
f1_score(y_test, tree_clf.predict(X_test_pca))

0.8678656634244815

In [164]:
accuracy_score(y_test, tree_clf.predict(X_test_pca))

0.856