In [1]:
import pandas as pd

In [2]:
# Read the raw Social Network Ads table from the notebook's working directory.
dataset = pd.read_csv(filepath_or_buffer='./Social_Network_Ads.csv')

Первый столбец нам не нужен, поэтому удаляем его

In [3]:
# 'User ID' is dropped because it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: on a second run, when the
# column is already gone, the drop is a no-op instead of a KeyError.
dataset = dataset.drop(columns=['User ID'], errors='ignore')
dataset.head(5)

Unnamed: 0,Gender,Age,EstimatedSalary,Purchased
0,Male,19,19000,0
1,Male,35,20000,0
2,Female,26,43000,0
3,Female,27,57000,0
4,Male,19,76000,0


Многие алгоритмы МО требуют числовые данные. Преобразуем категориальный признак Gender с помощью унитарного кодирования (one-hot encoding), это один из самых эффективных способов

In [4]:
from sklearn import preprocessing

# One-hot encode the categorical Gender column. The column is selected by
# name rather than by position, so the cell does not depend on column layout.
enc = preprocessing.OneHotEncoder()  # encodes categorical features as a numeric array
gender_column = dataset[['Gender']]  # all rows, Gender column only (kept 2-D)
enc.fit(gender_column)
onehottables = enc.transform(gender_column).toarray()  # sparse result -> dense array

# Label the indicator columns with the categories the encoder actually learned
# (instead of assuming the order Female, Male) and reuse the source index so
# that the column-wise concat below aligns rows one-to-one.
genders = pd.DataFrame(onehottables, columns=enc.categories_[0], index=dataset.index)
result = pd.concat([genders, dataset.drop(columns=['Gender'])], axis=1, sort=False)
result.head(5)

Unnamed: 0,Female,Male,Age,EstimatedSalary,Purchased
0,0.0,1.0,19,19000,0
1,0.0,1.0,35,20000,0
2,1.0,0.0,26,43000,0
3,1.0,0.0,27,57000,0
4,0.0,1.0,19,76000,0


Определим метки и признаки. Х - признаки, y - метка

In [5]:
# Features: every column except the target. Label: the Purchased column.
X = result.drop('Purchased', axis=1)
y = result.loc[:, 'Purchased']

Разделим данные на контрольные (25%) и обучающие (75%)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

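Стандартизируем признаки (приведём к нулевому среднему и единичной дисперсии). Это требуется для моделей, чувствительных к масштабу признаков (например, логистической регрессии и SVM)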

In [7]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [8]:
# Inspect the scaled training matrix; as the cell's last expression it is echoed as output.
X_train

array([[ 0.98019606, -0.98019606,  0.58164944, -0.88670699],
       [-1.02020406,  1.02020406, -0.60673761,  1.46173768],
       [ 0.98019606, -0.98019606, -0.01254409, -0.5677824 ],
       ...,
       [-1.02020406,  1.02020406, -0.21060859, -0.50979612],
       [ 0.98019606, -0.98019606, -1.10189888, -0.45180983],
       [ 0.98019606, -0.98019606, -1.20093113,  1.40375139]])

# Алгоритм дерева решений

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Shallow entropy-based tree (depth capped at 2) with a fixed seed.
tree_settings = {'criterion': 'entropy', 'random_state': 100, 'max_depth': 2}
classifier = DecisionTreeClassifier(**tree_settings)
classifier.fit(X=X_train, y=y_train)

Получим предсказания на тестовой выборке и выведем матрицу ошибок

In [11]:
import sklearn.metrics as metrics

# Predict the held-out rows and cross-tabulate them against the true labels
# (rows = actual class, columns = predicted class).
y_pred = classifier.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[64,  4],
       [ 2, 30]])

Рассчитаем accuracy, recall и precision

In [12]:
# Apply each scorer to the same (true, predicted) pair and unpack in print order.
# NOTE(review): `accurecy` is a misspelling of "accuracy"; the name is kept
# unchanged in case other cells read it.
accurecy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
print(accurecy, recall, precision)

0.94 0.9375 0.8823529411764706


# XGBoost

In [13]:
from xgboost import XGBClassifier

# Gradient-boosted trees with library defaults. The seed is pinned explicitly,
# as for the other seeded models in this notebook, so that any randomized
# option enabled later (e.g. row/column subsampling) stays reproducible.
classifier = XGBClassifier(random_state=0)
classifier.fit(X_train, y_train)

In [14]:
# Predict the held-out rows with the boosted model and build its confusion
# matrix (rows = actual class, columns = predicted class).
y_pred = classifier.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[64,  4],
       [ 4, 28]])

In [15]:
# Accuracy, recall and precision for the current predictions.
# NOTE(review): `accurecy` is a misspelling of "accuracy"; the name is kept
# unchanged in case other cells read it.
accurecy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
print(accurecy, recall, precision)

0.92 0.875 0.875


# Алгоритм случайного леса

**n_estimators** - количество деревьев

**max_depth** - глубина каждого дерева

In [16]:
from sklearn.ensemble import RandomForestClassifier

# Small entropy-based forest: 10 trees, each capped at depth 4, fixed seed.
forest_settings = {
    'n_estimators': 10,
    'max_depth': 4,
    'criterion': 'entropy',
    'random_state': 0,
}
classifier = RandomForestClassifier(**forest_settings)
classifier.fit(X=X_train, y=y_train)

In [17]:
# Predict the held-out rows with the forest and build its confusion matrix
# (rows = actual class, columns = predicted class).
y_pred = classifier.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[64,  4],
       [ 3, 29]])

In [18]:
# Accuracy, recall and precision for the current predictions.
# NOTE(review): `accurecy` is a misspelling of "accuracy"; the name is kept
# unchanged in case other cells read it.
accurecy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
print(accurecy, recall, precision)

0.93 0.90625 0.8787878787878788


# Логистическая регрессия

In [19]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default hyperparameters and a fixed seed.
logreg_settings = {'random_state': 0}
classifier = LogisticRegression(**logreg_settings)
classifier.fit(X=X_train, y=y_train)

In [20]:
# Predict the held-out rows with the logistic model and build its confusion
# matrix (rows = actual class, columns = predicted class).
y_pred = classifier.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[65,  3],
       [ 6, 26]])

In [21]:
# Accuracy, recall and precision for the current predictions.
# NOTE(review): `accurecy` is a misspelling of "accuracy"; the name is kept
# unchanged in case other cells read it.
accurecy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
print(accurecy, recall, precision)

0.91 0.8125 0.896551724137931


# SVM

In [22]:
from sklearn.svm import SVC

# Support vector classifier with a linear kernel and a fixed seed.
svm_settings = {'kernel': 'linear', 'random_state': 0}
classifier = SVC(**svm_settings)
classifier.fit(X=X_train, y=y_train)

In [23]:
# Predict the held-out rows with the SVM and build its confusion matrix
# (rows = actual class, columns = predicted class).
y_pred = classifier.predict(X_test)
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[66,  2],
       [ 9, 23]])

In [24]:
# Accuracy, recall and precision for the current predictions.
# NOTE(review): `accurecy` is a misspelling of "accuracy"; the name is kept
# unchanged in case other cells read it.
accurecy, recall, precision = (
    score(y_test, y_pred)
    for score in (metrics.accuracy_score, metrics.recall_score, metrics.precision_score)
)
print(accurecy, recall, precision)

0.89 0.71875 0.92


# Наивный Байесовский алгоритм

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)