In [88]:
import pandas
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

data = pandas.read_csv('drug200.csv')
data_sel = data.loc[:, data.columns.isin(['Drug', 'Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K'])]
data_sel = data_sel.dropna()

#Обработаем столбец Sex. 0 - женщина, 1 - мужчина
data_sel['Sex'] = np.where(data_sel['Sex'] == 'F', 0, 1)

#Обработаем столбец BP. -1 - низкий, 0 - нормальный, 1 - высокий
data_sel['BP'] = np.where(data_sel['BP'] == 'LOW', 2, data_sel['BP'])
data_sel['BP'] = np.where(data_sel['BP'] == 'NORMAL', 0, data_sel['BP'])
data_sel['BP'] = np.where(data_sel['BP'] == 'HIGH', 1, data_sel['BP'])

#Обработаем столбец Cholesterol. 0 - нормальный, 1 - высокий
data_sel['Cholesterol'] = np.where(data_sel['Cholesterol'] == 'NORMAL', 0, 1)

#Обработаем столбец Drug. DrugA – класс 0, остальные уровни – класс 1
data_sel['Drug'] = np.where(data_sel['Drug'] == 'drugX', 0, 1)

Drug = data_sel.loc[:, data_sel.columns.isin(['Drug'])]

X = data_sel.loc[:, data_sel.columns.isin(['Age', 'Sex', 'BP', 'Cholesterol', 'Na_to_K'])]

#Выводим изменённую таблицу
X

Unnamed: 0,Age,Sex,BP,Cholesterol,Na_to_K
0,23,0,1,1,25.355
1,47,1,2,1,13.093
2,47,1,2,1,10.114
3,28,0,0,1,7.798
4,61,0,2,1,18.043
...,...,...,...,...,...
195,56,0,2,1,11.567
196,16,1,2,1,12.006
197,52,1,0,1,9.894
198,23,1,0,0,14.020


In [89]:
#Делим данные на обучающую и тестовую выборку
x_train, x_validation, y_train, y_validation = train_test_split(X, Drug, test_size=.33, random_state=5)

In [90]:

logistic = LogisticRegression(solver='lbfgs')

logistic.fit(x_train, y_train.values.ravel())

logistic_pred = logistic.predict(x_validation)

print("Logistic Regression Test Accuracy: " + str(logistic.score(x_validation, y_validation)*100) + "%")
print('Report for Logistic Regression: ')
print(classification_report(y_validation, logistic_pred))

Logistic Regression Test Accuracy: 84.84848484848484%
Report for Logistic Regression: 
              precision    recall  f1-score   support

           0       0.71      0.62      0.67        16
           1       0.88      0.92      0.90        50

    accuracy                           0.85        66
   macro avg       0.80      0.77      0.78        66
weighted avg       0.84      0.85      0.84        66



In [91]:
#Cтроим классификатор Случайный Лес(Random Forest)


param_grid = { 'n_estimators': [200, 300, 400],'max_features': ['auto'],'max_depth' : list(range(1, 20)), 'criterion' :['gini']}

RFC = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv= 5, refit = True) 
RFC.fit(x_train, y_train.values.ravel())

#Оцениваем точность классификатора Случайный Лес с помощью метрик accuracy, precision, recall и F1
print("accuracy:"+str(np.average(cross_val_score(RFC.best_estimator_, x_validation, y_validation, scoring='accuracy'))))
print("f1:"+str(np.average(cross_val_score(RFC.best_estimator_, x_validation, y_validation, scoring='f1'))))
print("precision:"+str(np.average(cross_val_score(RFC.best_estimator_, x_validation, y_validation, scoring='precision'))))
print("recall:"+str(np.average(cross_val_score(RFC.best_estimator_, x_validation, y_validation, scoring='recall'))))

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


accuracy:0.924838446577577


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


f1:0.9618566618566619


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


precision:0.9256965944272446


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


recall:1.0


In [None]:
#Сравним метрики обоих классификаторов

#Логистическая регрессия
accuracy:84.84848484848484
f1:0.84
precision:0.84
recall:0.85
    
#Случайный лес
accuracy:0.9399899617290922
f1:0.9517726282432165
precision:0.926785919045981
recall:1.0
    
#Метрики f1 и recall лучше у Случайного Леса, значит он показывает себя лучше на этих данных, чем Логистическая регрессия