In [8]:
%matplotlib inline
%pip install scipy
%pip install scikit-learn

import piplite
await piplite.install('scikit-learn')
import pandas as pd
import numpy as np

Dataset = pd.read_csv('diabetes.csv')

Dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.linear_model import LinearRegression

# Пайплайн
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaller', StandardScaler())
])

data = Dataset.drop('Outcome', axis = 1)
outcome = Dataset.Outcome

data_prepared = pipeline.fit_transform(data)

# Линейная регрессия
model = LinearRegression().fit(data_prepared,outcome)

In [10]:
some_data = data.iloc[:5]
some_output = outcome.iloc[:5]
data_prepared_some = pipeline.transform(some_data)

print("Линейная регрессия. Прогнозы:", model.predict(data_prepared_some))
print("Линейная регрессия. Метки:",list(some_output))

from sklearn.metrics import classification_report, confusion_matrix 

Линейная регрессия. Прогнозы: [ 0.65175729  0.00573265  0.73642449 -0.0219232   0.83318937]
Линейная регрессия. Метки: [1, 0, 1, 0, 1]


In [11]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

data_prepared = pipeline.transform(data)
data_predictions = model.predict(data_prepared)
linear_mse = mean_squared_error(outcome, data_predictions)
lin_rmse = np.sqrt(linear_mse)
print("Среднеквадратическое отклонение. Линейная регрессия:",lin_rmse)

# Дерево решений
tree_regressor = DecisionTreeRegressor()
tree_regressor.fit(data_prepared,outcome)

data_predictions = tree_regressor.predict(data_prepared)
linear_mse = mean_squared_error(outcome, data_predictions)
lin_rmse = np.sqrt(linear_mse)
print("Среднеквадратическое отклонение. Дерево решений:",lin_rmse)

Среднеквадратическое отклонение. Линейная регрессия: 0.39785855691820504
Среднеквадратическое отклонение. Дерево решений: 0.0


In [12]:
linearRegressor = LinearRegression()
linearRegressor.get_params()

{'copy_X': True,
 'fit_intercept': True,
 'n_jobs': None,
 'normalize': 'deprecated',
 'positive': False}

In [13]:
from sklearn.model_selection import GridSearchCV
# Гиперпараметры в линейной регрессии
param_grid = [
    {
        'copy_X':[False], 'fit_intercept':[False],
    }, 
    {
        'positive':[True],'n_jobs':[3,10,30]
    }
]
grid_search = GridSearchCV(linearRegressor,param_grid=param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(data_prepared,outcome)
grid_search.best_params_

{'n_jobs': 3, 'positive': True}

In [14]:
# Разделяем данные для обучения и теста

from sklearn.model_selection import train_test_split 
data_train, data_test, outcome_train, outcome_test = train_test_split(data, outcome, test_size = 0.20)

In [15]:
# Метод опорных векторов
from sklearn.svm import SVC 
svclassifier = SVC(kernel='linear') 
svclassifier.fit(data_train, outcome_train)
y_pred = svclassifier.predict(data_test)

In [16]:
from sklearn.metrics import classification_report, confusion_matrix 
print(confusion_matrix(outcome_test,y_pred)) 
print(classification_report(outcome_test,y_pred))

[[87 15]
 [18 34]]
              precision    recall  f1-score   support

           0       0.83      0.85      0.84       102
           1       0.69      0.65      0.67        52

    accuracy                           0.79       154
   macro avg       0.76      0.75      0.76       154
weighted avg       0.78      0.79      0.78       154



In [17]:
# Метод k-ближайших соседей
from sklearn.neighbors import KNeighborsClassifier 
classifier = KNeighborsClassifier(n_neighbors=5) 
classifier.fit(data_train, outcome_train)
y_pred = classifier.predict(data_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix 
print(confusion_matrix(outcome_test,y_pred)) 
print(classification_report(outcome_test,y_pred))

[[81 21]
 [24 28]]
              precision    recall  f1-score   support

           0       0.77      0.79      0.78       102
           1       0.57      0.54      0.55        52

    accuracy                           0.71       154
   macro avg       0.67      0.67      0.67       154
weighted avg       0.70      0.71      0.71       154



In [19]:
# Случайный лес

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def display_scores(scores):
    print("Оценки:",scores)
    print("Среднее:", scores.mean())
    print("Стандартное отклонение:",scores.std())

forest_model = RandomForestRegressor()
forest_model.fit(data_prepared,outcome)
forest_rmse = mean_squared_error(outcome, forest_model.predict(data_prepared))
print(np.sqrt(forest_rmse))
forest_rmse_scores = cross_val_score(model, data_prepared,outcome,scoring='neg_mean_squared_error')
display_scores(np.sqrt(-forest_rmse_scores))

0.1514573303497281
Оценки: [0.41135733 0.42463632 0.40665043 0.36951542 0.40149551]
Среднее: 0.4027310014822495
Стандартное отклонение: 0.018299834981313642


In [20]:
# Гиперпараметры случайного леса

from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'n_estimators':[3,10,30], 'max_features':[2,4,7]
    }, 
    {
        'bootstrap':[False],'n_estimators':[3,10,30],'max_features':[2,3,4]   
    }
]

regressor = RandomForestRegressor()
grid_search = GridSearchCV(regressor,param_grid=param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(data_prepared,outcome)

In [21]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}