In [1]:
import numpy as np
import pandas as pd
import time
import gc
import random
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
%load_ext autoreload
%autoreload 2
import tests as tests

!pip install sklearn

In [2]:
path = "data/pima-indians-diabetes.csv"
df = pd.read_csv(path)
x_data = df.iloc[:, 0:8]
y_data = df.iloc[:, 8]

In [3]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, shuffle=True, random_state=614)

In [4]:
lr = LinearRegression().fit(x_train, y_train)
y_predict_train = lr.predict(x_train)
y_predict_test = lr.predict(x_test)

In [5]:
y_pred = np.round(y_predict_train)
train_accuracy = accuracy_score(y_train, y_pred)
train_accuracy

0.7821229050279329

In [35]:
y_test_pred= []
for i in range(len(y_predict_test)):
    y_test_pred.append(int(y_predict_test[i]+0.5))


test_accuracy = accuracy_score(y_test, np.array(y_test_pred))
test_accuracy

0.7316017316017316

In [7]:
rf_clf = RandomForestClassifier(random_state=614).fit(x_train, y_train)
y_predict_train = rf_clf.predict(x_train)
y_predict_test = rf_clf.predict(x_test)

In [9]:
train_accuracy = accuracy_score(y_train, y_predict_train)
train_accuracy

1.0

In [10]:
test_accuracy = accuracy_score(y_test, y_test_pred)
test_accuracy

0.7272727272727273

In [12]:
feature_importance = rf_clf.feature_importances_
feature_importance

array([0.07481604, 0.25521095, 0.08551354, 0.07373347, 0.0754602 ,
       0.1630978 , 0.12729624, 0.14487176])

In [14]:
sorted_indices = np.argsort(rf_clf.feature_importances_)[::-1]
sorted_indices

array([1, 5, 7, 6, 2, 4, 0, 3], dtype=int64)

In [28]:
parameters = {'n_estimators':[4, 16, 256], 'max_depth':[2, 8, 16]}
gscv_rfc = GridSearchCV(rf_clf, parameters).fit(x_train,y_train)
gscv_rfc

GridSearchCV(estimator=RandomForestClassifier(random_state=614),
             param_grid={'max_depth': [2, 8, 16], 'n_estimators': [4, 16, 256]})

In [29]:
best_params = gscv_rfc.best_params_
best_params

{'max_depth': 8, 'n_estimators': 256}

In [30]:
best_score = gscv_rfc.best_score_
best_score

0.7858255451713395

In [21]:
x_train

Unnamed: 0,x1,x2,x3,x4,x5,x6,x7,x8
237,0,179,90,27,0,44.1,0.686,23
342,1,0,68,35,0,32.0,0.389,22
351,4,137,84,0,0,31.2,0.252,30
42,7,106,92,18,0,22.7,0.235,48
165,6,104,74,18,156,29.9,0.722,41
...,...,...,...,...,...,...,...,...
430,2,99,0,0,0,22.2,0.108,23
399,3,193,70,31,0,34.9,0.241,25
397,0,131,66,40,0,34.3,0.196,22
540,8,100,74,40,215,39.4,0.661,43


In [26]:
scaled_x_train

StandardScaler()

In [27]:
scaled_x_train = StandardScaler().fit(x_train).transform(x_train)
scaled_x_test = StandardScaler().fit(x_test).transform(x_test)
scaled_x_test

array([[-0.58123325,  1.28081534,  1.10989737, ...,  0.07224153,
         1.01354675,  2.63811818],
       [ 2.64346913,  1.11593106,  1.00537789, ...,  1.23201092,
         2.14454695,  0.42716103],
       [-0.87438801, -0.79672653,  0.2737415 , ..., -0.66227909,
         0.60896132,  0.18149913],
       ...,
       [-1.16754278, -0.56588854,  0.48278047, ...,  0.445945  ,
        -0.72433159, -0.80114849],
       [-0.87438801,  1.77546816,  0.2737415 , ...,  0.74233051,
        -1.18408777,  0.34527373],
       [-0.87438801,  0.78616251,  0.90085841, ..., -0.12105337,
         1.28020534, -0.8830358 ]])

In [38]:
svm = SVC(gamma='auto').fit(scaled_x_train, y_train)
y_predict_train = svm.predict(scaled_x_train)
y_predict_test = svm.predict(scaled_x_test)

In [41]:
svm_parameters = {'kernel':('linear', 'rbf'), 'C':[0.01, 0.1, 1.0]}
svm_cv = GridSearchCV(svm, svm_parameters, n_jobs = -1, return_train_score = True).fit(scaled_x_train, y_train)
best_score = svm_cv.best_score_
best_score

0.7820526133610246

In [56]:
svm_cv.cv_results_['mean_test_score']

array([0.77826237, 0.63501211, 0.782018  , 0.76341295, 0.78205261,
       0.78033922])

In [57]:
pca = PCA(n_components=8, svd_solver='full').fit(x_data)

In [58]:
pca.explained_variance_ratio_

array([8.88546635e-01, 6.15907837e-02, 2.57901189e-02, 1.30861374e-02,
       7.44093864e-03, 3.02614919e-03, 5.12444875e-04, 6.79264301e-06])

In [59]:
pca.singular_values_

array([3212.6611207 ,  845.82919167,  547.33280231,  389.87962763,
        293.9941346 ,  187.48648707,   77.15221185,    8.88268374])

In [28]:
int(0.5)

0

In [29]:
int(0.99)

0