In [2]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.ticker as ticker
%matplotlib inline

warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier,NeighborhoodComponentsAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [3]:
hcv =pd.read_csv('hcvdat0.csv')

In [4]:
mapping_cat = {'0=Blood Donor' : 0, '1=Hepatitis' : 1, '2=Fibrosis' : 2, '3=Cirrhosis' : 3}
mapping_sex = {'m' : 0, 'f' :1}
hcv = hcv.replace({'Category': mapping_cat})
hcv = hcv.replace({'Sex': mapping_sex})

In [5]:
hcv.isnull().sum()

Unnamed: 0     0
Category       0
Age            0
Sex            0
ALB            1
ALP           18
ALT            1
AST            0
BIL            0
CHE            0
CHOL          10
CREA           0
GGT            0
PROT           1
dtype: int64

### Remplacer valeurs NaN par la moyenne de la variable en question de la catégorie en question

In [6]:
warnings.filterwarnings('ignore')
blooddonors = hcv.loc[hcv['Category'] == 0]
hepatitis = hcv.loc[hcv['Category'] == 1]
fibrosis = hcv.loc[hcv['Category'] == 2]
cirrhosis = hcv.loc[hcv['Category'] == 3]
means_bd = []
means_hep = []
means_fib = []
means_cir = []
for i in range(4,len(hcv.columns)) :
    #print("%s : %d " % (hcv.columns[i], i-4))
    means_bd.append(hcv.loc[hcv['Category'] == 0][hcv.columns[i]].mean())
    means_hep.append(hcv.loc[hcv['Category'] == 1][hcv.columns[i]].mean())
    means_fib.append(hcv.loc[hcv['Category'] == 2][hcv.columns[i]].mean())
    means_cir.append(hcv.loc[hcv['Category'] == 3][hcv.columns[i]].mean())
    blooddonors[blooddonors.columns[i]] = blooddonors[blooddonors.columns[i]].fillna(value = means_bd[i-4])
    hepatitis[hepatitis.columns[i]] = hepatitis[hepatitis.columns[i]].fillna(value = means_hep[i-4])
    fibrosis[fibrosis.columns[i]] = fibrosis[fibrosis.columns[i]].fillna(value = means_fib[i-4])
    cirrhosis[cirrhosis.columns[i]] = cirrhosis[cirrhosis.columns[i]].fillna(value = means_cir[i-4])
frames = [blooddonors, hepatitis, fibrosis, cirrhosis]
hcv = pd.concat(frames)
hcv.isnull().sum()

Unnamed: 0    0
Category      0
Age           0
Sex           0
ALB           0
ALP           0
ALT           0
AST           0
BIL           0
CHE           0
CHOL          0
CREA          0
GGT           0
PROT          0
dtype: int64

### Création de la matrice des variables explicatives est le vecteur de la variable expliqué

In [7]:
Y = hcv.Category.astype('int')
X = hcv.iloc[:,2:]

### Equilibrer la taille du jeux de donnnées pour chaque catégorie oversampling avec SMOTE

In [8]:
smote = SMOTE()
x_smote, y_smote = smote.fit_sample(X, Y)

### Création du training set et du test set

In [9]:
x_train_smote, x_test_smote, y_train_smote, y_test_smote = train_test_split(x_smote, y_smote,test_size=0.25, random_state=30, shuffle = True)

# 1-KNN

In [10]:
pipeline1 = Pipeline([('scaler', StandardScaler()),('nca', NeighborhoodComponentsAnalysis()), ('knn', KNeighborsClassifier())])

### Différennts parametres du KNN pour le grid search

In [11]:
parameteres_knn = {
    'knn__n_neighbors':[1,2,3,4,5],
    'knn__algorithm':['auto','ball_tree','kd_tree','brute'],
    'knn__leaf_size':[5,20,25,30] }

### KNN grid search

In [12]:
grid = GridSearchCV(pipeline1, param_grid = parameteres_knn, cv = 5)
grid.fit(x_train_smote, y_train_smote)

KeyboardInterrupt: 

In [None]:
knn_predictions = grid.predict(x_test_smote)
#Accurary Score
knn_score = accuracy_score(y_test_smote, knn_predictions)
print(knn_score)
#confusion matrix
cm = metrics.confusion_matrix(y_test_smote, knn_predictions)
print(grid.best_params_)
#Classification Report
report = metrics.classification_report(y_test_smote, knn_predictions)
print(report)

### Matrice de confusion KNN

In [None]:
import seaborn as sns
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual Category');
plt.xlabel('Predicted Category');
all_sample_title = 'Accuracy Score: {0}'.format(knn_score)
plt.title(all_sample_title, size = 15);

# 2-SVM

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report

In [None]:
SVM = svm.SVC(kernel='linear', gamma='auto', C=2)
SVM.fit(x_train_smote, y_train_smote)
svm_y_predict = SVM.predict(x_test_smote)
print(classification_report(y_test_smote, svm_y_predict))

In [None]:
print(accuracy_score(y_test_smote, svm_y_predict))
pd.crosstab(y_test_smote, svm_y_predict)

### SVM avec cross validation

In [None]:
from sklearn.model_selection import cross_val_score
cross_SVM = svm.SVC(kernel='linear', gamma='auto', C=1)
scores = cross_val_score(cross_SVM, x_smote, y_smote, cv=5)

In [None]:
print(scores.mean())

In [None]:
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf', 'linear']} 

In [None]:
from sklearn.svm import SVC 
svm_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 

In [None]:
svm_grid.fit(x_train_smote, y_train_smote) 

In [None]:
print(svm_grid.best_params_) 

In [None]:
svm_predictions = svm_grid.predict(x_test_smote)
#Accurary Score
svm_grid_score = accuracy_score(y_test_smote, svm_predictions)
print(svm_grid_score)
#confusion matrix
cm_svm = metrics.confusion_matrix(y_test_smote, svm_predictions)
print(svm_grid.best_params_) 
#Classification Report
report = metrics.classification_report(y_test_smote, svm_predictions)
print(report)

### Test du SVM avec les meilleurs parametres obtenu avec le grid search

In [1]:
gridcross_svm = svm.SVC(kernel='rbf', gamma=0.0001, C=1)
scores4 = cross_val_score(gridcross_svm, x_smote, y_smote, cv=5)
print(scores4.mean())

NameError: name 'svm' is not defined

### Matrice de confusion SVM

In [None]:
import seaborn as sns
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual Category');
plt.xlabel('Predicted Category');
all_sample_title = 'Accuracy Score: {0}'.format(svm_grid_score)
plt.title(all_sample_title, size = 15);

# 3-Regression logistique

In [13]:
from sklearn.linear_model import LogisticRegression

In [14]:
penalty = ['l1', 'l2']

In [15]:
C = np.logspace(0, 4, 10)

In [16]:
hyperparameters = dict(C=C, penalty=penalty)

In [17]:
logisticRegr = LogisticRegression(multi_class='multinomial')

In [18]:
clf = GridSearchCV(logisticRegr, hyperparameters, cv=5, verbose=0)

In [19]:
best_model = clf.fit(x_train_smote, y_train_smote)

In [20]:
print('Best Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Best C:', best_model.best_estimator_.get_params()['C'])

Best Penalty: l2
Best C: 10000.0


In [21]:
lr_predictions = best_model.predict(x_test_smote)
lr_score = accuracy_score(y_test_smote, lr_predictions)
lr_cm = metrics.confusion_matrix(y_test_smote, lr_predictions)
lr_report = metrics.classification_report(y_test_smote, lr_predictions)
print(lr_score)
print(lr_report)

0.9230769230769231
              precision    recall  f1-score   support

           0       0.96      0.97      0.96       121
           1       0.82      0.92      0.87       131
           2       0.93      0.83      0.88       147
           3       0.99      0.99      0.99       134

    accuracy                           0.92       533
   macro avg       0.93      0.93      0.92       533
weighted avg       0.93      0.92      0.92       533



### Matrice de confusion Regression logistique

In [22]:
import seaborn as sns
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual Category');
plt.xlabel('Predicted Category');
all_sample_title = 'Accuracy Score: {0}'.format(lr_score)
plt.title(all_sample_title, size = 15);

NameError: name 'cm' is not defined