In [None]:
import pandas as pd
import seaborn as sns 

### Lecture des données

In [None]:
# Read the raw data file; header=None means the first line is treated as data
# and the columns receive default integer labels.
df = pd.read_csv(filepath_or_buffer='breast-cancer-wisconsin.data.txt', header=None)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

### Ajout des noms de colonnes

In [None]:
# Descriptive column labels, one per column of the data file, in file order.
col_names = [
    'Id',
    'Clump_thickness',
    'Uniformity_Cell_Size',
    'Uniformity_Cell_Shape',
    'Marginal_Adhesion',
    'Single_Epithelial_Cell_Size',
    'Bare_Nuclei',
    'Bland_Chromatin',
    'Normal_Nucleoli',
    'Mitoses',
    'Class',
]


In [None]:
# Replace the default integer labels with the descriptive names, then echo
# the resulting column index as the cell output.
df.columns = col_names
df.columns

### Description du dataset

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) of the frame's columns.
df.describe()

In [None]:
# Preview the first rows again, now with the descriptive column names.
df.head()

### Supprimer la colonne ID

In [None]:

# Remove the 'Id' column. The result is rebound instead of dropping in place,
# and errors='ignore' makes a second execution of this cell a no-op rather
# than a KeyError, so the cell stays safely re-runnable.
df = df.drop(columns='Id', errors='ignore')


In [None]:
# Preview the first rows after the 'Id' column has been removed.
df.head()

### Informations sur le dataset pour afficher les types de données

In [None]:
# Print each column's dtype and non-null count.
df.info()

### Convertir Bare_Nuclei en numérique

In [None]:
# Convert 'Bare_Nuclei' to a numeric dtype; with errors='coerce' every value
# that cannot be parsed as a number becomes NaN.
df = df.assign(Bare_Nuclei=pd.to_numeric(df['Bare_Nuclei'], errors='coerce'))

In [None]:
# Show the dtype of every column after the numeric conversion.
df.dtypes

### Chercher les valeurs nulles

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Distribution of the values: how many rows carry each distinct 'Bare_Nuclei' value.

df['Bare_Nuclei'].value_counts()

<h2 class='text-danger'>Diviser le dataset en X (caractéristiques) et y (classes)</h2>

In [None]:
# Target vector: the 'Class' column.
y = df['Class']
# Feature matrix: every column except the target.
X = df.drop(columns=['Class'])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

## Remplacement des valeurs NaN par la médiane de la colonne

In [None]:
# Fill every NaN with the median of its own column.
# This is a single frame-level fillna instead of `X[col].fillna(..., inplace=True)`:
# the per-column in-place form is chained assignment, which raises a
# FutureWarning on pandas 2.x and, under Copy-on-Write, only modifies a
# temporary object and leaves X unchanged.
# NOTE(review): the medians are computed on the full dataset before the
# train/test split; consider imputing after the split to avoid leaking
# test-set information.
X = X.fillna(X.median())

In [None]:
# Check that no missing values remain in the feature matrix.
X.isna().sum()

## Découper le dataset : 20 % pour le test et 80 % pour l'entraînement

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 80/20 train/test split.
# random_state pins the shuffle so that every run (e.g. Restart & Run All)
# produces the same partition and therefore comparable scores across the
# models below; stratify=y keeps the class proportions equal in both subsets.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42, stratify=y
)

In [None]:
# Preview the first rows of the training features.
x_train.head()

# Normalisation

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same fitted transformation to both the training and the test features.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

## Training en utilisant Naive Bayes Gaussien

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes classifier with default settings.
nb = GaussianNB()

In [None]:
# Number of training samples per class.
y_train.value_counts()

In [None]:
# Train the Naive Bayes model on the scaled training data.
nb.fit(X=x_train, y=y_train)

In [None]:
# Shape of the training label vector.
y_train.shape

In [None]:
# Predict class labels for the test features with the fitted Naive Bayes model.
y_pred = nb.predict(x_test)

## Evaluation du modèle

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Number of test samples per class.
y_test.value_counts()

In [None]:
# Confusion matrix of the Naive Bayes predictions against the true test labels.
cfx = confusion_matrix(y_true=y_test, y_pred=y_pred)
cfx

In [None]:
# Fraction of test samples the Naive Bayes model classified correctly.
acc_nb = accuracy_score(y_test, y_pred)
print('Model accuracy score: {0:0.4f}'.format(acc_nb))

## Visualisation de la matrice de confusion

In [None]:
# Naive Bayes confusion matrix as raw counts.
# scikit-learn lays the matrix out with true classes on the rows and predicted
# classes on the columns, so the axes are labelled accordingly; the trailing
# semicolon hides the return value's repr in the cell output.
sns.heatmap(cfx, fmt='d', annot=True, cmap='Reds').set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Gaussian Naive Bayes - confusion matrix (counts)',
);

In [None]:
# Naive Bayes confusion matrix with each cell expressed as a share of all
# test samples (cell count divided by the matrix total).
# Rows are true classes and columns are predicted classes (scikit-learn's
# layout), so the axes are labelled accordingly; the trailing semicolon hides
# the return value's repr in the cell output.
sns.heatmap(cfx / cfx.sum(), fmt='.2%', annot=True, cmap='Greens').set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Gaussian Naive Bayes - confusion matrix (share of test set)',
);

# Régression logistique

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, trained on the scaled training data.
lr = LogisticRegression()
lr.fit(X=x_train, y=y_train)

In [None]:
# Predict class labels for the test features with the fitted logistic regression.
y_pred_lr = lr.predict(x_test)

In [None]:
# Confusion matrix of the logistic-regression predictions against the true test labels.
cfx_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
cfx_lr

In [None]:
# Logistic-regression confusion matrix as raw counts.
# Rows are true classes and columns are predicted classes (scikit-learn's
# layout), so the axes are labelled accordingly; the trailing semicolon hides
# the return value's repr in the cell output.
sns.heatmap(cfx_lr, fmt='d', annot=True, cmap='Greens').set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Logistic regression - confusion matrix (counts)',
);

In [None]:
# Fraction of test samples the logistic regression classified correctly.
acc_lr = accuracy_score(y_test, y_pred_lr)
print('Model accuracy score: {0:0.4f}'.format(acc_lr))

# SVM

In [None]:
from sklearn.svm import SVC

In [None]:
# Support-vector classifier with default settings, trained on the scaled training data.
svm = SVC()
svm.fit(X=x_train, y=y_train)

In [None]:
# Predict class labels for the test features with the fitted SVM.
y_pred_svm = svm.predict(x_test)

In [None]:
# Confusion matrix of the SVM predictions against the true test labels.
cfx_svm = confusion_matrix(y_true=y_test, y_pred=y_pred_svm)
cfx_svm

In [None]:
# SVM confusion matrix as raw counts.
# Rows are true classes and columns are predicted classes (scikit-learn's
# layout), so the axes are labelled accordingly; the trailing semicolon hides
# the return value's repr in the cell output.
sns.heatmap(cfx_svm, fmt='d', annot=True, cmap='Greens').set(
    xlabel='Predicted class',
    ylabel='True class',
    title='SVM - confusion matrix (counts)',
);

In [None]:
# Fraction of test samples the SVM classified correctly.
acc_svm = accuracy_score(y_test, y_pred_svm)
print('Model accuracy score: {0:0.4f}'.format(acc_svm))

# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# k-nearest-neighbours classifier that uses the 3 nearest neighbours.
knn = KNeighborsClassifier(n_neighbors=3)

In [None]:
# Train the KNN model on the scaled training data.
knn.fit(X=x_train, y=y_train)

In [None]:
# Predict class labels for the test features with the fitted KNN model.
y_pred_knn = knn.predict(x_test)

In [None]:
# Display the KNN predictions for the test set.
y_pred_knn

In [None]:
# Fraction of test samples the KNN model classified correctly.
acc_knn = accuracy_score(y_test, y_pred_knn)
print('Model accuracy score: {0:0.4f}'.format(acc_knn))