In [1]:
import pandas

# Read the tab-separated file; its first row supplies the column names.
pima = pandas.read_table("pima.txt", sep="\t", header=0)

# Each call prints a heading and then the value on the following line.
print("Dimensions: ", pima.shape, sep="\n")
print("\nColonnes:", pima.columns, sep="\n")
print("\nTypes", pima.dtypes, sep="\n")

Dimensions: 
(768, 9)

Colonnes:
Index(['pregnant', 'diastolic', 'triceps', 'bodymass', 'pedigree', 'age',
       'plasma', 'serum', 'diabete'],
      dtype='object')

Types
pregnant       int64
diastolic      int64
triceps        int64
bodymass     float64
pedigree     float64
age            int64
plasma         int64
serum          int64
diabete       object
dtype: object


In [2]:
# Whole table as a single array. Because the last column holds text labels,
# this array has object dtype.
data = pima.values

# Predictors: the first eight columns. They are all numeric, so they are cast
# to float to get a real numeric matrix instead of slicing the object array.
X = pima.iloc[:, 0:8].to_numpy(dtype=float)

# Target: the ninth column, kept as text labels.
y = pima.iloc[:, 8].to_numpy()

from sklearn import model_selection

# Hold out 300 rows for testing; random_state pins the shuffle so the split
# is the same on every run.
X_app, X_test, y_app, y_test = model_selection.train_test_split(
    X, y, test_size=300, random_state=0
)
print(X_app.shape, X_test.shape, y_app.shape, y_test.shape)

(468, 8) (300, 8) (468,) (300,)


In [3]:
from sklearn.linear_model import LogisticRegression

# Logistic regression estimator configured with the liblinear solver.
lr = LogisticRegression(solver='liblinear')

# Train on the learning sample; the value returned by fit() is kept as `modele`.
modele = lr.fit(X=X_app, y=y_app)

# Estimated coefficients followed by the intercept.
print(modele.coef_, modele.intercept_)

[[ 8.75153769e-02 -1.59511103e-02  1.70428483e-03  5.18609374e-02
   5.34696503e-01  1.24335202e-02  2.40115458e-02 -2.91586161e-04]] [-5.13527961]


In [4]:
from sklearn import metrics

# Predicted labels for the held-out sample.
y_pred = modele.predict(X_test)

# Confusion matrix of observed versus predicted labels.
cm = metrics.confusion_matrix(y_test, y_pred)
print(cm)

# Accuracy.
acc = metrics.accuracy_score(y_test, y_pred)
print("\nTaux de succès:", acc, sep="\n")

# Error rate, the complement of accuracy.
err = 1.0 - acc
print("\nTaux d'erreur:", err, sep="\n")

# Sensitivity: recall computed for the 'positive' label.
se = metrics.recall_score(y_test, y_pred, pos_label='positive')
print("\nSensibilité:", se, sep="\n")

[[184  17]
 [ 45  54]]

Taux de succès:
0.7933333333333333

Taux d'erreur:
0.20666666666666667

Sensibilité:
0.5454545454545454


In [5]:
def specificity(y,y_hat,neg_label=None):
    """Return the specificity (true-negative rate) of a set of predictions.

    Parameters
    ----------
    y : array-like
        Observed class labels.
    y_hat : array-like
        Predicted class labels, with the same length as ``y``.
    neg_label : label, optional
        Label to treat as the negative class. When omitted, the smallest
        label (in sorted order) found across ``y`` and ``y_hat`` is used.

    Returns
    -------
    float
        Share of the negative observations that were predicted negative.
        ``nan`` when ``y`` contains no negative observation (including empty
        input), since the ratio is undefined in that case.

    Raises
    ------
    ValueError
        If ``y`` and ``y_hat`` do not have the same shape.
    """
    import numpy

    y = numpy.asarray(y)
    y_hat = numpy.asarray(y_hat)
    if y.shape != y_hat.shape:
        raise ValueError(
            "y and y_hat must have the same shape, got %s and %s"
            % (y.shape, y_hat.shape)
        )

    if neg_label is None:
        # Default negative class: first label in sorted order over both inputs.
        labels = numpy.unique(numpy.concatenate([y.ravel(), y_hat.ravel()]))
        if labels.size == 0:
            return float("nan")
        neg_label = labels[0]

    is_neg = (y == neg_label)
    n_neg = numpy.count_nonzero(is_neg)
    if n_neg == 0:
        # No observed negative: return nan explicitly instead of dividing 0 by 0.
        return float("nan")

    # True negatives: observed negative and predicted negative.
    tn = numpy.count_nonzero(is_neg & (y_hat == neg_label))
    return tn / n_neg

# Turn the specificity function into a scorer object; larger values are better.
specificite = metrics.make_scorer(
    specificity,
    greater_is_better=True,
)

# The scorer is called with the fitted model, the test predictors and the
# observed test labels.
sp = specificite(modele, X_test, y_test)
print(sp)  # 0.915 = 184 / (184 + 17)

0.9154228855721394


In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

# Logistic regression fitted on the complete dataset (no hold-out).
lr = LogisticRegression(solver='liblinear')
modele_all = lr.fit(X=X, y=y)
print(modele_all.coef_, modele_all.intercept_)

# Accuracy measured by 10-fold cross-validation.
succes = model_selection.cross_val_score(lr, X, y, cv=10, scoring='accuracy')

# Per-fold accuracies, then their mean on the next line.
print(succes, succes.mean(), sep="\n")

[[ 1.17087631e-01 -1.68947770e-02  7.46053001e-04  5.97221654e-02
   6.81392866e-01  7.21999666e-03  2.83788475e-02 -6.42978367e-04]] [-5.88988049]
[0.74025974 0.75324675 0.79220779 0.72727273 0.74025974 0.74025974
 0.81818182 0.79220779 0.73684211 0.82894737]
0.7669685577580314


In [7]:
from sklearn.linear_model import LogisticRegression
import numpy
import matplotlib.pyplot as plt

# Refit the logistic regression on the learning sample.
lr = LogisticRegression(solver='liblinear')
modele = lr.fit(X_app,y_app)

# Class-membership probabilities for the test rows.
probas = lr.predict_proba(X_test)

# Score = probability of the 'positive' class. The column is located through
# classes_ instead of assuming it is the second one.
col_pos = list(lr.classes_).index('positive')
score = probas[:,col_pos]

# 0/1 indicator of the observed positives, built from the label itself rather
# than from the position of a dummy column.
pos = (y_test == 'positive').astype(int)

# Total number of positives in the test sample.
npos = numpy.sum(pos)

# Row indices ordered by decreasing score.
index = numpy.argsort(score)[::-1]

# Positive indicator in that order, then its running total.
sort_pos = pos[index]
cpos = numpy.cumsum(sort_pos)

# Recall reached after targeting the k best-scored rows.
rappel = cpos/npos

# Target size as a fraction of the test sample: 1/n, 2/n, ..., 1.
# The upper bound follows n instead of the previously hard-coded 301.
n = y_test.shape[0]
taille = numpy.arange(start=1,stop=n+1,step=1)
taille = taille / n

plt.title('Courbe de gain')
plt.xlabel('Taille de cible')
plt.ylabel('Rappel')

plt.xlim(0,1)
plt.ylim(0,1)

# Diagonal reference (blue): recall equal to the targeted fraction.
plt.scatter(taille,taille,marker='.',color='blue')

# Gain curve of the model (red).
plt.scatter(taille,rappel,marker='.',color='red')

plt.show()

<Figure size 640x480 with 1 Axes>

In [15]:
from sklearn import model_selection
from sklearn import svm

# Support vector classifier whose hyper-parameters are tuned below.
mvs = svm.SVC(gamma='auto')

# Search grid: three values of C crossed with two kernels.
parametres = [
    {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
    }
]

# Grid search scored on accuracy, fitted on the learning sample.
# NOTE(review): in the recorded output every 'rbf' row has the same score
# whatever C is; the predictors are not rescaled here, which may be the cause.
# To confirm.
grid = model_selection.GridSearchCV(
    estimator=mvs,
    param_grid=parametres,
    scoring='accuracy',
)
grille = grid.fit(X_app, y_app)

# One line per parameter combination with its mean cross-validated score.
print(pandas.DataFrame.from_dict(grille.cv_results_)[["params", "mean_test_score"]])

print("\nMeileur paramétrage:", grille.best_params_, sep="\n")

print("\nMeileure performance:", grille.best_score_, sep="\n")

# Predictions of the fitted grid search on the test sample.
y_pred3 = grille.predict(X_test)

print("\nSuccès:", metrics.accuracy_score(y_test, y_pred3), sep="\n")



                           params  mean_test_score
0     {'C': 0.1, 'kernel': 'rbf'}         0.638889
1  {'C': 0.1, 'kernel': 'linear'}         0.752137
2       {'C': 1, 'kernel': 'rbf'}         0.638889
3    {'C': 1, 'kernel': 'linear'}         0.747863
4      {'C': 10, 'kernel': 'rbf'}         0.638889
5   {'C': 10, 'kernel': 'linear'}         0.756410

Meileur paramétrage:
{'C': 10, 'kernel': 'linear'}

Meileure performance:
0.7564102564102564

Succès:
0.7833333333333333


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Base estimator handed to the recursive feature elimination.
lr = LogisticRegression(solver='liblinear')

# Selector built with its default settings, fitted on the learning sample.
selecteur = RFE(estimator=lr)
sol = selecteur.fit(X_app, y_app)

# Number of retained features.
print("\nNb de valeurs:", sol.n_features_, sep="\n")

# Boolean mask of the retained features.
print("\nVariables:", sol.support_, sep="\n")

# Ranking of the features (1 marks a retained feature in the recorded output).
print("\nOrdre de suppression:", sol.ranking_, sep="\n")

# Keep only the retained columns in both samples.
X_new_app = X_app[:, sol.support_]
X_new_test = X_test[:, sol.support_]

# Refit on the reduced learning sample and predict the reduced test sample.
modele_sel = lr.fit(X_new_app, y_app)
y_pred_sel = modele_sel.predict(X_new_test)

print("\nSuccès après réévaluation:", metrics.accuracy_score(y_test, y_pred_sel), sep="\n")



Nb de valeurs:
4

Variables:
[ True False False  True  True False  True False]

Ordre de suppression:
[1 2 4 1 1 3 1 5]

Succès après réévaluation:
0.7866666666666666
