In [None]:
import pandas as pd
import numpy as np
import model
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
from sklearn.tree import plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.features import Rank2D


In [None]:
# SQL used to build the training set.
# Rows come from `inspection`, inner-joined to `etablissement` (eta) and
# `activite` (act), ordered by establishment id.
# - moy_score is multiplied by 10 and cast to INTEGER.
# - synthese_eval (text) is mapped to an ordinal 4..1; the CASE has no ELSE
#   branch, so any other text value comes back as NULL.
# NOTE(review): presumably one row per inspection, assuming the join keys are
# unique in eta/act - confirm against the schema.
sqlForTraining = """
select
	eta.idetablissement,
	eta.departement,
	eta.siren,
	eta.geores_lat,
	eta.geores_lon,
	eta.nb_agrements,
	eta.nb_inspections,
	CAST (eta.moy_score*10 AS INTEGER) as moy_score,
	eta.commune_norm,
	act.idactivite,
	act.categorie_frais,
	CASE 
      WHEN ins.synthese_eval = 'Très satisfaisant'  THEN 4
      WHEN ins.synthese_eval = 'Satisfaisant'  THEN 3
      WHEN ins.synthese_eval = 'A améliorer'  THEN 2
      WHEN ins.synthese_eval = 'A corriger de manière urgente'  THEN 1
	END	as synthese_eval
from inspection ins
join etablissement eta on ins.idetablissement = eta.idetablissement
join activite act on ins.idactivite = act.idactivite
order by eta.idetablissement
"""

In [None]:
# Run the training query through the project's SQLAlchemy session and load the
# result into a DataFrame.
df = pd.read_sql_query(sqlForTraining, model.session.connection())

# Series.astype returns a new Series; assign it back so the integer cast is
# actually applied to the frame (the bare call was a no-op).
df['moy_score'] = df['moy_score'].astype('int')

# Column dtypes and non-null counts, as a quick sanity check of the load.
df.info()

In [None]:
# Feature matrix: explicit copy of the selected columns so the column
# assignments below modify an independent frame (no SettingWithCopyWarning,
# and `df` is left untouched).
X = df[[ 'commune_norm', 'geores_lat', 'geores_lon', 'siren', 'categorie_frais', 'nb_agrements', 'nb_inspections', 'departement', 'idactivite']].copy()
X['categorie_frais'] = X['categorie_frais'].astype('int')

# Replace the two identifier-like columns by integer codes.
# One encoder per column: refitting a single encoder would discard the first
# mapping and make inverse_transform impossible for `siren`.
siren_encoder = LabelEncoder()
X['siren'] = siren_encoder.fit_transform(X['siren'])

# `labelencoder` keeps its original name and ends up fitted on commune_norm,
# exactly as before, for any later cell that relies on it.
labelencoder = LabelEncoder()
X['commune_norm'] = labelencoder.fit_transform(X['commune_norm'])

# Target: the ordinal inspection rating produced by the SQL CASE expression.
y = df['synthese_eval']

In [None]:
# Distinct target values and how many rows carry each of them.
class_values, class_counts = np.unique(y, return_counts=True)
print((class_values, class_counts))

In [None]:
# X.info()

In [None]:
# Yellowbrick Rank2D visualizer configured with the 'pearson' algorithm,
# applied to the feature matrix X.
r2D = Rank2D(algorithm='pearson')
# fit, then transform, then show, in that order.
# NOTE(review): presumably transform() is the step that draws - confirm against
# the yellowbrick documentation before reordering or merging these calls.
r2D.fit(X)
r2D.transform(X)
r2D.show()

In [None]:
# print(np.corrcoef(X, rowvar=False))

# sns.pairplot(X)

In [None]:
# sns.pairplot(X[['geores_lat', 'geores_lon']])


In [None]:
from yellowbrick.target import FeatureCorrelation

# The target is a class label (the 1..4 rating) and every model in this
# notebook is a classifier, so use the classification variant of mutual
# information rather than the regression one.
fc = FeatureCorrelation(method='mutual_info-classification', sort=True)
# Extra keyword arguments are forwarded to the mutual-information estimator;
# random_state pins its internal randomness so the plot is reproducible.
# NOTE(review): several columns are integer codes (siren, commune_norm, ...);
# discrete_features=False treats them all as continuous - confirm this is intended.
fc.fit(X, y, discrete_features=False, random_state=42)
fc.show()

In [None]:
# Helper that simplifies training and metric computation; seen in the video
# "Ex corrigé Random Forest avec Sklearn" by "EpiMed Open Source":
# https://www.youtube.com/watch?v=ydk1mUqwsVQ
def calculate_accuracy(classifier, X_train, X_test, y_train, y_test):
    """Fit ``classifier`` on the training split and report accuracy on both splits.

    The classifier is (re)fitted in place on ``X_train``/``y_train``, then used
    to predict both splits. The two accuracy scores are printed with three
    decimals.

    Returns:
        A tuple ``(accuracy_train, accuracy_test, classifier)`` where
        ``classifier`` is the same, now fitted, object that was passed in.
    """
    classifier.fit(X_train, y_train)

    # Score each split in turn: training first, then test.
    scores = []
    for features, target in ((X_train, y_train), (X_test, y_test)):
        predicted = classifier.predict(features)
        scores.append(metrics.accuracy_score(target, predicted))
    accuracy_train, accuracy_test = scores

    print(f"Train accuracy {accuracy_train:.3f} Test accuracy {accuracy_test:.3f}")
    return accuracy_train, accuracy_test, classifier

In [None]:
# Split the dataset into train (70%) and test parts, stratified on the target
# so both parts keep the same class proportions; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    stratify=y,
    random_state=42,
)

In [None]:
# Check the class proportions of the target in each split (train, then test).
for split_target in (y_train, y_test):
    print(split_target.value_counts(normalize=True))

In [None]:
# Check how the samples were distributed between the two splits.
train_shape, test_shape = X_train.shape, X_test.shape
print("Train : ", train_shape, "Test : ", test_shape)

In [None]:
# Baseline decision tree: seeded, with class weights set to "balanced".
tree_options = {"random_state": 42, "class_weight": "balanced"}
classifier_tree = DecisionTreeClassifier(**tree_options)

# Display the full parameter set as the cell output.
classifier_tree.get_params()

In [None]:
# calculate_accuracy() fits the classifier itself before scoring, so a separate
# classifier_tree.fit(...) call would only train the same tree a second time.
calculate_accuracy(classifier_tree, X_train, X_test, y_train, y_test)

In [None]:
# Draw the fitted baseline tree with feature names and filled nodes.
plt.figure(figsize=(10,10), dpi=150)
# Trailing ';' keeps the returned list of annotation objects out of the cell
# output; only the figure is shown.
plot_tree(classifier_tree, feature_names=list(X_train.columns), filled=True);

In [None]:
# Yellowbrick confusion-matrix visualizer wrapping the baseline tree.
clfConfusion = ConfusionMatrix(classifier_tree)
# Scored on the held-out test split only; no fit() is called on the visualizer.
# NOTE(review): this relies on classifier_tree having been fitted in an earlier
# cell - confirm yellowbrick accepts a pre-fitted estimator this way.
clfConfusion.score(X_test, y_test)
clfConfusion.show()

In [None]:
# Predictions of the baseline tree on the test split, followed by the
# per-class text report.
y_pred = classifier_tree.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Second tree: same settings as the baseline, but depth capped at 30.
classifier_tree_2 = DecisionTreeClassifier(max_depth=30, random_state=42, class_weight="balanced")

# calculate_accuracy() fits the classifier itself before scoring, so the tree
# is trained once here rather than fitted separately and then fitted again.
calculate_accuracy(classifier_tree_2, X_train, X_test, y_train, y_test)

In [None]:
# Predictions of the depth-limited tree on the test split, followed by the
# per-class text report.
y_pred_2 = classifier_tree_2.predict(X_test)
report_2 = classification_report(y_test, y_pred_2)
print(report_2)

In [None]:
# Yellowbrick confusion-matrix visualizer wrapping the depth-limited tree
# (rebinds the name clfConfusion).
clfConfusion = ConfusionMatrix(classifier_tree_2)
# Scored on the held-out test split only; no fit() is called on the visualizer.
# NOTE(review): this relies on classifier_tree_2 having been fitted in an earlier
# cell - confirm yellowbrick accepts a pre-fitted estimator this way.
clfConfusion.score(X_test, y_test)
clfConfusion.show()

In [None]:
# Draw the fitted depth-limited tree with feature names and filled nodes.
plt.figure(figsize=(10,10), dpi=150)
# Trailing ';' keeps the returned list of annotation objects out of the cell
# output; only the figure is shown.
plot_tree(classifier_tree_2, feature_names=list(X_train.columns), filled=True);