In [1]:
import pandas as pd
import numpy as np
import model
import utils
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
# SQL that builds the training set: one row per inspection, joined to its
# establishment (`etablissement`) and its activity (`activite`).
# - `moy_score` is multiplied by 10 and cast to INTEGER in the query.
# - The textual `synthese_eval` is mapped to an ordinal 4 (best) .. 1 (worst);
#   the CASE has no ELSE branch, so any other value comes back as NULL.
sqlForTraining = """
select
	ins.idinspection,
	eta.idetablissement,
	eta.departement,
	eta.siren,
	eta.geores_lat,
	eta.geores_lon,
	eta.nb_agrements,
	eta.nb_inspections,
	CAST (eta.moy_score*10 AS INTEGER) as moy_score,
	eta.commune_norm,
	act.idactivite,
	act.categorie_frais,
	CASE 
      WHEN ins.synthese_eval = 'Très satisfaisant'  THEN 4
      WHEN ins.synthese_eval = 'Satisfaisant'  THEN 3
      WHEN ins.synthese_eval = 'A améliorer'  THEN 2
      WHEN ins.synthese_eval = 'A corriger de manière urgente'  THEN 1
	END	as synthese_eval
from inspection ins
join etablissement eta on ins.idetablissement = eta.idetablissement
join activite act on ins.idactivite = act.idactivite
order by eta.idetablissement
"""

In [3]:
# Run the training query on the connection of the project's `model.session`
# and load the full result set into a DataFrame.
dbConnection = model.session.connection()
df = pd.read_sql_query(sql=sqlForTraining, con=dbConnection)

In [4]:
# Project helper; presumably adds the word01..word13 columns used further
# down (they appear in the sample output below) -- confirm in utils.
df = utils.fillWords(df)

# `Series.astype` returns a new Series: assign it back, otherwise the cast
# is computed and thrown away.
df['moy_score'] = df['moy_score'].astype('int')

# Shuffle the rows and renumber the index. The fixed seed (same value as the
# train/test split) keeps the notebook reproducible on restart-and-run-all.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [5]:
# Eyeball five random rows of the prepared frame.
df.sample(n=5)

Unnamed: 0,idinspection,idetablissement,departement,siren,geores_lat,geores_lon,nb_agrements,nb_inspections,moy_score,commune_norm,...,word04,word05,word06,word07,word08,word09,word10,word11,word12,word13
12070,21141,17360,63,751785429,45.343664,3.040339,1,2,40,anzat le luguet,...,volaille,maison,ourcq,lauze,,,,,,
18218,6889,5940,16,344695226,45.958616,0.2632,1,2,35,poursac,...,bouheret,hougue,pignol,cudraz,ecole,earl,,,,
26491,4471,4108,94,311799456,48.748808,2.3939,1,1,40,orly,...,viandes,laitiers,centre,vert,chaine,yvan,carnes,,,
24817,14947,12280,72,479718470,48.122579,-0.016725,0,1,30,conlie,...,ordener,produits,clemenceau,caro,carnes,,,,,
30216,25870,21473,60,814124830,49.416348,2.826367,0,2,30,compiegne,...,bonheur,sarl,volvic,,,,,,,


In [None]:
# Replace missing values in the thirteen word columns (word01 .. word13)
# with the empty string.
wordColumns = ['word{:02d}'.format(i) for i in range(1, 14)]
df[wordColumns] = df[wordColumns].fillna('')

In [None]:
# Feature matrix: location, establishment counters, activity identifiers and
# the thirteen word columns.
featureColumns = [
    'geores_lat', 'geores_lon', 'categorie_frais', 'nb_agrements',
    'nb_inspections', 'departement', 'idactivite',
    'word01', 'word02', 'word03', 'word04', 'word05', 'word06', 'word07',
    'word08', 'word09', 'word10', 'word11', 'word12', 'word13',
]

# Explicit copy: X is modified column by column afterwards, and assigning
# into a bare slice of `df` raises SettingWithCopyWarning with ambiguous
# view/copy semantics.
X = df[featureColumns].copy()
X['categorie_frais'] = X['categorie_frais'].astype('int')


In [None]:
# A single LabelEncoder is refitted on each word column in turn, so every
# column gets its own independent integer codes and, once the loop is done,
# the encoder only remembers the classes of word13.
labelencoder = LabelEncoder()
for wordColumn in ['word{:02d}'.format(i) for i in range(1, 14)]:
    # Overwrite the text column in place with its integer codes.
    X[wordColumn] = labelencoder.fit_transform(X[wordColumn])


In [None]:
# Target vector: the ordinal inspection grade produced by the SQL CASE.
y = df.loc[:, 'synthese_eval']

In [None]:
# Eyeball five random rows of the encoded feature matrix.
X.sample(n=5)

In [None]:
# 80/20 train/test split, stratified on the target so both parts keep the
# same grade distribution; the seed makes the split repeatable.
XTrain, XTest, yTrain, yTest = train_test_split(
    X,
    y,
    stratify=y,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Eyeball five random rows of the training features.
XTrain.sample(n=5)

In [None]:
# Eyeball five random rows of the test features.
XTest.sample(n=5)

In [None]:
from sklearn.preprocessing import StandardScaler

# Scale every feature without centering it (with_mean=False).
# The scaler is fitted on the training split only and then applied
# unchanged to both splits.
stdSc = StandardScaler(with_mean=False)
stdSc.fit(XTrain)
ZTrain = stdSc.transform(XTrain)
ZTest = stdSc.transform(XTest)

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier.
xgbParams = {
    'learning_rate': 0.01,
    'n_estimators': 1000,
    'max_depth': 20,
    'nthread': 6,
    'verbosity': 0,
}

# Despite its name, `reg` is a classifier; the name is kept because the
# evaluation cells below refer to it.
# NOTE(review): the target values are 1..4; some XGBoost releases expect
# class labels starting at 0 -- confirm against the installed version.
reg = XGBClassifier(**xgbParams)
reg.fit(ZTrain, yTrain)

# Predicted grades for the held-out split.
yPred = reg.predict(ZTest)

In [None]:
from yellowbrick.model_selection import FeatureImportances

# The scaled arrays passed to fit carry no column names, so the bars are
# labelled with the names of the original training frame. relative=False
# is passed so the importances are not shown relative to the strongest one
# (TODO confirm against the yellowbrick docs).
featureNames = XTrain.columns
clfFI = FeatureImportances(reg, labels=featureNames, relative=False)
clfFI.fit(ZTrain, yTrain)
clfFI.show()

In [None]:
from yellowbrick.classifier import ConfusionMatrix
# Confusion matrix of the trained classifier `reg` on the held-out split.
# The visualizer is scored directly, without a prior call to its own fit().
clfConfusion = ConfusionMatrix(reg)
clfConfusion.score(ZTest, yTest)
clfConfusion.show()

In [None]:
from sklearn.metrics import classification_report

# Text report of the held-out predictions against the true grades; printed
# so the multi-line table renders as text rather than as a quoted string.
reportText = classification_report(y_true=yTest, y_pred=yPred)
print(reportText)