In [None]:
from glob import glob

import numpy as np
import sklearn
from sklearn import datasets
from sklearn import metrics
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [None]:
# Category names passed to load_files() to select which classes are loaded.
categories = [
    'ABSENCES',
    'AVANTAGES',
    'DEPART DE ENTREPRISE',
    'OUTILS RH',
    'PAIE, REMUNERATION ET FRAIS PROFESSIONNELS',
    'THEME AUTRE',
]

In [None]:
# Load the labelled corpus from the "document" directory, restricted to the
# selected categories, decoded as UTF-8 and shuffled with a fixed seed.
docs_to_train = sklearn.datasets.load_files(
    "document",
    description=None,
    categories=categories,
    load_content=True,
    encoding='utf-8',
    shuffle=True,
    random_state=42,
)

# A notebook cell only displays its last expression, so print every summary
# value explicitly instead of leaving bare expressions whose output is lost.
print("target names:", docs_to_train.target_names)
print("documents:", len(docs_to_train.data))
print("files:", len(docs_to_train.filenames))

In [None]:
# Peek at the first sample: the first line of its text, then its category name.
first_doc = docs_to_train.data[0]
first_line = first_doc.split("\n")[0]
print(first_line)

first_label = docs_to_train.target[0]
print(docs_to_train.target_names[first_label])

In [None]:
# Hold out 30% of the documents for evaluation; the seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    docs_to_train.data,
    docs_to_train.target,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Baseline: classification accuracy of k-nearest-neighbours with K=5.
#
# KNeighborsClassifier needs numeric feature vectors, but X_train / X_test are
# raw text documents; fitting it on them directly raises a ValueError. The
# documents are therefore vectorised (token counts -> TF-IDF) inside a
# pipeline, using the same text steps as the SGD pipeline in this notebook.
knn = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=5)),
])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
# Construct the classifier pipeline:
#   vect  - turn each document into token counts
#   tfidf - re-weight the counts with TF-IDF
#   clf   - linear classifier trained by stochastic gradient descent
#           (hinge loss with an L2 penalty)
# SGDClassifier lives in sklearn.linear_model and has to be imported in the
# notebook's import cell; without that import this cell raises NameError.
#
# NOTE(review): the category names and sample queries in this notebook are
# French, while stop_words='english' filters English words only - confirm
# that this is intended.
print('\nApplying the classifier...\n')

text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42, verbose=1)),
])

In [None]:
#count_vect = CountVectorizer(stop_words='english')
#X_train_counts = count_vect.fit_transform(raw_documents=X_train)

#tfidf_transformer = TfidfTransformer(use_idf=True)
#X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
#X_train_counts.shape
#print(X_train_counts)

In [None]:
# Fit every pipeline step (vectoriser, TF-IDF, classifier) on the training split.
text_clf.fit(X_train, y_train)

In [None]:
# Predict a category index for each held-out test document.
predicted = text_clf.predict(X_test)

In [None]:
# Accuracy on the test split: fraction of predictions equal to the true label.
matches = predicted == y_test
print(np.mean(matches))

In [None]:
# Per-category precision / recall / F1 on the test split, labelled with the
# category names instead of integer indices.
report = metrics.classification_report(
    y_test,
    predicted,
    target_names=docs_to_train.target_names,
)
print(report)

In [None]:
# Classify one ad-hoc sentence and map the predicted index to its category name.
sample_query = ['je vais etre absence aujourdhui']
predicted_index = text_clf.predict(sample_query)[0]
docs_to_train.target_names[predicted_index]

In [None]:
# Not run: export the fitted model in joblib format, ready for deployment.
# (sklearn.externals.joblib was removed in scikit-learn 0.23; import joblib directly.)
#import joblib
#joblib.dump(text_clf, 'model.joblib')

In [None]:
# Hyper-parameter grid for the text pipeline; keys use the
# "<step name>__<parameter>" convention to address a pipeline step.
# (GridSearchCV is already imported in the notebook's import cell, so the
# duplicate import that used to live here is dropped.)
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),         # with / without IDF re-weighting
    'clf__alpha': (1e-2, 1e-3),              # regularisation strength
}


In [None]:
# Exhaustive search over `parameters` with 2-fold cross-validation, running
# on all available CPU cores.
# The former `iid=False` argument is omitted: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises TypeError.
# Plain averaging of the fold scores (what iid=False selected) is now the
# only behaviour.
gs_clf = GridSearchCV(text_clf, parameters, cv=2, n_jobs=-1)

In [None]:
# Run the search on the first 400 documents of the (already shuffled) corpus.
# NOTE(review): this slice is taken from the full corpus, not from X_train,
# so it may overlap the held-out test split - confirm this is intended.
first_400 = slice(400)
gs_clf = gs_clf.fit(docs_to_train.data[first_400], docs_to_train.target[first_400])

In [None]:
# Classify an ad-hoc question with the tuned model and show its category name.
query = ['je voudrais savoir combien de jour de congé on va prendre cette noel']
#query = ['Où trouve t on le nom de notre juriste dans la base documentaire ?']
predicted_index = gs_clf.predict(query)[0]
docs_to_train.target_names[predicted_index]

In [None]:
# Classify an ad-hoc question with the tuned model and show its category name.
query = ['A  créé 3 salariés dans Gershwin mais ces personnes ne viennent plus en France comment les annuler ?A fait une annulation d embauche mais ressortent malgré tout dans les effectifs RSI correction  répond qu ils ne sont pas habilités à faire ces suppressions']
predicted_index = gs_clf.predict(query)[0]
docs_to_train.target_names[predicted_index]

In [None]:
# Classify an ad-hoc question with the tuned model and show its category name.
query = ['Un salarié en congé parentale non rémunéré jusqu souhaite revenir en date du. cela est - il possible?']
predicted_index = gs_clf.predict(query)[0]
docs_to_train.target_names[predicted_index]

In [None]:
# Classify an ad-hoc question with the tuned model and show its category name.
query = ['Jai créé expression de besoin AJ000049 Elle a bien le statut diffusée mais  pas eu de confirmation par mail et les ETT ont pas recu expression de besoin']
predicted_index = gs_clf.predict(query)[0]
docs_to_train.target_names[predicted_index]

In [None]:

# Report the best mean cross-validated score and the parameter values that
# achieved it. The score is printed explicitly: a bare expression in the
# middle of a cell is evaluated but never displayed.
print("best score: %r" % gs_clf.best_score_)

for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    
  

In [None]:
# Full cross-validation results recorded by the grid search (the value is
# displayed as the cell output).
gs_clf.cv_results_

In [None]:
#y_true, y_pred = y_test, gs_clf.predict(X_test)
#print(metrics.classification_report(y_true, y_pred))

In [None]:
# Second grid search: candidate values of k (1 to 30 inclusive).
# Materialised as a list so that print() shows the actual values rather than
# the text "range(1, 31)". GridSearchCV is already imported in the notebook's
# import cell, so it is not imported again here.
k_range = list(range(1, 31))
print(k_range)

In [None]:
# Search space for the second grid search: one entry, the neighbour counts to try.
param_grid = {'n_neighbors': k_range}
print(param_grid)

In [None]:
# Grid search over the number of neighbours, scored by accuracy with 2-fold CV.
#
# `n_neighbors` is a KNeighborsClassifier parameter. text_clf ends in an
# SGDClassifier, so searching `n_neighbors` on it fails with
# "Invalid parameter". The search therefore runs on a KNN pipeline that uses
# the same text steps, and the keys of `param_grid` are prefixed with the
# pipeline step name ("knn__") so they address the classifier step.
knn_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('knn', KNeighborsClassifier()),
])
knn_param_grid = {'knn__' + name: list(values) for name, values in param_grid.items()}
grid = GridSearchCV(knn_clf, knn_param_grid, cv=2, scoring='accuracy')

In [None]:
# Run the second search on the first 400 documents of the corpus.
first_400 = slice(400)
grid.fit(docs_to_train.data[first_400], docs_to_train.target[first_400])

In [None]:
# `grid_scores_` was removed in scikit-learn 0.20; `cv_results_` replaces it.
# Show each parameter setting next to its mean cross-validated test score.
list(zip(grid.cv_results_['params'], grid.cv_results_['mean_test_score']))