# Projet 5: Catégorisation automatique des questions

## Modèle final

In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
import joblib

In [21]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns',None)

In [22]:
df_cleaned = pd.read_csv("/home/alseny/Documents/Proje5_NLP/Donnees/QueryResultCleaneed.csv")
df_cleaned.head()

Unnamed: 0,Tags,Posts,PostsClean
0,['c#'],How to convert Decimal to Double in C#? I want...,convert decimal double want assign decimal var...
1,"['c#', '.net']",How do I calculate someone's age based on a Da...,calculate someone based datetime type birthday...
2,['c#'],Calculate relative time in C# Given a specific...,calculate relative time given specific value d...
3,['.net'],Difference between Math.Floor() and Math.Trunc...,difference mathfloor mathtruncate difference
4,['c#'],Filling a DataSet or a DataTable from a LINQ q...,filling dataset datatable linq query result ex...


In [23]:
df_clean = df_cleaned.copy()

In [24]:
df_clean.Tags[0]

"['c#']"

In [25]:
import ast
df_clean.Tags = df_clean.Tags.apply(lambda x : ast.literal_eval(x))

## Quelques paires de publications

In [26]:
def print_plot(index):
    example = df_clean[df_clean.index == index][['PostsClean', 'Tags']].values[0]
    if len(example) > 0:
        print(example[0])
        print('Tag:', example[1])
print_plot(9)

distinct ordered list name datatable using linq column want generate collection unique name ordered alphabetically following query ignores order clause
 
why enforced
Tag: ['c#']


In [27]:
print_plot(8)

best allow plugins application starting application time around want create something people extend using plugin interface 
how writing hook code plugins attach specific event
Tag: ['php']


In [28]:
df_clean.drop(["Posts"], axis=1, inplace=True)## Suppression de la variable "Posts"

In [29]:
df_clean = df_clean[0:600]

In [30]:
df_clean.shape

(600, 2)

In [31]:
df_clean.head()

Unnamed: 0,Tags,PostsClean
0,[c#],convert decimal double want assign decimal var...
1,"[c#, .net]",calculate someone based datetime type birthday...
2,[c#],calculate relative time given specific value d...
3,[.net],difference mathfloor mathtruncate difference
4,[c#],filling dataset datatable linq query result ex...


## TF-IDF

In [32]:
#conda deactivate
#!pip install tensorflow

In [33]:
X = df_clean['PostsClean']

y = df_clean['Tags']

multilabel_binarizer = MultiLabelBinarizer()

y_target = multilabel_binarizer.fit_transform(y)

# Split into Training and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_target, test_size=0.3, random_state=42)

In [34]:
y_train.shape

(420, 10)

In [35]:
# Initizalize the vectorizer with max nr words and ngrams (1: single words, 2: two words in a row)
vectorizer_tfidf = TfidfVectorizer(max_features=500, ngram_range=(1,2))

## SGDClassifier

In [37]:
#Instanciation du modèle
sgd_classifier = MultiOutputClassifier(SGDClassifier())

model_sgd = Pipeline([("vectorizer", vectorizer_tfidf), ("classifier", sgd_classifier)])

model_sgd.fit(X_train, y_train)

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(max_features=500, ngram_range=(1, 2))),
                ('classifier',
                 MultiOutputClassifier(estimator=SGDClassifier()))])

In [38]:
predicted_train_tfidf = model_sgd.predict(X_train)
accuracy_train_tfidf = accuracy_score(y_train, predicted_train_tfidf)
print('Accuracy Training data: {:.1%}'.format(accuracy_train_tfidf))

predicted_test_tfidf = model_sgd.predict(X_test)
accuracy_test_tfidf = accuracy_score(y_test, predicted_test_tfidf)
print('Accuracy Test data: {:.1%}'.format(accuracy_test_tfidf))
#print('Training time: {:.1f}s'.format(training_time_tfidf))
print(classification_report(y_test, predicted_test_tfidf, target_names= ['.net','asp.net','c#','c++','java', 'javascript','php','python','sql','sql-server']))

Accuracy Training data: 99.8%
Accuracy Test data: 31.7%
              precision    recall  f1-score   support

        .net       0.33      0.41      0.36        39
     asp.net       0.60      0.86      0.71        14
          c#       0.49      0.43      0.46        49
         c++       0.30      0.12      0.17        25
        java       0.65      0.76      0.70        17
  javascript       0.80      0.67      0.73        12
         php       0.45      0.29      0.36        17
      python       0.88      0.78      0.82         9
         sql       0.64      0.41      0.50        17
  sql-server       0.78      0.75      0.77        24

   micro avg       0.54      0.49      0.51       223
   macro avg       0.59      0.55      0.56       223
weighted avg       0.53      0.49      0.50       223
 samples avg       0.48      0.52      0.48       223



  _warn_prf(average, modifier, msg_start, len(result))


## Exemple de prediction sur un texte

In [39]:
ex = "best allow plugins application starting application time around want create something people extend using plugin interface how writing hook code plugins attach specific event"

In [40]:
model_sgd.predict([ex])

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [41]:
multilabel_binarizer.inverse_transform(model_sgd.predict([ex]))

[('php',)]

# Sauvegarde du modèle

In [24]:
import pickle as pkl

In [43]:
# saving the model 
pipline_file = open("TagsPredict.pkl", "wb") 
joblib.dump(model_sgd, pipline_file) 
pipline_file.close()

## Lien vers l'application

https://tagsprediction-bhnyhaye963vexcgjrbakj.streamlit.app/