In [138]:
import pandas as pd
import numpy as np
from ast import literal_eval
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans

from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn import metrics

import joblib
import matplotlib.pyplot as plt
from scipy.sparse import hstack
import dill as pickled

In [34]:
df = pd.read_csv('../StackSample_kaggle.csv', encoding="utf-8", sep=';')

In [35]:
df.head()

Unnamed: 0.1,Unnamed: 0,ï»¿,Title,Body,Tags
0,0,1,good branch merge tutorials tortoisesvn,really good tutorials explain branch merge apa...,['svn']
1,1,2,asp.net site map,anyone get experience create sql-based asp.net...,"['sql', 'asp.net']"
2,2,3,function create color wheel,something pseudo-solved many time never quite ...,['algorithm']
3,3,4,add script functionality .net applications,little game write c use database back-end trad...,"['c#', '.net']"
4,4,5,use nest class case,work collection class use video playback recor...,"['c++', 'oop', 'class']"


In [36]:
df['Question'] = df['Title'] + df['Title'] + df['Title'] + df['Body'] 

In [38]:
df['Question']

0        good branch merge tutorials tortoisesvngood br...
1        asp.net site mapasp.net site mapasp.net site m...
2        function create color wheelfunction create col...
3        add script functionality .net applicationsadd ...
4        use nest class caseuse nest class caseuse nest...
                               ...                        
58253    dramatic speed drop access static ram cache c+...
58254    social framework watchkitsocial framework watc...
58255    pass parameters template event meteorpass para...
58256    msdtc server erver unavailablemsdtc server erv...
58257    automatically implement classautomatically imp...
Name: Question, Length: 58258, dtype: object

In [39]:
df['Tags'] = df['Tags'].apply(literal_eval)

In [40]:
df['tags'] = df['Tags'].apply(lambda x: " ".join(x))

In [41]:
df['tags']

0                  svn
1          sql asp.net
2            algorithm
3              c# .net
4        c++ oop class
             ...      
58253              c++
58254      objective-c
58255       javascript
58256       sql-server
58257          haskell
Name: tags, Length: 58258, dtype: object

In [144]:
count_vectorizer = CountVectorizer(tokenizer = text_splitter, binary='true')
multilabel_y = count_vectorizer.fit_transform(df['tags'])

In [145]:
multilabel_y

<58258x100 sparse matrix of type '<class 'numpy.int64'>'
	with 91838 stored elements in Compressed Sparse Row format>

In [127]:
joblib.dump(count_vectorizer, 'count_vectorizer.joblib')

['count_vectorizer.joblib']

In [46]:
X_train, X_test, y_train, y_test = train_test_split(df['Question'], multilabel_y, test_size = 0.2, random_state = 0)

In [115]:
def text_splitter(text):
    return text.split()

In [128]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", tokenizer = text_splitter, sublinear_tf=False, ngram_range=(1,2))

In [129]:
x_train_multilabel = vectorizer.fit_transform(X_train.astype('U'))

In [131]:
joblib.dump(vectorizer, 'vectorizer.joblib')

['vectorizer.joblib']

In [None]:
x_test_multilabel = vectorizer.transform(X_test.astype('U'))

In [62]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)

Dimensions of train data X: (46606, 111133) Y : (46606, 100)
Dimensions of test data X: (11652, 111133) Y: (11652, 100)


In [74]:
def metrics_report(y_test, predictions):
    print("Accuracy :",metrics.accuracy_score(y_test, predictions))
    print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


    precision = precision_score(y_test, predictions, average='micro')
    recall = recall_score(y_test, predictions, average='micro')
    f1 = f1_score(y_test, predictions, average='micro')

    print("Micro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    precision = precision_score(y_test, predictions, average='macro')
    recall = recall_score(y_test, predictions, average='macro')
    f1 = f1_score(y_test, predictions, average='macro')

    print("Macro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    print (metrics.classification_report(y_test, predictions))

In [79]:
sgd_classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
sgd_classifier.fit(x_train_multilabel, y_train)
predictions = sgd_classifier.predict (x_test_multilabel)
metrics_report(y_test, predictions)

Accuracy : 0.35264332303467216
Hamming loss  0.009794026776519053
Micro-average quality numbers
Precision: 0.8016, Recall: 0.5039, F1-measure: 0.6188
Macro-average quality numbers
Precision: 0.7152, Recall: 0.4286, F1-measure: 0.5235
              precision    recall  f1-score   support

           0       0.56      0.20      0.29       522
           1       0.64      0.47      0.54        78
           2       0.81      0.37      0.51       152
           3       0.96      0.81      0.88       933
           4       0.98      0.83      0.90       145
           5       0.71      0.18      0.29        56
           6       0.61      0.41      0.49       143
           7       0.64      0.28      0.39       253
           8       0.72      0.53      0.61       176
           9       0.50      0.27      0.35        67
          10       0.81      0.46      0.58        83
          11       0.77      0.42      0.54       347
          12       0.83      0.56      0.67      1348
         

  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
pa_classifier = OneVsRestClassifier(PassiveAggressiveClassifier(loss='log'), n_jobs=-1)
pa_classifier.fit(x_train_multilabel, y_train)
predictions = pa_classifier.predict (x_test_multilabel)
metrics_report(y_test, predictions)

Accuracy : 0.3631994507380707
Hamming loss  0.009965671129419842
Micro-average quality numbers
Precision: 0.7605, Recall: 0.5375, F1-measure: 0.6299
Macro-average quality numbers
Precision: 0.6927, Recall: 0.4603, F1-measure: 0.5436
              precision    recall  f1-score   support

           0       0.40      0.24      0.30       522
           1       0.63      0.47      0.54        78
           2       0.79      0.47      0.59       152
           3       0.95      0.85      0.90       933
           4       0.98      0.88      0.93       145
           5       0.67      0.18      0.28        56
           6       0.64      0.44      0.52       143
           7       0.60      0.34      0.43       253
           8       0.70      0.55      0.62       176
           9       0.64      0.27      0.38        67
          10       0.82      0.51      0.63        83
          11       0.74      0.49      0.59       347
          12       0.74      0.57      0.65      1348
          

In [87]:
svc_classifier = OneVsRestClassifier(LinearSVC(penalty='l2', loss='hinge'), n_jobs=-1)
svc_classifier.fit(x_train_multilabel, y_train)
predictions = svc_classifier.predict (x_test_multilabel)
metrics_report(y_test, predictions)

Accuracy : 0.3713525575008582
Hamming loss  0.009361483007209064
Micro-average quality numbers
Precision: 0.8524, Recall: 0.4917, F1-measure: 0.6237
Macro-average quality numbers
Precision: 0.7629, Recall: 0.4058, F1-measure: 0.5026
              precision    recall  f1-score   support

           0       0.70      0.16      0.26       522
           1       0.69      0.46      0.55        78
           2       0.85      0.30      0.45       152
           3       0.97      0.83      0.89       933
           4       0.98      0.82      0.89       145
           5       0.00      0.00      0.00        56
           6       0.65      0.36      0.46       143
           7       0.70      0.27      0.39       253
           8       0.74      0.56      0.64       176
           9       0.70      0.21      0.32        67
          10       0.90      0.43      0.59        83
          11       0.82      0.39      0.53       347
          12       0.86      0.54      0.66      1348
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [96]:
logreg_classifier = OneVsRestClassifier(LogisticRegression(penalty='l2', max_iter=500, n_jobs=-1))
logreg_classifier.fit(x_train_multilabel, y_train)
predictions = logreg_classifier.predict (x_test_multilabel)
metrics_report(y_test, predictions)

Accuracy : 0.2482835564709921
Hamming loss  0.011404050806728459
Micro-average quality numbers
Precision: 0.8492, Recall: 0.3370, F1-measure: 0.4825
Macro-average quality numbers
Precision: 0.7296, Recall: 0.2345, F1-measure: 0.3385
              precision    recall  f1-score   support

           0       0.66      0.12      0.20       522
           1       0.70      0.29      0.41        78
           2       0.90      0.17      0.29       152
           3       0.98      0.68      0.80       933
           4       0.98      0.44      0.61       145
           5       0.00      0.00      0.00        56
           6       0.67      0.31      0.42       143
           7       0.69      0.18      0.28       253
           8       0.73      0.38      0.50       176
           9       0.60      0.04      0.08        67
          10       0.95      0.24      0.38        83
          11       0.87      0.25      0.39       347
          12       0.86      0.41      0.56      1348
          

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
perc_classifier = OneVsRestClassifier(Perceptron(alpha=0.00001, penalty='l1', n_jobs=-1))
perc_classifier.fit(x_train_multilabel, y_train)
predictions = perc_classifier.predict (x_test_multilabel)
metrics_report(y_test, predictions)

Accuracy : 0.20537246824579472
Hamming loss  0.015929454170957775
Micro-average quality numbers
Precision: 0.4956, Recall: 0.5536, F1-measure: 0.5230
Macro-average quality numbers
Precision: 0.4969, Recall: 0.4915, F1-measure: 0.4655
              precision    recall  f1-score   support

           0       0.24      0.32      0.27       522
           1       0.39      0.56      0.46        78
           2       0.50      0.41      0.45       152
           3       0.83      0.88      0.85       933
           4       0.90      0.86      0.88       145
           5       0.13      0.05      0.08        56
           6       0.32      0.08      0.13       143
           7       0.35      0.38      0.36       253
           8       0.46      0.57      0.51       176
           9       0.38      0.37      0.38        67
          10       0.69      0.49      0.58        83
          11       0.49      0.17      0.26       347
          12       0.48      0.70      0.57      1348
         

  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
joblib.dump(sgd_classifier, 'SGD_2_clf.joblib')
joblib.dump(logreg_classifier, 'LogReg_2_clf.joblib')
joblib.dump(pa_classifier, 'PassAgg_2_clf.joblib')
joblib.dump(svc_classifier, 'LinearSVC_2_clf.joblib')
joblib.dump(perc_classifier,'Perceptron_2_clf.joblib')

['Perceptron_2_clf.joblib']

In [134]:
vectorizer_2 = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,2))

In [135]:
x_train_multilabel = vectorizer_2.fit_transform(X_train.astype('U'))

In [142]:
save_vec = open("test_vec.pkl", 'wb')
pickled.dump(vectorizer_2, save_vec)

In [147]:
save_count_vec = open("test_count_vec.pickle", 'wb')
pickled.dump(count_vectorizer, save_vec)

In [149]:
with open("test_count_vec.pickle", "wb") as f:
    pickled.dump(count_vectorizer, f)

In [150]:
with open("test_vec.pickle", "wb") as f:
    pickled.dump(vectorizer_2, f)