In [94]:
import pandas as pd
import numpy as np
from ast import literal_eval
import ast

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import hamming_loss
from sklearn.cluster import KMeans

from scipy.sparse import hstack

In [142]:
# Load the StackSample CSV export (semicolon-delimited, UTF-8 encoded).
df = pd.read_csv(
    'StackSample_kaggle.csv',
    sep=';',
    encoding="utf-8",
)

In [143]:
# Preview the first rows to check which columns were parsed from the file.
df.head()

Unnamed: 0.1,Unnamed: 0,ï»¿,Title,Body,Tags
0,0,1,good branch merge tutorials tortoisesvn,really good tutorials explain branch merge apa...,['svn']
1,1,2,asp.net site map,anyone get experience create sql-based asp.net...,"['sql', 'asp.net']"
2,2,3,function create color wheel,something pseudo-solved many time never quite ...,['algorithm']
3,3,4,add script functionality .net applications,little game write c use database back-end trad...,"['c#', '.net']"
4,4,5,use nest class case,work collection class use video playback recor...,"['c++', 'oop', 'class']"


In [147]:
# Tags are stored in the CSV as the string form of a Python list
# (e.g. "['sql', 'asp.net']"); parse them back into real lists.
# Title and Body are already plain strings and need no parsing.
#
# Only string entries are parsed, so re-running this cell after the column
# has already been converted leaves it unchanged instead of raising inside
# literal_eval (which rejects list objects).
df['Tags'] = df['Tags'].apply(
    lambda tags: literal_eval(tags) if isinstance(tags, str) else tags
)

In [148]:
# Spot-check one parsed entry: the tags of the row labelled 45.
df.loc[45, 'Tags']

['regex']

In [101]:
#df['Title String'] = df['Lemmatized Title'].apply(lambda x: ' '.join(map(str, x)))
#df['Body String'] = df['Lemmatized Body'].apply(lambda x: ' '.join(map(str, x)))

In [189]:
# Inputs: question body (X1) and question title (X2); target: tag lists (y).
X1, X2, y = df['Body'], df['Title'], df['Tags']

In [190]:
# Sanity check: confirm that a title entry is a plain string.
type(X2[0])

str

In [191]:
# Encode the per-question tag lists as a binary indicator matrix with one
# column per distinct tag. The fitted binarizer is kept so predictions can be
# mapped back to tag names later with inverse_transform.
multilabel_binarizer = MultiLabelBinarizer()
y_bin = multilabel_binarizer.fit_transform(y)

In [192]:
# Display the body-text Series (pandas abbreviates the middle rows).
X1

0        really good tutorials explain branch merge apa...
1        anyone get experience create sql-based asp.net...
2        something pseudo-solved many time never quite ...
3        little game write c use database back-end trad...
4        work collection class use video playback recor...
                               ...                        
58253    background look potentially use mpc5200 static...
58254    want create app apple watch give user simple p...
58255    pass value template events html template name ...
58256    get weird error sql server cannot find solutio...
58257    scalaz every monad instance automatically inst...
Name: Body, Length: 58258, dtype: object

In [193]:
# Body (X1) and title (X2) each get their own TF-IDF vectorizer, and hence
# their own vocabulary, but both are built from one shared configuration so
# the two settings cannot drift apart.
vectorizer_X1, vectorizer_X2 = (
    TfidfVectorizer(
        analyzer='word',
        min_df=0.0,
        max_df=1.0,
        strip_accents=None,
        encoding='utf-8',
        preprocessor=None,
        # A token is any run of two or more non-whitespace characters, which
        # keeps terms such as "c#", "c++" and ".net" in one piece.
        token_pattern=r"(?u)\S\S+",
        # Cap each vocabulary at the 1000 most frequent terms.
        max_features=1000,
    )
    for _ in range(2)
)

In [194]:
# Learn each vocabulary and produce the TF-IDF matrices for body and title.
# .astype('U') casts every entry to a unicode string first -- presumably to
# guard against non-string cells such as NaN (TODO confirm).
X1_tfidf = vectorizer_X1.fit_transform(X1.astype('U'))
X2_tfidf = vectorizer_X2.fit_transform(X2.astype('U'))

In [195]:
# Concatenate the two sparse feature matrices column-wise: body columns
# first, then title columns. Inference must stack them in the same order.
X_tfidf = hstack([X1_tfidf,X2_tfidf])

In [196]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_bin, test_size = 0.2, random_state = 0)

In [197]:
def avg_jacard(y_true, y_pred, zero_division=0.0):
    """Return the mean per-sample Jaccard score, as a percentage.

    For every row the score is ``|true AND pred| / |true OR pred|`` computed
    on binary indicator vectors; the row scores are then averaged and
    multiplied by 100.

    Parameters
    ----------
    y_true : array-like of shape (n_samples, n_labels)
        Ground-truth binary indicator matrix.
    y_pred : array-like of shape (n_samples, n_labels)
        Predicted binary indicator matrix.
    zero_division : float, default 0.0
        Score assigned to a row in which both the truth and the prediction
        are empty (the union is zero, so the ratio is undefined). The default
        follows the convention scikit-learn's ``jaccard_score`` uses; pass
        ``1.0`` to count an empty/empty row as a perfect match.

    Returns
    -------
    float
        Mean Jaccard score in the range [0, 100].
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    intersection = np.minimum(y_true, y_pred).sum(axis=1)
    union = np.maximum(y_true, y_pred).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is
    # non-zero; a plain division would yield NaN for empty/empty rows and a
    # single NaN would poison the overall mean.
    scores = np.full(union.shape, float(zero_division))
    np.divide(intersection, union, out=scores, where=union > 0)
    return scores.mean() * 100

def print_score(y_pred, clf, y_test):
    """Print a short evaluation report for one classifier.

    Writes the classifier's class name, the mean Jaccard score and the
    Hamming loss (both as percentages), followed by a ``---`` separator line.

    Parameters
    ----------
    y_pred : binary indicator matrix of predicted labels.
    clf : estimator whose class name labels the report.
    y_test : binary indicator matrix of true labels for the same rows.
    """
    name = clf.__class__.__name__
    print("Clf: ", name)

    jaccard_pct = avg_jacard(y_test, y_pred)
    print("Jacard score: {}".format(jaccard_pct))

    hamming_pct = hamming_loss(y_pred, y_test) * 100
    print("Hamming loss: {}".format(hamming_pct))

    print("---")

In [161]:
# Candidate models: a dummy baseline plus six linear / probabilistic
# classifiers, all with default hyper-parameters.
dummy = DummyClassifier()
sgd = SGDClassifier()
lr = LogisticRegression()
mn = MultinomialNB()
svc = LinearSVC()
perceptron = Perceptron()
pac = PassiveAggressiveClassifier()

# Each model is wrapped one-vs-rest so that a separate binary classifier is
# trained per tag, then scored on the held-out split.
for classifier in [dummy, sgd, lr, mn, svc, perceptron, pac]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # print_score takes (y_pred, clf, y_test); the ground truth must be passed
    # explicitly or the call raises TypeError on a fresh kernel.
    print_score(y_pred, classifier, y_test)

Clf:  DummyClassifier
Jacard score: 0.0
Hamming loss: 1.5775832475111569
---
Clf:  SGDClassifier
Jacard score: 47.137973452340084
Hamming loss: 0.974339169241332
---
Clf:  LogisticRegression
Jacard score: 47.852586108250364
Hamming loss: 0.9830930312392723
---
Clf:  MultinomialNB
Jacard score: 40.08093848593334
Hamming loss: 1.1818571918983864
---
Clf:  LinearSVC
Jacard score: 51.91140290651105
Hamming loss: 0.9703055269481633
---
Clf:  Perceptron
Jacard score: 46.131115851764676
Hamming loss: 1.5005149330587024
---
Clf:  PassiveAggressiveClassifier
Jacard score: 48.471479792283084
Hamming loss: 1.2925677995193958
---


In [199]:
# LinearSVC had the highest Jaccard score in the comparison, so refit it on
# its own; `clf` refers to this model in the cells that follow.
svc = LinearSVC()
clf = OneVsRestClassifier(svc).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print_score(y_pred, svc, y_test)

Clf:  LinearSVC
Jacard score: 51.91140290651105
Hamming loss: 0.9703055269481633
---


In [200]:
# Score the same fitted model on the training split, for comparison with the
# held-out result. Note: this overwrites y_pred, which previously held the
# test-set predictions.
y_pred = clf.predict(X_train)
print_score(y_pred, svc, y_train)

Clf:  LinearSVC
Jacard score: 66.91005449942068
Hamming loss: 0.6284598549542978
---


In [203]:
def tags_predict(classifier, qn, title):
    """Predict and print the tags for a single question.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``, trained on the
        column-wise stacked body/title TF-IDF features.
    qn : question body text.
    title : question title text.

    Prints a one-element list holding the tuple of predicted tag names.
    Returns None.
    """
    def _as_text(value):
        # Training casts every document with ``.astype('U')`` before
        # vectorizing; mirror that here so a non-string entry (e.g. NaN from
        # a missing cell) is vectorized rather than rejected. str/bytes are
        # passed through untouched so the vectorizer handles them as before.
        return value if isinstance(value, (str, bytes)) else str(value)

    body_features = vectorizer_X1.transform([_as_text(qn)])
    title_features = vectorizer_X2.transform([_as_text(title)])

    # Column order must match the training matrix: body first, then title.
    features = hstack([body_features, title_features])

    predicted = classifier.predict(features)
    print(multilabel_binarizer.inverse_transform(predicted))

In [210]:
# Try the fitted model on one question from the dataset (row 45).
tags_predict(clf, X1[45], X2[45])

[('c#', 'regex')]


In [211]:
# Ground-truth tags for row 45, for comparison with the model's prediction.
print(y[45])

['regex']
