In [6]:
# Silence every warning for the rest of the session. This keeps cell output short,
# but it also hides warnings raised by libraries (e.g. numerical or convergence warnings).
import warnings
warnings.filterwarnings("ignore")

# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import ast


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer #one hot encoding
from sklearn.multiclass import OneVsRestClassifier

from sklearn.svm import LinearSVC

In [10]:
# Load the Stack Overflow questions/tags CSV; its first column becomes the index.
stackoverflow_csv_url = (
    "https://raw.githubusercontent.com/laxmimerit/"
    "All-CSV-ML-Data-Files-Download/master/stackoverflow.csv"
)
data = pd.read_csv(stackoverflow_csv_url, index_col=0)

In [11]:
# Preview the first rows of the loaded dataset (question text and its tags).
data.head()

Unnamed: 0,Text,Tags
2,aspnet site maps has anyone got experience cre...,"['sql', 'asp.net']"
4,adding scripting functionality to net applicat...,"['c#', '.net']"
5,should i use nested classes in this case i am ...,['c++']
6,homegrown consumption of web services i have b...,['.net']
8,automatically update version number i would li...,['c#']


In [14]:
# The tags of a row are stored as the string repr of a Python list.
# Show the raw string first, then the list obtained by parsing it.
first_tags_raw = data.iloc[0, 1]
first_tags_raw
ast.literal_eval(first_tags_raw)

"['sql', 'asp.net']"

['sql', 'asp.net']

In [16]:
# Convert each stringified tag list (e.g. "['sql', 'asp.net']") into a real Python list.
# Only str values are parsed: once the column holds lists, ast.literal_eval would raise
# ValueError on them, so this guard makes the cell safe to re-run.
data["Tags"] = data["Tags"].apply(
    lambda tags: ast.literal_eval(tags) if isinstance(tags, str) else tags
)
data.iloc[0, 1]

['sql', 'asp.net']

In [17]:
# Multi-hot encode the tag lists into a binary indicator matrix (one column per tag).
mb = MultiLabelBinarizer()
y = mb.fit_transform(data["Tags"])
cls = mb.classes_  # tag names learned by the binarizer

# Display the indicator matrix, then the tag vocabulary.
y
cls

array([[0, 0, 1, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

array(['.net', 'android', 'asp.net', 'c', 'c#', 'c++', 'css', 'html',
       'ios', 'iphone', 'java', 'javascript', 'jquery', 'mysql',
       'objective-c', 'php', 'python', 'ruby', 'ruby-on-rails', 'sql'],
      dtype=object)

In [19]:
# The text column is already cleaned with basic preprocessing, so it is vectorised as-is.
# Features: word-level 1- to 3-grams, English stop words removed, at most 1000 features.
tfidf = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1, 3),
    stop_words="english",
    max_features=1000,
)
X = tfidf.fit_transform(data["Text"])

# To inspect the learned vocabulary: tfidf.vocabulary_

In [20]:
# Sanity check: the feature matrix and the label matrix should have the same number of rows.
X.shape
y.shape

(48976, 1000)

(48976, 20)

In [21]:
# Train/test split: hold out 20% of the samples for evaluation.
# The fixed random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Shapes of the four resulting pieces.
X_train.shape
X_test.shape
y_train.shape
y_test.shape

(39180, 1000)

(9796, 1000)

(39180, 20)

(9796, 20)

In [22]:
#Jaccard score = |A intersection B| / |A union B|, a metric for multilabel classification
def j_score(y_test, y_pred, zero_division=0.0):
    """Return the mean per-sample Jaccard score as a percentage.

    Parameters
    ----------
    y_test : array-like, 2-D
        Ground-truth binary indicator matrix (one row per sample).
    y_pred : array-like, 2-D
        Predicted binary indicator matrix with the same shape as ``y_test``.
    zero_division : float, default=0.0
        Score assigned to a sample whose true and predicted label sets are
        both empty (union of size 0). Without this guard such a row would
        evaluate to 0/0 = NaN and turn the whole mean into NaN. The default
        of 0.0 follows the behaviour of scikit-learn's ``jaccard_score``;
        pass 1.0 to count "both empty" as a perfect match.

    Returns
    -------
    float
        Mean of the row-wise Jaccard scores, multiplied by 100.
    """
    y_true = np.asarray(y_test)
    y_hat = np.asarray(y_pred)

    # For 0/1 indicators, element-wise min is the intersection and max is the union.
    intersection = np.minimum(y_true, y_hat).sum(axis=1)
    union = np.maximum(y_true, y_hat).sum(axis=1)

    # Pre-fill with the fallback value and divide only where the union is non-empty.
    js = np.full(union.shape, zero_division, dtype=float)
    np.divide(intersection, union, out=js, where=union != 0)
    return js.mean() * 100


In [26]:
# Model training - logistic regression baseline.
# OneVsRestClassifier wraps the base estimator so it can be fitted on the
# multi-label indicator matrix.
lr = LogisticRegression(solver="lbfgs")
clf = OneVsRestClassifier(estimator=lr)
clf.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression())

In [28]:
# Predict the tag indicator matrix for the held-out test set.
y_pred = clf.predict(X_test)

In [29]:
# Mean per-sample Jaccard score (in percent) of the predictions on the test set.
j_score(y_test,y_pred)

49.12668436096366

In [32]:
# Model training with a linear SVM (L1 penalty, C=1.5, dual=False).
# NOTE: this rebinds `clf` and `y_pred`, replacing the logistic-regression model
# and its predictions; the cells below therefore use the SVM model.
svm = LinearSVC(
    C=1.5,
    penalty="l1",
    dual=False,
)
clf = OneVsRestClassifier(estimator=svm)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
j_score(y_test, y_pred)

OneVsRestClassifier(estimator=LinearSVC(C=1.5, dual=False, penalty='l1'))

53.3787600381108

In [48]:
# Manual smoke test on a single hand-written question.
# The text is passed through the already-fitted vectoriser (transform, not
# fit_transform) before being given to the classifier.
inp = [
    "python pip installation missing can I use C# or java for the same?",
]
inp_tfidf = tfidf.transform(inp)
clf.predict(inp_tfidf)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [49]:
# Map the predicted indicator row back to tag names.
inp_pred = clf.predict(inp_tfidf)
mb.inverse_transform(inp_pred)

[('java', 'python')]

In [52]:
#Model Save

import pickle

# Persist the fitted classifier and the fitted TF-IDF vectoriser.
# The `with` blocks close each file handle as soon as it has been written,
# instead of leaving the handles returned by open() dangling.
# NOTE(review): `mb` (the MultiLabelBinarizer used to decode predictions into tag
# names) is not saved here; consider persisting it too if it is needed after reload.
with open("model.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

with open("tfidf.pkl", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)