In [4]:
#@title Importing Dependencies
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score , f1_score , hamming_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split


from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import model_selection
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix

import logging
from scipy.sparse import hstack


In [5]:
np.random.seed(seed=11)

In [6]:
df = pd.read_csv('/content/drive/MyDrive/GP storage/NLP_project/Dataset/New Processed Data.csv')

In [None]:
df.dropna(inplace = True , axis = 0)
df = df[df['Score'] >= 5]
df

Unnamed: 0,Score,Title,Body,Tags
0,21,asp.net site map,anyone get experience create sql-based asp.net...,"['sql', 'asp.net']"
1,53,function create color wheel,something pseudo-solved many time never quite ...,['algorithm']
2,49,add script functionality .net applications,little game write c use database back-end trad...,"['c#', '.net']"
3,29,use nest class case,work collection class use video playback recor...,"['c++', 'class']"
4,13,homegrown consumption web service,write web service .net app ready consume see n...,"['.net', 'web-services']"
...,...,...,...,...
1056440,7,use dict subset dataframe,say give dataframe columns categorical data da...,['python']
1056559,7,way use itertools python clean nest iterations,let say follow code 123 b 246 c 357 j b k c pr...,['python']
1056711,5,cmfcmenubutton properly repaint toggle high co...,c++ mfc project use cmfcmenubutton use msvc 20...,['c++']
1056866,6,result data return void get break,work project huge legacy code base try re-desi...,['c++']


In [None]:
X1 = df['Body']
X2 = df['Title']
y = df['Tags']

In [None]:
X = pd.DataFrame()
X['X1'] = df['Body']
X['X2'] = df['Title'] 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 

In [None]:
multilabel_binarizer = MultiLabelBinarizer()
y_train = multilabel_binarizer.fit_transform(y_train)
y_test = multilabel_binarizer.transform(y_test)

In [None]:
vectorizer_X1 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

vectorizer_X2 = TfidfVectorizer(analyzer = 'word',
                                       min_df=0.0,
                                       max_df = 1.0,
                                       strip_accents = None,
                                       encoding = 'utf-8', 
                                       preprocessor=None,
                                       token_pattern=r"(?u)\S\S+",
                                       max_features=1000)

In [None]:
X1_tfidf_train = vectorizer_X1.fit_transform(X_train['X1'])
X2_tfidf_train = vectorizer_X2.fit_transform(X_train['X2'])

X1_tfidf_test = vectorizer_X1.transform(X_test['X1'])
X2_tfidf_test = vectorizer_X2.transform(X_test['X2'])

In [None]:
X_tfidf_train = hstack([X1_tfidf_train,X2_tfidf_train])
X_tfidf_test = hstack([X1_tfidf_test,X2_tfidf_test])

In [None]:
#@title Printing Scores Function
def print_score(y_pred, clf , mode):
    print("Clf: ", clf.__class__.__name__)
    if mode =='train':
       print("training Hamming loss: {}".format(hamming_loss(y_pred, y_train)*100))
       print("training F1 Score micro: {}".format(f1_score(y_train,y_pred,average = 'micro')))
       print("training F1 Score weighted: {}".format(f1_score(y_train,y_pred,average = 'weighted')))
    else:
       print("test Hamming loss: {}".format(hamming_loss(y_pred, y_test)*100))
       print("test F1 Score micro: {}".format(f1_score(y_test,y_pred,average = 'micro')))
       print("test F1 Score weighted: {}".format(f1_score(y_test,y_pred,average = 'weighted')))

    print("---")    

In [None]:
#@title Trying DummyClassifier , SGDClassifier , LogisticRegression , MultinomialNB , LinearSVC , Perceptron , PassiveAggressiveClassifier
dummy = DummyClassifier() 
sgd = SGDClassifier()
lr = LogisticRegression()
mn = MultinomialNB()
svc = LinearSVC()
perceptron = Perceptron()
pac = PassiveAggressiveClassifier()

for classifier in [dummy, sgd, lr, mn, svc, perceptron, pac]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(X_tfidf_train, y_train)
    y_pr = clf.predict(X_tfidf_train)
    print_score(y_pr, classifier , 'train')
    y_pred = clf.predict(X_tfidf_test)
    print_score(y_pred, classifier , 'test')

In [None]:
#@title MLP Classifier
mlpc = MLPClassifier()
mlpc.fit(X_tfidf_train, y_train)
y_pred = mlpc.predict(X_tfidf_test)
print_score(y_pred, mlpc)

In [None]:
#@title Random Forest Classifier
rfc = RandomForestClassifier()
rfc.fit(X_tfidf_train, y_train)
y_pred = rfc.predict(X_tfidf_test)
print_score(y_pred, rfc , 'test')

In [None]:
#@title Function to Calculate Scorer result
def Calc_F1_Score(y_true,y_pred):
  return f1_score(y_true,y_pred,average = 'micro')



In [None]:
#@title Getting the Best classifier choosen , By GridsearchCV
param_grid = {'estimator__C':[1,10,100,1000]}
svc = OneVsRestClassifier(LinearSVC())
CV_svc = model_selection.GridSearchCV(estimator=svc, param_grid=param_grid, cv= 5, verbose=10, scoring=make_scorer(Calc_F1_Score , greater_is_better=True))
CV_svc.fit(X_tfidf_train, y_train)


In [None]:
#@title Fiting the best model
CV_svc.best_params_
best_model = CV_svc.best_estimator_
y_pred = best_model.fit(X_train , y_train)
print_score(y_pred, best_model , 'test')

ValueError: ignored

In [None]:
#@title The Confusion Matrix
for i in range(y_train.shape[1]):
    print(multilabel_binarizer.classes_[i])
    print(confusion_matrix(y_test[:,i], y_pred[:,i]))
    print("")

In [None]:
#@title Extracting Best Features
def print_top10(feature_names, clf, class_labels):
    """Prints features with the highest coefficient values, per class"""
    for i, class_label in enumerate(class_labels):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print("--------------------------------------------")
        print("%s: %s" % (class_label,
              " ".join(feature_names[j] for j in top10)))
        print("--------------------------------------------")

feature_names = vectorizer_X1.get_feature_names() + vectorizer_X2.get_feature_names()
print_top10(feature_names, best_model, multilabel_binarizer.classes_)