<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preparing-datasets" data-toc-modified-id="Preparing-datasets-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preparing datasets</a></span></li><li><span><a href="#Binary-Relevance" data-toc-modified-id="Binary-Relevance-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Binary Relevance</a></span><ul class="toc-item"><li><span><a href="#TF-IDF-+-Naive-Bayes" data-toc-modified-id="TF-IDF-+-Naive-Bayes-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>TF-IDF + Naive Bayes</a></span><ul class="toc-item"><li><span><a href="#GridSearchCV" data-toc-modified-id="GridSearchCV-2.1.1"><span class="toc-item-num">2.1.1&nbsp;&nbsp;</span>GridSearchCV</a></span></li></ul></li><li><span><a href="#TF-IDF-+-SVC" data-toc-modified-id="TF-IDF-+-SVC-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>TF-IDF + SVC</a></span></li><li><span><a href="#TF-IDF-+-Logistic-Regression" data-toc-modified-id="TF-IDF-+-Logistic-Regression-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>TF-IDF + Logistic Regression</a></span></li><li><span><a href="#CountVectorizer-+-SVC" data-toc-modified-id="CountVectorizer-+-SVC-2.4"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>CountVectorizer + SVC</a></span></li></ul></li></ul></div>

In [2]:
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
import missingno as msno
import re
import os.path
import math
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from scipy import sparse
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from skmultilearn.problem_transform import LabelPowerset
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


import nltk
from wordcloud import WordCloud
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import unicodedata

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer


#plt.style.use('fivethirtyeight')
sns.set_style("whitegrid")
sns.set_context("talk", font_scale=0.8)

from helper_functions import *
overall_f1_score_v1_cv = make_scorer(overall_f1_score_v1, greater_is_better=True)

## Preparing datasets 

In [3]:
mydata_train = pd.read_csv('./Data/Prepossed/Prepossed_data_train_shuffled.csv')
mydata_test = pd.read_csv('./Data/Prepossed/Prepossed_data_test_Shuffled.csv')
mydata =  pd.read_csv('./Data/Prepossed/Prepossed_data.csv')
mydata = mydata.drop(['Unnamed: 0', 'Unnamed: 0.1'],axis = 1)
mydata_train = mydata_train.drop(['Unnamed: 0.1'], axis = 1)
mydata_test = mydata_test.drop(['Unnamed: 0.1'], axis = 1)

train_X, train_y = mydata_train['Plot'], mydata_train.drop(['BID', 'Plot'], axis=1)
test_X, test_y = mydata_test['Plot'], mydata_test.drop(['BID', 'Plot'], axis=1)

category_columns = train_y.columns

## Binary Relevance
Binary Classfication에서 쓰이는 One vs Rest방법 사용  
Grid Search를 통하여 best hyperparameter를 얻고  
Threshold를 0.5 와 빈도수 확률로 지정하여 결과 비교

In [4]:
prob_thresh = get_prob_thresh(mydata[category_columns], thresh_sel=2)


### TF-IDF + Naive Bayes

#### GridSearchCV

In [5]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)))
            ])
# sorted(pipeline.get_params().keys()) # -- to obtain the GridSearchCV parameter names
parameters = {
                'tfidf__max_df': (0.25, 0.5, 0.75),
                'tfidf__ngram_range': [(1, 2)],
                'tfidf__min_df': [1, 3, 5],
                'clf__estimator__alpha': (1e-2, 1e-3)
            }

grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, n_jobs=3, verbose=10)
grid_search_cv.fit(train_X, train_y)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
prob_thresh = get_prob_thresh(test_y, thresh_sel = 2, thresh_offset= 0)

y_pred = pd.DataFrame(columns=category_columns)
prob = best_clf.predict_proba(test_X)

for idx, col in enumerate(category_columns):
        y_pred[col] = prob[:,idx]>prob_thresh[idx]

#prob, predictions = multi_label_predict(best_clf, test_X, prob_thresh) 
accuracy(test_y, y_pred).drop(['Support'], axis = 1)

Fitting 2 folds for each of 18 candidates, totalling 36 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    4.3s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    5.3s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    8.0s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    9.9s
[Parallel(n_jobs=3)]: Done  36 out of  36 | elapsed:   12.6s finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=5, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None))]

Applying best classifier on test data:


Unnamed: 0,Precision,Recall,F1-Score,Accuracy
math,0.49,0.48,0.49,0.73
dp,0.43,0.38,0.4,0.74
implementation,0.48,0.51,0.49,0.66
greedy,0.44,0.42,0.43,0.71
data structures,0.37,0.37,0.37,0.77
sortings,0.26,0.25,0.25,0.84
dfs,0.47,0.54,0.5,0.87
graphs,0.53,0.65,0.58,0.89
trees,0.57,0.72,0.64,0.92
constructive algorithms,0.27,0.24,0.26,0.79


**Using the best parameters obtained from gridsearch using 0.5 Probability threshold**


In [6]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.25, min_df = 5 ,ngram_range=(1, 2),  norm='l2')),
                ('clf', OneVsRestClassifier(MultinomialNB(alpha=0.01, fit_prior=True, class_prior=None)))
            ])
pipeline.fit(train_X, train_y)
predictions = pipeline.predict(test_X)
accuracy(test_y, y_pred).drop(['Support'], axis = 1)

Unnamed: 0,Precision,Recall,F1-Score,Accuracy
math,0.49,0.48,0.49,0.73
dp,0.43,0.38,0.4,0.74
implementation,0.48,0.51,0.49,0.66
greedy,0.44,0.42,0.43,0.71
data structures,0.37,0.37,0.37,0.77
sortings,0.26,0.25,0.25,0.84
dfs,0.47,0.54,0.5,0.87
graphs,0.53,0.65,0.58,0.89
trees,0.57,0.72,0.64,0.92
constructive algorithms,0.27,0.24,0.26,0.79


### TF-IDF + SVC

In [7]:
pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
parameters = {
            'tfidf__max_df': (0.25, 0.5),
            'tfidf__min_df': (1, 3),
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            "clf__estimator__C": [1, 10],
            "clf__estimator__class_weight": ['balanced'],
            }

grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, n_jobs=4, verbose=10)
grid_search_cv.fit(train_X, train_y)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = best_clf.predict(test_X) 
accuracy(test_y, y_pred).drop(['Support'], axis = 1)

Fitting 2 folds for each of 16 candidates, totalling 32 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    3.2s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:    5.6s
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    8.9s
[Parallel(n_jobs=4)]: Done  29 out of  32 | elapsed:   12.7s remaining:    1.3s
[Parallel(n_jobs=4)]: Done  32 out of  32 | elapsed:   13.1s finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1, class_weight='balanced', dual=True,
                                        fit_intercept=True, intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0),
                    n_jobs=

Unnamed: 0,Precision,Recall,F1-Score,Accuracy
math,0.49,0.48,0.49,0.73
dp,0.43,0.38,0.4,0.74
implementation,0.48,0.51,0.49,0.66
greedy,0.44,0.42,0.43,0.71
data structures,0.37,0.37,0.37,0.77
sortings,0.26,0.25,0.25,0.84
dfs,0.47,0.54,0.5,0.87
graphs,0.53,0.65,0.58,0.89
trees,0.57,0.72,0.64,0.92
constructive algorithms,0.27,0.24,0.26,0.79


**Trying out individual hyperparameters**

In [8]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 2))),
                ('clf', OneVsRestClassifier(LinearSVC(C=1, class_weight='balanced'), n_jobs=3))
            ])
pipeline.fit(train_X, train_y)

predictions = pipeline.predict(test_X) 
accuracy(test_y, y_pred).drop(['Support'], axis = 1)

Unnamed: 0,Precision,Recall,F1-Score,Accuracy
math,0.49,0.48,0.49,0.73
dp,0.43,0.38,0.4,0.74
implementation,0.48,0.51,0.49,0.66
greedy,0.44,0.42,0.43,0.71
data structures,0.37,0.37,0.37,0.77
sortings,0.26,0.25,0.25,0.84
dfs,0.47,0.54,0.5,0.87
graphs,0.53,0.65,0.58,0.89
trees,0.57,0.72,0.64,0.92
constructive algorithms,0.27,0.24,0.26,0.79


### TF-IDF + Logistic Regression

In [9]:
pipeline = Pipeline([
            ('tfidf', TfidfVectorizer()),
            ('clf', OneVsRestClassifier(LogisticRegression(), n_jobs=1)),
            ])
parameters = {
            'tfidf__max_df': [0.25, 0.5, 0.75],
            'tfidf__min_df': [1, 2],
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            "clf__estimator__C": [0.1, 1],
            }

grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, n_jobs=3, verbose=10)
grid_search_cv.fit(train_X, train_y)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_

y_pred = pd.DataFrame(columns=category_columns)
prob = best_clf.predict_proba(test_X)

for idx, col in enumerate(category_columns):
        y_pred[col] = prob[:,idx]>prob_thresh[idx]
        
accuracy(test_y, y_pred).drop(['Support'], axis = 1)

Fitting 2 folds for each of 24 candidates, totalling 48 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:    1.4s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:    3.2s
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:    4.4s
[Parallel(n_jobs=3)]: Done  19 tasks      | elapsed:    5.8s
[Parallel(n_jobs=3)]: Done  26 tasks      | elapsed:    7.5s
[Parallel(n_jobs=3)]: Done  35 tasks      | elapsed:   10.2s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   14.4s remaining:    0.0s
[Parallel(n_jobs=3)]: Done  48 out of  48 | elapsed:   14.4s finished



Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.25, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
              

Unnamed: 0,Precision,Recall,F1-Score,Accuracy
math,0.44,0.6,0.51,0.69
dp,0.4,0.59,0.48,0.7
implementation,0.47,0.58,0.52,0.64
greedy,0.44,0.61,0.51,0.7
data structures,0.3,0.56,0.39,0.69
sortings,0.21,0.6,0.32,0.72
dfs,0.46,0.56,0.51,0.87
graphs,0.46,0.65,0.54,0.87
trees,0.58,0.75,0.66,0.92
constructive algorithms,0.26,0.45,0.33,0.72


math                       0
dp                         0
implementation             0
greedy                     0
data structures            1
sortings                   0
dfs                        0
graphs                     0
trees                      0
constructive algorithms    1
brute force                0
strings                    0
binary search              1
number theory              0
geometry                   0
Name: 93, dtype: int64

**A Single Hyperparameter**


In [10]:
pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(max_df=0.5, min_df=2, ngram_range=(1, 1))),
                ('clf', OneVsRestClassifier(LogisticRegression(C=100)))
            ])
pipeline.fit(train_X, train_y)

y_pred = pd.DataFrame(columns=category_columns)
prob = best_clf.predict_proba(test_X)

for idx, col in enumerate(category_columns):
        y_pred[col] = prob[:,idx]>prob_thresh[idx]
        
accuracy(test_y, y_pred)

Unnamed: 0,Precision,Recall,F1-Score,Accuracy,Support
math,0.44,0.6,0.51,0.69,228.0
dp,0.4,0.59,0.48,0.7,197.0
implementation,0.47,0.58,0.52,0.64,284.0
greedy,0.44,0.61,0.51,0.7,221.0
data structures,0.3,0.56,0.39,0.69,155.0
sortings,0.21,0.6,0.32,0.72,92.0
dfs,0.46,0.56,0.51,0.87,102.0
graphs,0.46,0.65,0.54,0.87,101.0
trees,0.58,0.75,0.66,0.92,89.0
constructive algorithms,0.26,0.45,0.33,0.72,130.0


### CountVectorizer + SVC

In [11]:
pipeline = Pipeline([
            ('cvec', CountVectorizer()),
            ('clf', OneVsRestClassifier(LinearSVC())),
            ])
parameters = {
            'cvec__max_df': (0.25, 0.5),
            'cvec__min_df': (1, 2),
            'cvec__ngram_range': [(1, 1), (1, 2)],
            "clf__estimator__C": [1, 10, 100],
            "clf__estimator__class_weight": ['balanced'],
            }

grid_search_cv = GridSearchCV(pipeline, parameters, cv=2, verbose=10, scoring=overall_f1_score_v1_cv)
grid_search_cv.fit(train_X, train_y)

print()
print("Best parameters set:")
print (grid_search_cv.best_estimator_.steps)
print()

# measuring performance on test set
print ("Applying best classifier on test data:")
best_clf = grid_search_cv.best_estimator_
predictions = best_clf.predict(test_X) 
accuracy(test_y, predictions)

Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.337, total=   1.3s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.3s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.336, total=   1.3s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 2), score=0.339, total=   4.3s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    6.9s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=1, cvec__ngram_range=(1, 2), score=0.327, total=   4.5s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   11.3s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 1), score=0.335, total=   1.1s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   12.4s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 1), score=0.340, total=   1.1s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   13.5s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 2), score=0.351, total=   2.2s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 2) 


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:   15.7s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 2), score=0.341, total=   2.3s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:   18.0s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.341, total=   1.2s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1) 


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:   19.2s remaining:    0.0s


[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.339, total=   1.2s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2) 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2), score=0.347, total=   3.7s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2) 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2), score=0.327, total=   4.2s
[CV] clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=2, cvec__ngram_range=(1, 1) 
[CV]  clf__estimator__C=1, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=2, cvec__ngram_range=(1, 1), score=0.334, total=   1.2s
[CV] clf__est

[CV]  clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.25, cvec__min_df=2, cvec__ngram_range=(1, 2), score=0.338, total=   1.6s
[CV] clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1) 
[CV]  clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.338, total=   1.2s
[CV] clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1) 
[CV]  clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 1), score=0.335, total=   1.1s
[CV] clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2) 
[CV]  clf__estimator__C=100, clf__estimator__class_weight=balanced, cvec__max_df=0.5, cvec__min_df=1, cvec__ngram_range=(1, 2), score=0.345, total=   3.3

[Parallel(n_jobs=1)]: Done  48 out of  48 | elapsed:  1.6min finished



Best parameters set:
[('cvec', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.5, max_features=None, min_df=2,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LinearSVC(C=10, class_weight='balanced',
                                        dual=True, fit_intercept=True,
                                        intercept_scaling=1,
                                        loss='squared_hinge', max_iter=1000,
                                        multi_class='ovr', penalty='l2',
                                        random_state=None, tol=0.0001,
                                        verbose=0),
                    n_jobs=None))]

Applying best classifier on tes

Unnamed: 0,Precision,Recall,F1-Score,Accuracy,Support
math,0.47,0.43,0.45,0.72,228.0
dp,0.41,0.32,0.36,0.74,197.0
implementation,0.48,0.39,0.43,0.66,284.0
greedy,0.42,0.36,0.39,0.71,221.0
data structures,0.44,0.31,0.36,0.8,155.0
sortings,0.34,0.22,0.26,0.87,92.0
dfs,0.44,0.26,0.33,0.87,102.0
graphs,0.59,0.45,0.51,0.9,101.0
trees,0.76,0.51,0.61,0.93,89.0
constructive algorithms,0.3,0.21,0.25,0.81,130.0


**Single Hyperparameter Tuning**

In [12]:
pipeline = Pipeline([
                ('cvec', CountVectorizer(max_df=0.5, min_df=1, ngram_range=(1, 1))),
                ('clf', OneVsRestClassifier(LinearSVC(C=1, class_weight='balanced')))
            ])
pipeline.fit(train_X, train_y)

predictions = pipeline.predict(test_X) 
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Accuracy,Support
math,0.44,0.48,0.46,0.7,228.0
dp,0.35,0.42,0.38,0.69,197.0
implementation,0.43,0.45,0.44,0.62,284.0
greedy,0.4,0.46,0.43,0.68,221.0
data structures,0.42,0.41,0.42,0.79,155.0
sortings,0.26,0.27,0.26,0.84,92.0
dfs,0.35,0.29,0.32,0.85,102.0
graphs,0.42,0.47,0.44,0.86,101.0
trees,0.57,0.52,0.54,0.91,89.0
constructive algorithms,0.28,0.28,0.28,0.78,130.0


In [13]:
pipeline = Pipeline([
                ('cvec', CountVectorizer(max_df=0.5, min_df=1, ngram_range=(1, 2))),
                ('clf', OneVsRestClassifier(LinearSVC(C=1, class_weight='balanced')))
            ])
pipeline.fit(train_X, train_y)

predictions = pipeline.predict(test_X) 
accuracy(test_y, predictions)

Unnamed: 0,Precision,Recall,F1-Score,Accuracy,Support
math,0.51,0.41,0.45,0.74,228.0
dp,0.47,0.31,0.37,0.76,197.0
implementation,0.51,0.4,0.45,0.68,284.0
greedy,0.43,0.33,0.38,0.71,221.0
data structures,0.51,0.33,0.4,0.82,155.0
sortings,0.36,0.13,0.19,0.88,92.0
dfs,0.51,0.24,0.32,0.88,102.0
graphs,0.66,0.4,0.49,0.9,101.0
trees,0.8,0.51,0.62,0.94,89.0
constructive algorithms,0.32,0.16,0.21,0.82,130.0
