In [1]:
import sklearn
import pandas as pd
import numpy as np

In [14]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, auc, roc_curve, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

#### Reading the file saved after preprocessing and cleaning

In [3]:
# Load the cleaned dataset and discard the "Unnamed: 0" column that the CSV carries.
data = (
    pd.read_csv('data/4_final_dataset.csv')
    .drop(columns=["Unnamed: 0"])
)

In [4]:
# Preview the first rows of the loaded dataset.
data.head()

Unnamed: 0,flair,title,score,id,url,comms_num,created,body,author,comments,authors,feature_combine
0,Politics -- Source in comments,mumbai students protesting caa nrc wankhede st...,1391,eok4qb,https://i.redd.it/y4jcbkiedqa41.jpg,116,1970-01-01 00:00:01.579030566,,Gavthi_Batman,im extremely biased towards ktm duke 200 390 i...,I'm extremely biased towards the KTM Duke (bo...,mumbai students protesting caa nrc wankhede st...
1,Politics -- Source in comments,amit shah looking source comment,16,fbx2it,https://i.redd.it/6heuj8xxf3k41.png,4,1970-01-01 00:00:01.583111542,,sickcooler,im extremely biased towards ktm duke 200 390 i...,I'm extremely biased towards the KTM Duke (bo...,amit shah looking source comment im extremely ...
2,Politics -- Source in comments,real loser indias errupting islamaphobia caste...,81,g76o5f,https://www.reddit.com/r/india/comments/g76o5f...,53,1970-01-01 00:00:01.587756081,tldr unqualified opinion dalit political movem...,HairLikeWinterFire,im extremely biased towards ktm duke 200 390 i...,I'm extremely biased towards the KTM Duke (bo...,real loser indias errupting islamaphobia caste...
3,Politics -- Source in comments,annual reminder indias ayush minister promised...,398,fu1ly8,https://www.reddit.com/r/india/comments/fu1ly8...,43,1970-01-01 00:00:01.585916565,spoiler dont please note said prevention cure ...,madamplease,im extremely biased towards ktm duke 200 390 i...,I'm extremely biased towards the KTM Duke (bo...,annual reminder indias ayush minister promised...
4,Politics -- Source in comments,mp covid19 megathread,21,fpt2jw,https://www.reddit.com/r/india/comments/fpt2jw...,19,1970-01-01 00:00:01.585324794,thread sharing coronavirus news updates relate...,maardon_bhenji,im extremely biased towards ktm duke 200 390 i...,I'm extremely biased towards the KTM Duke (bo...,mp covid19 megathread im extremely biased towa...


### Splitting the data into training and testing samples

In [6]:
# Target is the post's flair; the model input is the combined text column.
labels = data.flair
features = data.feature_combine
# Hold out 20% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42, test_size=0.2)

# "balanced" weights computed from the training labels.
# `classes` and `y` are keyword-only in current scikit-learn (positional use was
# deprecated in 0.24 and raises a TypeError from 1.0), so pass them by keyword.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(np.ravel(y_train.values)),
                                                  y=np.ravel(y_train.values))

In [7]:
# Sorted array of the distinct flair labels that occur in the training split.
flairs = np.unique(y_train.values.ravel())

#### Utility function: convert one-hot encoded label rows back to class indices

In [8]:
def one_vector(label):
    """Convert one-hot encoded rows into a vector of class indices.

    Parameters
    ----------
    label : array-like, indexed by row
        Each row ``label[i]`` must contain exactly one entry equal to 1.

    Returns
    -------
    numpy.ndarray
        Float array of length ``label.shape[0]`` where element ``i`` is the
        position of the 1 in row ``i``.

    Raises
    ------
    ValueError
        If a row does not contain exactly one entry equal to 1.
    """
    # Accept plain lists as well as arrays.
    label = np.asarray(label)
    temp = np.zeros(label.shape[0])
    for i in range(label.shape[0]):
        hot = np.flatnonzero(label[i] == 1)
        if hot.size != 1:
            raise ValueError(
                "row {} is not one-hot: found {} entries equal to 1".format(i, hot.size)
            )
        # Store the scalar index; assigning a size-1 array to a single element
        # is deprecated in recent NumPy releases.
        temp[i] = hot[0]
    return temp

#### Function to compare multiple algorithms and find which one performs best. It takes a list of algorithms, prints the cross-validation scores and the classification report for each of them, and returns the mean cross-validation score per algorithm. Takes quite a while as GradientBoosting does not support multithreading yet.

In [9]:
def test_algorithms(algorithms, X_train, y_train, X_test, y_test):
    results = {}
    for algo in algorithms:
        algorithm = Pipeline([('vect', CountVectorizer()),
                             ('tfodf', TfidfTransformer()),
                             ('clf', algo)], verbose=True)
        print(algorithm)
        algorithm.fit(X_train, y_train)
        cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
        print('cv_scores:',cv_scores)
        print('cv_scores mean:{}'.format(np.mean(cv_scores)))
        results[algorithm]= np.mean(cv_scores)
        y_pred = algorithm.predict(X_test)
        print(y_pred.shape)
        print(classification_report(y_test, y_pred))
    return results, algo

#### Defining different algorithms to try out

In [11]:
# Candidate classifiers. Every estimator that accepts a seed gets random_state=42
# so repeated runs of the notebook are comparable.
dtc = DecisionTreeClassifier(random_state=42)
clf = SVC(C=0.9, kernel='rbf')
# Linear SVM trained with SGD; tol=None makes it run exactly max_iter epochs.
sgd = SGDClassifier(loss='hinge',
                   penalty = 'l2',
                   alpha = 1e-5,
                   max_iter=5, tol = None,
                   random_state=42)
rfc = RandomForestClassifier(n_estimators=1000,
                            random_state=42)

# XGBClassifier has no `verbose` constructor parameter; the documented knob is
# `verbosity` (0 = silent ... 3 = debug). 2 requests info-level messages.
xgb = xgboost.XGBClassifier(objective='multi:softmax',
                            n_estimators=1000,
                           random_state=42,
                           learning_rate=0.001,
                           n_jobs= 6,
                           verbosity=2)
gbc = GradientBoostingClassifier(n_estimators=1000,
                                learning_rate=0.005,
                                random_state=42)

In [None]:
# Estimators to benchmark in this run (`xgb` is not part of this list).
algos = [clf, dtc, rfc, sgd, gbc]

res, mod = test_algorithms(
    algorithms=algos,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

#### Fine-tuning and training the best-performing algorithm from the previous cell

In [141]:
# Bag-of-words counts -> TF-IDF weighting -> XGBoost classifier.
algorithm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfodf', TfidfTransformer()),
        ('clf', xgb),
    ],
    verbose=True,
)
print(algorithm)

# Fit on the full training split, then report 5-fold CV scores on that same split.
algorithm.fit(X_train, y_train)
cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
print('cv_scores:', cv_scores)
print('cv_scores mean:{}'.format(cv_scores.mean()))

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                               learning_rate=0.001, max_delta_step=None,
                               max_depth=None, min_child_weight=None,
                               missing=nan, monotone_constraints=None,
                               n_estimators=1000, n_jobs=6,
                               num_parallel_tree=None,
       

In [12]:
# Bag-of-words counts -> TF-IDF weighting -> gradient boosting classifier.
algorithm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfodf', TfidfTransformer()),
        ('clf', gbc),
    ],
)
print(algorithm)

# Fit on the full training split, then report 5-fold CV scores on that same split.
algorithm.fit(X_train, y_train)
cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
print('cv_scores:', cv_scores)
print('cv_scores mean:{}'.format(cv_scores.mean()))

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                            loss='deviance', max_depth=3,
                                            max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=

#### Saving the model

In [17]:
import joblib

# Write the current `algorithm` pipeline to disk.
joblib.dump(value=algorithm, filename="gb_model.sav")

['gb_model.sav']

### Model Evaluation

In [13]:
# Predict the held-out test split with the current pipeline and print
# per-class precision / recall / F1.
y_pred = algorithm.predict(X_test)
print(y_pred.shape)
print(classification_report(y_true=y_test, y_pred=y_pred))

(400,)
                                precision    recall  f1-score   support

                      AskIndia       0.94      0.71      0.81        24
              Business/Finance       0.60      0.43      0.50        21
                       CAA-NRC       0.91      0.84      0.87        25
                   CAA-NRC-NPR       0.67      0.62      0.65        16
                   Coronavirus       0.84      0.89      0.86        18
                Demonetization       0.90      0.72      0.80        25
                          Food       0.78      0.90      0.84        20
                 Non-Political       0.89      0.94      0.92        18
                           Old       0.82      0.78      0.80        18
                   Photography       0.94      0.94      0.94        18
              Policy & Economy       0.11      0.19      0.14        21
                Policy/Economy       0.30      0.21      0.25        28
 Policy/Economy -2017 Article        0.26      0.28     

In [16]:
# Rows are true flairs, columns are predicted flairs.
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

[[17  0  0  0  0  0  0  1  0  1  1  0  1  1  0  1  0  0  0  1]
 [ 1  9  0  0  0  0  1  0  1  0  5  0  1  0  0  1  1  0  1  0]
 [ 0  0 21  3  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  2 10  0  0  0  0  0  0  0  0  0  1  3  0  0  0  0  0]
 [ 0  0  0  0 16  0  1  0  0  0  0  0  1  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 18  1  0  2  0  0  0  0  2  1  0  0  0  1  0]
 [ 0  1  0  0  0  0 18  0  0  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  2  1  0  0 14  0  0  0  0  1  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0 17  0  0  0  0  0  0  0  1  0  0]
 [ 0  0  0  1  0  0  0  0  0  0  4 11  5  0  0  0  0  0  0  0]
 [ 0  1  0  0  0  1  0  0  0  0 17  6  1  1  0  0  0  0  1  0]
 [ 0  2  0  1  0  0  1  0  0  0  5  2  5  1  0  0  1  0  0  0]
 [ 0  0  0  0  0  0  1  1  0  0  0  0  0 20  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  1  0  3  1 11  0  0  0  0  2]
 [ 0  0  0  0  0  0  0  0  0  0  0  1  0  3  0 13  0  0

### NOTE:
The model does not differentiate well between sets of similar flairs, such as:
Policy & Economy, Policy/Economy, and Policy/Economy -2017 Article; and CAA-NRC and CAA-NRC-NPR.

While this could be solved by merging the classes under consideration, doing so would technically change the expected classes during testing. This is not implemented yet.