In [1]:
import sklearn
import pandas as pd
import numpy as np

In [60]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, auc, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [72]:
data = pd.read_csv('data/cleaned_data.csv').drop("Unnamed: 0", axis=1)

In [73]:
data.head()

Unnamed: 0,flair,title,score,id,url,comms_num,created,body,author,comments,authors,feature_combine
0,Politics -- Source in comments,mumbai students protesting caa nrc wankhede st...,1395,eok4qb,https://i.redd.it/y4jcbkiedqa41.jpg,116.0,1970-01-01 00:00:01.579030566,,Gavthi_Batman,found myfitnesspal pretty good indian food pac...,I found MyFitnessPal to be pretty good with I...,mumbai students protesting caa nrc wankhede st...
1,Politics -- Source in comments,amit shah looking source comment,18,fbx2it,https://i.redd.it/6heuj8xxf3k41.png,4.0,1970-01-01 00:00:01.583111542,,sickcooler,found myfitnesspal pretty good indian food pac...,I found MyFitnessPal to be pretty good with I...,amit shah looking source comment found myfitne...
2,Politics -- Source in comments,annual reminder indias ayush minister promised...,395,fu1ly8,https://www.reddit.com/r/india/comments/fu1ly8...,43.0,1970-01-01 00:00:01.585916565,spoiler dont please note said prevention cure ...,madamplease,found myfitnesspal pretty good indian food pac...,I found MyFitnessPal to be pretty good with I...,annual reminder indias ayush minister promised...
3,Politics -- Source in comments,mp covid19 megathread,18,fpt2jw,https://www.reddit.com/r/india/comments/fpt2jw...,19.0,1970-01-01 00:00:01.585324794,thread sharing coronavirus news updates relate...,maardon_bhenji,found myfitnesspal pretty good indian food pac...,I found MyFitnessPal to be pretty good with I...,mp covid19 megathread found myfitnesspal prett...
4,Politics -- Source in comments,friends decades fallen fake news actively demo...,28,fvyodb,https://www.reddit.com/r/india/comments/fvyodb...,6.0,1970-01-01 00:00:01.586208234,govt india failed miserably protecting doctors...,in3po,found myfitnesspal pretty good indian food pac...,I found MyFitnessPal to be pretty good with I...,friends decades fallen fake news actively demo...


In [74]:
#data.fillna("",inplace = True)

In [75]:
labels = data.flair
features = data.feature_combine
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42, test_size=0.2)

class_weights = class_weight.compute_class_weight('balanced',
                                                  np.unique(np.ravel(y_train.values)),
                                                 np.ravel(y_train.values))

In [76]:
flairs = np.unique(np.ravel(y_train.values))

In [77]:
def one_vector(label):
    temp = np.zeros(label.shape[0])
    for i in range(label.shape[0]):
        temp[i] = np.where(label[i] == 1)[0]
    return temp

In [78]:
def test_algorithms(algorithms, X_train, y_train, X_test, y_test):
    results = {}
    for algo in algorithms:
        algorithm = Pipeline([('vect', CountVectorizer()),
                             ('tfodf', TfidfTransformer()),
                             ('clf', algo)])
        print(algorithm)
        algorithm.fit(X_train, y_train)
        cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
        print('cv_scores:',cv_scores)
        print('cv_scores mean:{}'.format(np.mean(cv_scores)))
        results[algorithm]= np.mean(cv_scores)
        y_pred = algorithm.predict(X_test)
        print(y_pred.shape)
        print(classification_report(y_test, y_pred))
    return results

In [79]:
dtc = DecisionTreeClassifier()
clf = SVC(C=0.9, kernel='rbf')
sgd = SGDClassifier(loss='hinge',
                   penalty = 'l2',
                   alpha = 1e-5,
                   max_iter=5, tol = None)
rfc = RandomForestClassifier(n_estimators=1000,
                            random_state=42)

gbc = GradientBoostingClassifier(n_estimators=1000,
                                learning_rate=0.001)

In [80]:
algos = [clf, dtc, rfc, sgd, gbc]
algos = [gbc]

#X_train = X_train.combined_features
#X_test = X_test.combined_features
res = test_algorithms(algos, X_train, y_train, X_test, y_test)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                            loss='deviance', max_depth=3,
                                            max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=



cv_scores: [0.64759036 0.68975904 0.71686747 0.67975831 0.69486405]
cv_scores mean:0.6857678447930695
(415,)
                                precision    recall  f1-score   support

                      AskIndia       0.95      0.76      0.84        25
              Business/Finance       0.75      0.50      0.60        24
                       CAA-NRC       0.94      0.71      0.81        24
                   CAA-NRC-NPR       0.71      0.86      0.77        14
                   Coronavirus       0.75      0.91      0.82        23
                Demonetization       0.94      0.70      0.80        23
                          Food       0.85      0.68      0.76        25
                 Non-Political       0.93      0.93      0.93        14
       Official Sadness Thread       0.00      0.00      0.00         0
                           Old       1.00      0.61      0.76        18
               Original Comics       1.00      0.60      0.75         5
                   Photogr

  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
algorithm = Pipeline([('vect', CountVectorizer()),
          ('tfodf', TfidfTransformer()),
          ('clf', gbc)])
print(algorithm)
algorithm.fit(X_train, y_train)
cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
print('cv_scores:',cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                            loss='deviance', max_depth=3,
                                            max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=



cv_scores: [0.64759036 0.68975904 0.71385542 0.67975831 0.69486405]
cv_scores mean:0.6851654351545153


In [83]:
#results[algorithm]= np.mean(cv_scores)
y_pred = algorithm.predict(X_test)
print(y_pred.shape)
print(classification_report(y_test, y_pred))

(415,)
                                precision    recall  f1-score   support

                      AskIndia       0.95      0.72      0.82        25
              Business/Finance       0.75      0.50      0.60        24
                       CAA-NRC       0.89      0.71      0.79        24
                   CAA-NRC-NPR       0.71      0.86      0.77        14
                   Coronavirus       0.75      0.91      0.82        23
                Demonetization       0.94      0.65      0.77        23
                          Food       0.85      0.68      0.76        25
                 Non-Political       0.93      0.93      0.93        14
       Official Sadness Thread       0.00      0.00      0.00         0
                           Old       1.00      0.61      0.76        18
               Original Comics       1.00      0.60      0.75         5
                   Photography       0.88      1.00      0.93        14
              Policy & Economy       0.18      0.21     

  _warn_prf(average, modifier, msg_start, len(result))


In [84]:
import joblib

In [86]:
joblib.dump(algorithm, "gb_model.sav")

['gb_model.sav']