In [180]:
import sklearn
import pandas as pd
import numpy as np

In [191]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, auc, roc_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import class_weight
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [182]:
data = pd.read_csv('cleaned_data.csv')

In [183]:
data.head()

Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp,feature_combine
0,CAA-NRC,bjp digging grave caa nrc npr,753,eqbcz2,https://i.redd.it/48nytcnskgb41.jpg,102.0,,GPA9304,nope know cant good governance divide people e...,2020-01-18 17:13:51,bjp digging grave caa nrc npr nope know cant g...
1,CAA-NRC,spotted anti caa nrc rally dunkin donuts berli...,735,eqhfw3,https://i.redd.it/uzdly44tyjb41.jpg,68.0,,Slimshady9829,genuine question matter people protest foreign...,2020-01-19 04:37:40,spotted anti caa nrc rally dunkin donuts berli...
2,CAA-NRC,cousin strongly supports caa nrc unable explai...,7,fceyh5,https://www.reddit.com/r/india/comments/fceyh5...,12.0,cousin india strongly supports caanrc trying u...,ultimatereal,know might downvoted saying worth arguing spoi...,2020-03-03 06:22:10,cousin strongly supports caa nrc unable explai...
3,CAA-NRC,15 lakh 150 000 people protest peacefully #caa...,647,eei8rn,https://twitter.com/the_bongrel/status/1209028...,24.0,,in3po,seriously think bjp might pulled goal muslims ...,2019-12-23 22:39:17,15 lakh 150 000 people protest peacefully #caa...
4,CAA-NRC,people eliminated caa nrc process wont get cit...,181,eivymz,https://www.reddit.com/r/india/comments/eivymz...,19.0,common misconception seeing making rounds get ...,entirepolscience,best explanation caanrc seen https wwwyoutubec...,2020-01-02 22:37:22,people eliminated caa nrc process wont get cit...


In [184]:
#data.fillna("",inplace = True)

In [185]:
labels = data.flair
features = data.feature_combine
X_train, X_test, y_train, y_test = train_test_split(features, labels, random_state=42, test_size=0.2)

class_weights = class_weight.compute_class_weight('balanced',
                                                  np.unique(np.ravel(y_train.values)),
                                                 np.ravel(y_train.values))

In [186]:
flairs = np.unique(np.ravel(y_train.values))

In [187]:
def one_vector(label):
    temp = np.zeros(label.shape[0])
    for i in range(label.shape[0]):
        temp[i] = np.where(label[i] == 1)[0]
    return temp

In [188]:
def test_algorithms(algorithms, X_train, y_train, X_test, y_test):
    results = {}
    for algo in algorithms:
        algorithm = Pipeline([('vect', CountVectorizer()),
                             ('tfodf', TfidfTransformer()),
                             ('clf', algo)])
        print(algorithm)
        algorithm.fit(X_train, y_train)
        cv_scores = cross_val_score(algorithm, X_train, y_train, cv=5)
        print('cv_scores:',cv_scores)
        print('cv_scores mean:{}'.format(np.mean(cv_scores)))
        results[algorithm]= np.mean(cv_scores)
        y_pred = algorithm.predict(X_test)
        print(y_pred.shape)
        print(classification_report(y_test, y_pred))
    return results

In [192]:
dtc = DecisionTreeClassifier()
clf = SVC(C=0.9, kernel='rbf')
sgd = SGDClassifier(loss='hinge',
                   penalty = 'l2')
rfc = RandomForestClassifier(n_estimators=1000,
                            random_state=42)

gbc = GradientBoostingClassifier(n_estimators=150,
                                learning_rate=0.2,
                                min_samples_leaf=3,
                               max_depth = 4,
                                 subsample = 0.5)

In [193]:
algos = [clf, dtc, rfc, sgd, gbc]
#X_train = X_train.combined_features
#X_test = X_test.combined_features
res = test_algorithms(algos, X_train, y_train, X_test, y_test)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfodf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=0.9, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,

cv_scores: [0.56410256 0.58404558 0.57692308 0.56348074 0.54208274]
cv_scores mean:0.5661269411626044
(878,)
                                precision    recall  f1-score   support

                  Announcement       0.67      0.53      0.59        49
                      AskIndia       0.56      0.53      0.54        38
              Business/Finance       0.40      0.51      0.45        57
                       CAA-NRC       0.74      0.77      0.75        56
                   CAA-NRC-NPR       0.50      0.33      0.40        18
                   Coronavirus       0.82      0.80      0.81        46
                Demonetization       0.79      0.84      0.81        44
                          Food       0.75      0.78      0.77        51
                 Non-Political       0.83      0.68      0.75        44
                           Old       0.65      0.67      0.66        42
                   Photography       0.87      0.81      0.84        42
              Policy & Eco

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
X_train.shape

In [162]:
y_train.shape

(1788,)

In [123]:
X_test

Unnamed: 0,title,score,id,url,comms_num,body,author,comments,timestamp,feature_combine
2776,hi r india rajeev chandrasekhar member rajya s...,315,387req,https://www.reddit.com/r/india/comments/387req...,173.0,rajeev chandrasekhar second time independent m...,rajeev_mp,hasnt indias definition broadband increased us...,2015-06-03 03:02:24,hi r india rajeev chandrasekhar member rajya s...
3022,serious seeking advice r india since places fo...,48,5rq4gj,https://www.reddit.com/r/india/comments/5rq4gj...,26.0,hello someone wanted research living physics s...,advic3s33k3r,cant really help first question regarding rd p...,2017-02-03 12:22:41,serious seeking advice r india since places fo...
2606,modi govt economic policies clueless years ind...,297,fll5sk,https://www.reddit.com/r/india/comments/fll5sk...,64.0,govt clueless economy even corona recession go...,ShortTesla_Rekt5,waiting see many bhaktas still support shit go...,2020-03-20 13:55:32,modi govt economic policies clueless years ind...
912,eve winter session parliament lets run interes...,0,3u8lyh,https://www.reddit.com/r/india/comments/3u8lyh...,16.0,lets go earlier parliament disruptions ones co...,MyselfWalrus,understand bitterness dead sessions wont chang...,2015-11-26 06:58:37,eve winter session parliament lets run interes...
877,dont discuss important stuff like,185,2vinkp,https://www.reddit.com/r/india/comments/2vinkp...,96.0,government start ranking states ease business ...,RajaRajaC,mind could hate modi bjp love modi bjp policie...,2015-02-11 21:49:09,dont discuss important stuff like mind could h...
151,caa nrc protestors wankhede stadium,45,eoma5w,https://mobile.twitter.com/desimojito/status/1...,2.0,,bitterpopsicle,fuck modi lol shit doctored p literally one ma...,2020-01-15 04:24:54,caa nrc protestors wankhede stadium fuck modi ...
3207,r india pets animals thread,42,dca1dn,https://www.reddit.com/r/india/comments/dca1dn...,36.0,since lot interest want scheduled threads post...,rorschach34,hankie streetdog adopted almost two years ago ...,2019-10-03 02:43:57,r india pets animals thread hankie streetdog a...
3762,askindia need advice buying new bike,17,by7svp,https://www.reddit.com/r/india/comments/by7svp...,24.0,hey guysi looking buy new bike ride city work ...,nagasadhu,budget 16 lakh specifically wrote number randi...,2019-06-09 02:37:57,askindia need advice buying new bike budget 16...
2722,time modi government stopped blaming rbi growt...,91,7ics2u,http://www.livemint.com/Opinion/vGWfsIhHzOflbZ...,10.0,,bulcfe,modi stopped blaming someone fuck smoking bro ...,2017-12-08 18:50:28,time modi government stopped blaming rbi growt...
3653,india independence,3,6t9ik5,https://www.reddit.com/r/india/comments/6t9ik5...,0.0,ask india indian born bred explained regular i...,girlwiththewildlocks,,2017-08-13 06:23:00,india independence nan https://www.reddit.com/...
