## Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
import sklearn
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import joblib

## Loading the pre-processed data

In [2]:
data = pd.read_csv("pre_processed_data_with_top_comment_latest.csv")

In [3]:
flairs = ["Non-Political", "Scheduled", "AskIndia", "Science/Technology", "Politics",
          "Business/Finance", "Policy/Economy", "Sports", "Food"]

## Creating a column for combined text of Title and Comments

In [4]:
all_data = data["title"].fillna('') + data["comments"].fillna('')
data = data.assign(all_data = all_data)
data.head()

Unnamed: 0,flair,title,url,comms_num,body,author,comments,all_data
0,Non-Political,indian state ut renamed country similar popula...,https://i.redd.it/hk9o11b8dun41.png,20,,schadenfeuder,nice name uttar pradesh ajay bisht gonna happy...,indian state ut renamed country similar popula...
1,Non-Political,hotstar finally uploads sunday last week tonig...,https://www.hotstar.com/in/tv/last-week-tonigh...,12,,TimeVendor,first turning internet entire state refusing b...,hotstar finally uploads sunday last week tonig...
2,Non-Political,best nonpolitical stand comedian,https://www.reddit.com/r/india/comments/g3zbrt...,9,thing related politics give anxiety recommend ...,daredevil005,abhishek upmanyu kenny sebastian biswa anubhav...,best nonpolitical stand comedianabhishek upman...
3,Non-Political,icse isc exam postponed due covid19 nonpolitical,https://cisce.org//UploadedFiles/PDF/COVID%201...,0,,DSMalhotra,,icse isc exam postponed due covid19 nonpolitical
4,Non-Political,nonpolitical friend wrote first book week givi...,https://www.reddit.com/r/india/comments/ezqw6i...,0,friend mine wrote first book music men volume ...,bitswreck,,nonpolitical friend wrote first book week givi...


## Multinomial Naive Bayes Model

In [5]:
def mnb(X_train, X_test, y_train, y_test):
  
  from sklearn.naive_bayes import MultinomialNB

  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

def train_test_mnb(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

  print("Results of Naive Bayes Classifier")
  mnb(X_train, X_test, y_train, y_test)
    
train_test_mnb(data.all_data, data.flair)    

Results of Naive Bayes Classifier
accuracy 0.6274038461538461
                    precision    recall  f1-score   support

     Non-Political       0.62      0.55      0.58        51
         Scheduled       0.44      0.76      0.55        45
          AskIndia       0.62      0.81      0.71        43
Science/Technology       0.89      0.37      0.52        43
          Politics       0.62      0.49      0.55        53
  Business/Finance       0.56      0.80      0.66        46
    Policy/Economy       0.79      0.48      0.60        54
            Sports       0.67      0.72      0.69        39
              Food       0.86      0.74      0.79        42

          accuracy                           0.63       416
         macro avg       0.67      0.64      0.63       416
      weighted avg       0.67      0.63      0.62       416



#### MNB received accuracy of 62.74%

## Support Vector Machine Model

In [6]:
def lsvm(X_train, X_test, y_train, y_test):
  
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=0.001, random_state=42, max_iter=10, tol=None)),
                 ])
  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

def train_test_lsvm(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

  print("Results of LSVM Classifier")
  lsvm(X_train, X_test, y_train, y_test)
    
train_test_lsvm(data.all_data, data.flair)  

Results of LSVM Classifier
accuracy 0.6850961538461539
                    precision    recall  f1-score   support

     Non-Political       0.70      0.69      0.69        51
         Scheduled       0.61      0.60      0.61        45
          AskIndia       0.75      0.77      0.76        43
Science/Technology       0.74      0.65      0.69        43
          Politics       0.65      0.60      0.63        53
  Business/Finance       0.54      0.67      0.60        46
    Policy/Economy       0.80      0.69      0.74        54
            Sports       0.67      0.74      0.71        39
              Food       0.73      0.79      0.76        42

          accuracy                           0.69       416
         macro avg       0.69      0.69      0.69       416
      weighted avg       0.69      0.69      0.69       416



#### LSVM received accuracy of 68.50%

## Logistic Regression Model

In [7]:
def logisticreg(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
def train_test_logisticreg(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    
train_test_logisticreg(data.all_data, data.flair)    

Results of Logistic Regression




accuracy 0.6706730769230769
                    precision    recall  f1-score   support

     Non-Political       0.61      0.69      0.65        51
         Scheduled       0.55      0.62      0.58        45
          AskIndia       0.82      0.74      0.78        43
Science/Technology       0.79      0.60      0.68        43
          Politics       0.61      0.58      0.60        53
  Business/Finance       0.56      0.70      0.62        46
    Policy/Economy       0.68      0.63      0.65        54
            Sports       0.74      0.74      0.74        39
              Food       0.82      0.76      0.79        42

          accuracy                           0.67       416
         macro avg       0.69      0.67      0.68       416
      weighted avg       0.68      0.67      0.67       416



#### Logistic Regression received accuracy of 67.06%

## Random Forest Model

In [8]:
def randomforest(X_train, X_test, y_train, y_test):
  
  from sklearn.ensemble import RandomForestClassifier
  
  rf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 2000, max_depth = 70, max_features = 'auto', 
                   bootstrap = True, random_state = 42)),
                 ])
  rf.fit(X_train, y_train)

  y_pred = rf.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))   
    
def train_test_rf(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

  print("Results of Random Forest")
  randomforest(X_train, X_test, y_train, y_test)
    
train_test_rf(data.all_data, data.flair) 

Results of Random Forest
accuracy 0.7091346153846154
                    precision    recall  f1-score   support

     Non-Political       0.77      0.71      0.73        51
         Scheduled       0.69      0.64      0.67        45
          AskIndia       0.77      0.77      0.77        43
Science/Technology       0.80      0.77      0.79        43
          Politics       0.68      0.57      0.62        53
  Business/Finance       0.54      0.80      0.65        46
    Policy/Economy       0.78      0.65      0.71        54
            Sports       0.76      0.74      0.75        39
              Food       0.69      0.79      0.73        42

          accuracy                           0.71       416
         macro avg       0.72      0.71      0.71       416
      weighted avg       0.72      0.71      0.71       416



#### Random Forest received accuracy of 70.91%