## Importing Libraries

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
import sklearn
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

## Loading the pre-processed data

In [2]:
data = pd.read_csv("pre_processed_data_with_top_comment_latest.csv")

In [3]:
flairs = ["Non-Political", "Scheduled", "AskIndia", "Science/Technology", "Politics",
          "Business/Finance", "Policy/Economy", "Sports", "Food"]

## Checking for Nan Title, and if found, removing it

In [4]:
null_title = pd.isnull(data['title'])
data[null_title]

Unnamed: 0,flair,title,url,comms_num,body,author,comments
2060,Food,,https://www.reddit.com/r/india/comments/fx94c7...,18,iam student studying visakhapatnam 3 day lockd...,lazyyyyy1yyyyy1,im sure help maybe call police ask help dont f...


In [5]:
data = data.drop([2060], axis = 0)

In [6]:
null_title = pd.isnull(data['title'])
data[null_title]

Unnamed: 0,flair,title,url,comms_num,body,author,comments


## Multinomial Naive Bayes Model

In [7]:
def mnb(X_train, X_test, y_train, y_test):
  
  from sklearn.naive_bayes import MultinomialNB

  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

def train_test_mnb(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42)

  print("Results of Naive Bayes Classifier")
  mnb(X_train, X_test, y_train, y_test)
    
train_test_mnb(data.title, data.flair)    

Results of Naive Bayes Classifier
accuracy 0.6474358974358975
                    precision    recall  f1-score   support

     Non-Political       0.72      0.67      0.70        43
         Scheduled       0.61      0.49      0.54        39
          AskIndia       0.72      0.89      0.80        38
Science/Technology       0.63      0.76      0.69        29
          Politics       0.50      0.38      0.43        32
  Business/Finance       0.50      0.69      0.58        32
    Policy/Economy       0.74      0.60      0.67        43
            Sports       0.61      0.68      0.64        25
              Food       0.75      0.68      0.71        31

          accuracy                           0.65       312
         macro avg       0.64      0.65      0.64       312
      weighted avg       0.65      0.65      0.64       312



#### MNB received accuracy of 64.74%

## Support Vector Machine Model

In [8]:
def lsvm(X_train, X_test, y_train, y_test):
  
  from sklearn.linear_model import SGDClassifier

  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=0.001, random_state=42, max_iter=10, tol=None)),
                 ])
  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

def train_test_lsvm(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42)

  print("Results of LSVM Classifier")
  lsvm(X_train, X_test, y_train, y_test)
    
train_test_lsvm(data.title, data.flair)  

Results of LSVM Classifier
accuracy 0.6858974358974359
                    precision    recall  f1-score   support

     Non-Political       0.79      0.72      0.76        43
         Scheduled       0.68      0.49      0.57        39
          AskIndia       0.77      0.89      0.83        38
Science/Technology       0.74      0.90      0.81        29
          Politics       0.54      0.47      0.50        32
  Business/Finance       0.56      0.72      0.63        32
    Policy/Economy       0.72      0.65      0.68        43
            Sports       0.56      0.72      0.63        25
              Food       0.77      0.65      0.70        31

          accuracy                           0.69       312
         macro avg       0.68      0.69      0.68       312
      weighted avg       0.69      0.69      0.68       312



#### LSVM received accuracy of 68.58%

## Applying GridSearch to LSVM

In [11]:
def lsvm(X_train, X_test, y_train, y_test):
    
    from sklearn.model_selection import GridSearchCV
    from sklearn.linear_model import SGDClassifier
    
    param_grid = {'clf__max_iter': [1, 5, 10, 20],
              'clf__alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
              'clf__penalty': ["none", "l1", "l2"],
              'clf__loss' : ["hinge"]}
    
    pipeline = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', SGDClassifier()),
                        ])
    
    classifier = GridSearchCV(pipeline, param_grid = param_grid)
    classifier.fit(X_train, y_train)

    y_pred = classifier.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    print("Best Params: ", classifier.best_params_)

def train_test_lsvm(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42)
    print("Results of Linear SVM")
    lsvm(X_train, X_test, y_train, y_test)
    
train_test_lsvm(data.title, data.flair)  

Results of Linear SVM












accuracy 0.6923076923076923
                    precision    recall  f1-score   support

     Non-Political       0.84      0.72      0.77        43
         Scheduled       0.69      0.51      0.59        39
          AskIndia       0.77      0.89      0.83        38
Science/Technology       0.73      0.93      0.82        29
          Politics       0.60      0.47      0.53        32
  Business/Finance       0.59      0.72      0.65        32
    Policy/Economy       0.69      0.63      0.66        43
            Sports       0.53      0.72      0.61        25
              Food       0.75      0.68      0.71        31

          accuracy                           0.69       312
         macro avg       0.69      0.70      0.69       312
      weighted avg       0.70      0.69      0.69       312

Best Params:  {'clf__alpha': 0.001, 'clf__loss': 'hinge', 'clf__max_iter': 5, 'clf__penalty': 'l2'}




#### LSVM received accuracy of 69.23% after GridSearch for parameters {'clf__alpha': 0.001, 'clf__loss': 'hinge', 'clf__max_iter': 5, 'clf__penalty': 'l2'}

## Logistic Regression Model

In [10]:
def logisticreg(X_train, X_test, y_train, y_test):
    from sklearn.linear_model import LogisticRegression

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))
    
def train_test_logisticreg(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state = 42)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    
train_test_logisticreg(data.title, data.flair)    

Results of Logistic Regression
accuracy 0.6474358974358975
                    precision    recall  f1-score   support

     Non-Political       0.86      0.74      0.80        43
         Scheduled       0.49      0.49      0.49        39
          AskIndia       0.76      0.84      0.80        38
Science/Technology       0.92      0.83      0.87        29
          Politics       0.38      0.41      0.39        32
  Business/Finance       0.58      0.59      0.58        32
    Policy/Economy       0.71      0.56      0.62        43
            Sports       0.51      0.72      0.60        25
              Food       0.66      0.68      0.67        31

          accuracy                           0.65       312
         macro avg       0.65      0.65      0.65       312
      weighted avg       0.66      0.65      0.65       312





#### Logistic Regression received accuracy of 64.74%

## Random Forest Model

In [11]:
def randomforest(X_train, X_test, y_train, y_test):
  
  from sklearn.ensemble import RandomForestClassifier
  
  rf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 2000, max_depth = 70, max_features = 'auto', 
                   bootstrap = True, random_state = 42)),
                 ])
  rf.fit(X_train, y_train)

  y_pred = rf.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))   
    
def train_test_rf(X,y):
 
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 42)

  print("Results of Random Forest")
  randomforest(X_train, X_test, y_train, y_test)
    
train_test_rf(data.title, data.flair) 

Results of Random Forest
accuracy 0.6850961538461539
                    precision    recall  f1-score   support

     Non-Political       1.00      0.70      0.82        50
         Scheduled       0.52      0.69      0.60        49
          AskIndia       0.85      0.85      0.85        48
Science/Technology       0.97      0.91      0.94        43
          Politics       0.47      0.56      0.51        43
  Business/Finance       0.68      0.64      0.66        47
    Policy/Economy       0.71      0.58      0.64        55
            Sports       0.51      0.64      0.57        36
              Food       0.63      0.60      0.61        45

          accuracy                           0.69       416
         macro avg       0.71      0.69      0.69       416
      weighted avg       0.71      0.69      0.69       416



#### Random Forest received accuracy of 68.50%.