In [20]:
import pandas as pd
import numpy as np
reddit_data = pd.read_csv('Final_formatted_data.csv')
reddit_data.head()
flairs = ["AskIndia", "Non-Political", "[R]eddiquette", "Scheduled", "Photography", "Science/Technology", "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "AMA"]

# Importing varied Scikit-learn ML models

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Training and testing model keeping Title as feature

In [22]:
X = reddit_data['title'] #Dependent variable
y = reddit_data['flair'] #Independent variable
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.1,random_state = 42)

In [23]:
# Naive Bayse Classifier

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

NB = nb.fit(X_train, y_train)
#pickle.dump(NB,open("model_NB.sav",'wb'))
y_pred = nb.predict(X_test)


print(f"NB accuracy {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

NB accuracy 0.475
                    precision    recall  f1-score   support

          AskIndia       0.79      0.92      0.85        12
     Non-Political       0.45      0.31      0.37        16
     [R]eddiquette       0.33      0.38      0.35         8
         Scheduled       0.62      0.31      0.42        16
       Photography       0.27      0.33      0.30         9
Science/Technology       0.78      0.78      0.78         9
          Politics       0.18      0.25      0.21         8
  Business/Finance       0.06      0.17      0.09         6
    Policy/Economy       0.91      1.00      0.95        10
            Sports       0.57      0.44      0.50         9
              Food       0.62      0.71      0.67         7
               AMA       0.20      0.10      0.13        10

          accuracy                           0.48       120
         macro avg       0.48      0.48      0.47       120
      weighted avg       0.51      0.47      0.48       120



In [25]:
#SGD
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
SGD = sgd.fit(X_train, y_train)
#pickle.dump(SGD,open("model_SGC.sav",'wb'))
y_pred = sgd.predict(X_test)

print(f"SGD accuracy {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

SGD accuracy 0.4583333333333333
                    precision    recall  f1-score   support

          AskIndia       0.80      1.00      0.89        12
     Non-Political       0.45      0.31      0.37        16
     [R]eddiquette       0.17      0.12      0.14         8
         Scheduled       0.54      0.44      0.48        16
       Photography       0.33      0.33      0.33         9
Science/Technology       0.47      0.78      0.58         9
          Politics       0.17      0.25      0.20         8
  Business/Finance       0.00      0.00      0.00         6
    Policy/Economy       0.83      1.00      0.91        10
            Sports       0.44      0.44      0.44         9
              Food       0.80      0.57      0.67         7
               AMA       0.00      0.00      0.00        10

          accuracy                           0.46       120
         macro avg       0.42      0.44      0.42       120
      weighted avg       0.44      0.46      0.44       120



In [26]:
#Logistic Regression
logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1, C=1e5)),
               ])
LOGREG = logreg.fit(X_train, y_train)
#pickle.dump(LOGREG,open("model_LOGREG.sav",'wb'))
y_pred = logreg.predict(X_test)

print(f"LOG accuracy {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

LOG accuracy % 0.475
                    precision    recall  f1-score   support

          AskIndia       0.92      0.92      0.92        12
     Non-Political       0.46      0.38      0.41        16
     [R]eddiquette       0.25      0.25      0.25         8
         Scheduled       0.67      0.50      0.57        16
       Photography       0.30      0.33      0.32         9
Science/Technology       0.50      0.67      0.57         9
          Politics       0.15      0.25      0.19         8
  Business/Finance       0.00      0.00      0.00         6
    Policy/Economy       0.91      1.00      0.95        10
            Sports       0.50      0.56      0.53         9
              Food       0.50      0.57      0.53         7
               AMA       0.00      0.00      0.00        10

          accuracy                           0.48       120
         macro avg       0.43      0.45      0.44       120
      weighted avg       0.47      0.47      0.47       120



