In [1]:
import pandas as pd
import numpy as np
# Load the Reddit post dataset from the CSV in the working directory.
# (Judging by the file name it has already been cleaned/formatted — confirm.)
reddit_data = pd.read_csv('Final_formatted_data.csv')

# Flair names. Not referenced by any other cell visible in this notebook.
flairs = [
    "AskIndia",
    "Non-Political",
    "[R]eddiquette",
    "Scheduled",
    "Photography",
    "Science/Technology",
    "Politics",
    "Business/Finance",
    "Policy/Economy",
    "Sports",
    "Food",
    "AMA",
]

# Keep the preview as the LAST expression of the cell: Jupyter only renders
# the value of a cell's final expression, so a `.head()` in the middle of the
# cell is silently discarded.
reddit_data.head()

# Importing various scikit-learn ML models

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Functions for various models.

In [14]:
# Naive Bayes Classifier
def naiveBayse(X_train, X_test, y_train, y_test):
    """Train a multinomial Naive Bayes text classifier and report test accuracy.

    The model is a three-stage pipeline: token counts (``CountVectorizer``),
    TF-IDF re-weighting (``TfidfTransformer``) and ``MultinomialNB``, all with
    their default settings.

    Args:
        X_train: Training documents, fed directly to ``CountVectorizer``.
        X_test: Held-out documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy on the test split. The same value is also printed.
    """
    nb = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ])

    nb.fit(X_train, y_train)
    y_pred = nb.predict(X_test)

    # accuracy_score's documented signature is (y_true, y_pred). Accuracy is
    # symmetric, so the printed number is unchanged, but the correct order
    # keeps this safe if the metric is ever swapped for an asymmetric one.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"NB Accuracy: {accuracy}")
    return accuracy

In [15]:
# Logistic Regression
def logReg(X_train, X_test, y_train, y_test):
    """Train a logistic-regression text classifier and report test accuracy.

    The model is a three-stage pipeline: token counts (``CountVectorizer``),
    TF-IDF re-weighting (``TfidfTransformer``) and ``LogisticRegression``.

    Args:
        X_train: Training documents, fed directly to ``CountVectorizer``.
        X_test: Held-out documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy on the test split. The same value is also printed.
    """
    logreg = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # C is the inverse regularization strength, so C=1e5 means the model
        # is only very weakly regularized.
        ('clf', LogisticRegression(n_jobs=1, C=1e5)),
    ])
    logreg.fit(X_train, y_train)
    y_pred = logreg.predict(X_test)

    # accuracy_score's documented signature is (y_true, y_pred). Accuracy is
    # symmetric, so the printed number is unchanged by fixing the order.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Logistic Regression Accuracy: {accuracy}")
    return accuracy

In [16]:
# Linear Support Vector Machine
def svmLinear(X_train, X_test, y_train, y_test):
    """Train a linear SVM (via SGD) text classifier and report test accuracy.

    The model is a three-stage pipeline: token counts (``CountVectorizer``),
    TF-IDF re-weighting (``TfidfTransformer``) and an ``SGDClassifier`` with
    hinge loss and an L2 penalty.

    Args:
        X_train: Training documents, fed directly to ``CountVectorizer``.
        X_test: Held-out documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy on the test split. The same value is also printed.
    """
    sgd = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # Hinge loss makes the SGD classifier a linear SVM. With tol=None
        # there is no early stopping, so training runs for max_iter=5 epochs.
        # random_state is fixed so results are repeatable.
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              random_state=42, max_iter=5, tol=None)),
    ])
    sgd.fit(X_train, y_train)
    y_pred = sgd.predict(X_test)

    # accuracy_score's documented signature is (y_true, y_pred). Accuracy is
    # symmetric, so the printed number is unchanged by fixing the order.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Linear SVM Accuracy: {accuracy}")
    return accuracy

In [17]:
# Random Forest
def randomForest(X_train, X_test, y_train, y_test):
    """Train a random-forest text classifier and report test accuracy.

    The model is a three-stage pipeline: token counts (``CountVectorizer``),
    TF-IDF re-weighting (``TfidfTransformer``) and a
    ``RandomForestClassifier`` with 1000 trees.

    Args:
        X_train: Training documents, fed directly to ``CountVectorizer``.
        X_test: Held-out documents to predict on.
        y_train: Labels for ``X_train``.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy on the test split. The same value is also printed.
    """
    rf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # random_state is fixed so repeated runs build the same forest.
        ('clf', RandomForestClassifier(n_estimators=1000, random_state=42)),
    ])
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    # accuracy_score's documented signature is (y_true, y_pred). Accuracy is
    # symmetric, so the printed number is unchanged by fixing the order.
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Random Forest accuracy: {accuracy}")
    return accuracy

# Training, Testing and Printing Accuracy for each model.

In [18]:
# Target labels: the flair is the dependent variable every model predicts.
y = reddit_data['flair']


def train_test(X):
    """Evaluate all four classifiers on one feature column.

    Splits ``X`` and the module-level target ``y`` 90/10 with a fixed seed,
    then runs each model function on that same split. Each model function
    prints its own accuracy line.

    Args:
        X: Feature values aligned with ``y`` (callers in this notebook pass a
            single column of ``reddit_data``).
    """
    # train_test_split yields X_train, X_test, y_train, y_test in exactly the
    # positional order the model functions expect.
    split = train_test_split(X, y, test_size=0.1, random_state=42)
    for evaluate in (naiveBayse, logReg, svmLinear, randomForest):
        evaluate(*split)

# Changing the independent variable (the input feature) and calling train_test()

In [20]:
# Run the full model comparison once per candidate feature column, printing a
# heading before each batch of accuracy lines.
for _heading, _column in (
    ("Flair detection accuracy using TITLE as feature\n", 'title'),
    ("Flair detection accuracy using URL as feature\n", 'url'),
    ("Flair detection accuracy using Features Group(title + url) as feature\n", 'feature_Grp'),
):
    print(_heading)
    train_test(reddit_data[_column])

Flair detection accuracy using TITLE as feature

NB Accuracy: 0.475
Logistic Regression Accuracy: 0.475
Linear SVM Accuracy: 0.4583333333333333




Random Forest accuracy % 0.4666666666666667
Flair detection accuracy using URL as feature

NB Accuracy: 0.25833333333333336
Logistic Regression Accuracy: 0.3416666666666667
Linear SVM Accuracy: 0.31666666666666665




Random Forest accuracy % 0.20833333333333334
Flair detection accuracy using Features Group(title + url) as feature

NB Accuracy: 0.45
Logistic Regression Accuracy: 0.5083333333333333
Linear SVM Accuracy: 0.5333333333333333




Random Forest accuracy % 0.425


# The grouped features (title + url) with the Linear SVM model give the highest accuracy.

In [21]:
#Linear SVM Accuracy: 0.5333333333333333

# Saving the model using the pickle library

In [23]:
import pickle

# Retrain the linear SVM pipeline on the grouped (title + url) feature, which
# scored highest in the comparison above, and persist it to disk.
X = reddit_data['feature_Grp']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Same hyperparameters as svmLinear(): hinge loss, fixed seed, and
    # max_iter=5 epochs with no early stopping (tol=None).
    ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                          random_state=42, max_iter=5, tol=None)),
])
sgd.fit(X_train, y_train)
y_pred = sgd.predict(X_test)  # kept in the namespace for any later cells

# Dump the pipeline that is already fitted rather than calling fit() a second
# time inside dump(). The `with` block guarantees the file handle is flushed
# and closed even if pickling fails.
with open("model_linear_svm.sav", 'wb') as model_file:
    pickle.dump(sgd, model_file)