In [4]:
import datetime as dt
import os
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import praw
import seaborn as sns
import sklearn
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline


In [2]:
# Apply seaborn's plot styling for every figure drawn below.
sns.set(color_codes=True)

# Credentials are read from the environment so that no secret is stored in the
# notebook. A missing variable raises KeyError naming the variable to export.
# Required: REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USERNAME, REDDIT_PASSWORD.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='Testing_api',
                     username=os.environ['REDDIT_USERNAME'],
                     password=os.environ['REDDIT_PASSWORD'])

In [None]:
# Load the previously trained flair classifier from disk.
# NOTE(review): unpickling can execute arbitrary code embedded in the file, so
# 'finalized_model.sav' must only ever come from a trusted source.
with open('finalized_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Handle for r/india and a listing of its top submissions, capped at 500.
subreddit = reddit.subreddit('india')
top_subreddit = subreddit.top(limit=500)

In [None]:
# Preview a handful of top submissions as a quick sanity check.
for submission in subreddit.top(limit=5):
    print(submission.title, submission.url)

# Column-oriented accumulator: one list per column of the final DataFrame.
topics_dict = {"flair":[],  "title":[],  "score":[],  "id":[],  "url":[],  "comms_num": [],  "created": [], "body":[]}

for submission in top_subreddit:
    # Label each row with the submission's own flair. Posts without a flair
    # carry no label for the classifier, so they are skipped.
    flair = submission.link_flair_text
    if flair is None:
        continue
    topics_dict["flair"].append(flair)
    topics_dict["title"].append(submission.title)
    topics_dict["score"].append(submission.score)
    topics_dict["id"].append(submission.id)
    topics_dict["url"].append(submission.url)
    topics_dict["comms_num"].append(submission.num_comments)
    topics_dict["created"].append(submission.created)
    topics_dict["body"].append(submission.selftext)

# Every list has the same length, so they form the columns of one frame.
topics_data = pd.DataFrame(topics_dict)
print(topics_data)

In [None]:
def get_date(created):
    """Convert a POSIX timestamp into a naive ``datetime`` in local time.

    Parameters
    ----------
    created : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        The corresponding local date and time, without tzinfo.
    """
    converted = dt.datetime.fromtimestamp(created)
    return converted
# Derive a datetime for every row from its raw "created" epoch value.
_timestamp = topics_data.created.apply(get_date)

# Attach the converted values as a new "timestamp" column.
topics_data = topics_data.assign(**{"timestamp": _timestamp})
print(topics_data)

# Persist the scraped rows (without the index) for the analysis step.
topics_data.to_csv(path_or_buf='reddit-india-data.csv', index=False)

In [None]:
def data_analysis(input_file):
    """Print summary statistics for a submissions CSV and plot its comment counts.

    The frame is de-duplicated and rows with any missing value are dropped
    before plotting. Three figures are shown: a box plot of ``comms_num``, a
    scatter of ``comms_num`` against ``id``, and a bar chart of the five most
    frequent ``comms_num`` values.

    Parameters
    ----------
    input_file : str or path-like
        CSV file with a header row. It must contain the ``id`` and
        ``comms_num`` columns.

    Returns
    -------
    None
    """
    df = pd.read_csv(input_file, header = 0)
    print(df.head())
    print(df.count())
    print(df.shape)

    # Report and remove fully duplicated rows.
    duplicate_rows_df = df[df.duplicated()]
    print("number of duplicate rows:", duplicate_rows_df.shape)
    df = df.drop_duplicates()
    print(df.head())
    print(df.count())

    # Report missing values per column, then drop every incomplete row.
    print(df.isnull().sum())
    df = df.dropna()
    print(df.count())

    # Box plot on its own axes so it does not depend on pyplot's implicit figure.
    fig, ax = plt.subplots()
    sns.boxplot(x=df["comms_num"], ax=ax)
    ax.set_title("Distribution of comms_num")
    plt.show()

    # Scatter plot of the comment count of every submission id.
    fig, ax = plt.subplots(figsize=(10,6))
    ax.scatter(df["id"],df["comms_num"])
    ax.set_title("Comments vs id")
    ax.set_xlabel("id")
    ax.set_ylabel("comms_num")
    plt.show()

    # Bar chart of the five most frequent comment counts: the x axis holds the
    # comment-count values and the y axis holds how many submissions have each.
    fig, ax = plt.subplots(figsize=(10,5))
    df["comms_num"].value_counts().nlargest().plot(kind='bar', ax=ax)
    ax.set_title("Most frequent comment counts")
    ax.set_xlabel("Number of comments")
    ax.set_ylabel("Number of submissions")
    plt.show()

In [None]:
def train_test(X,y):
    """Split the data 70/30 and print the accuracy of each of the three classifiers.

    Parameters
    ----------
    X : sequence
        Feature values, passed unchanged to ``train_test_split``.
    y : sequence
        Labels aligned with ``X``.

    Returns
    -------
    None
    """
    # train_test_split yields X_train, X_test, y_train, y_test in this order,
    # which is exactly the positional order each classifier helper expects.
    splits = train_test_split(X, y, test_size=0.3, random_state=42)

    print("Results of Naive Bayes Classifier")
    nb_classifier(*splits)

    print("Results of Random Forest")
    randomforest(*splits)

    print("Results of MLP Classifier")
    mlpclassifier(*splits)

In [None]:
def nb_classifier(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> multinomial Naive Bayes pipeline and print its test accuracy.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw text documents; they are vectorised by ``CountVectorizer``.
    y_train, y_test : sequence
        Labels aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    from sklearn.naive_bayes import MultinomialNB

    nb = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', MultinomialNB()),
                  ])
    nb.fit(X_train, y_train)

    y_pred = nb.predict(X_test)

    # accuracy_score returns a fraction in [0, 1], so it is printed without a
    # '%' suffix, in the same format the other classifier helpers use.
    print('accuracy %s' % accuracy_score(y_test, y_pred))

In [None]:
def randomforest(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> random forest pipeline and print its test accuracy.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw text documents; they are vectorised by ``CountVectorizer``.
    y_train, y_test : sequence
        Labels aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    from sklearn.ensemble import RandomForestClassifier

    # 1000 trees with a fixed seed so repeated runs give the same forest.
    forest = RandomForestClassifier(n_estimators=1000, random_state=42)
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', forest),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    score = accuracy_score(predictions, y_test)
    print('accuracy %s' % score)


In [None]:
def mlpclassifier(X_train, X_test, y_train, y_test):
    """Fit a count -> TF-IDF -> multi-layer perceptron pipeline and print its test accuracy.

    Parameters
    ----------
    X_train, X_test : iterable of str
        Raw text documents; they are vectorised by ``CountVectorizer``.
    y_train, y_test : sequence
        Labels aligned with ``X_train`` and ``X_test``.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    from sklearn.neural_network import MLPClassifier

    # Three hidden layers of 30 units. The seed is fixed (same value as the
    # train/test split and the random forest) so repeated runs are reproducible.
    mlp = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30), random_state=42)),
                   ])
    mlp.fit(X_train, y_train)

    y_pred = mlp.predict(X_test)

    print('accuracy %s' % accuracy_score(y_test, y_pred))

In [None]:
def detect_flair(id):
    """Predict the flair of a Reddit submission from its title and URL.

    Parameters
    ----------
    id : str
        Reddit submission id. The name shadows the ``id`` builtin inside this
        function; it is kept so existing callers keep working.

    Returns
    -------
    The result of ``loaded_model.predict`` for a single-element input list.
    """
    submission = reddit.submission(id=id)
    # The feature string is the title immediately followed by the URL. It is
    # kept in a local variable rather than written into the global topics_dict,
    # which holds the scraped column lists.
    combined = submission.title + submission.url
    return loaded_model.predict([combined])

In [None]:
def find_flair(id, url=None):
    """Predict the flair for an article URL.

    Parameters
    ----------
    id : str
        Unused; kept so existing callers that pass a submission id still work.
    url : str, optional
        URL to classify. When omitted, the user is prompted for it on stdin,
        which blocks until a line is entered.

    Returns
    -------
    The result of ``loaded_model.predict`` for a single-element input list.
    """
    if url is None:
        url = input("Enter the url of the article whose flare you want to find:")
    return loaded_model.predict([url])

In [None]:
# Exploratory analysis of the CSV written by the scraping step.
file_name = "reddit-india-data.csv"
data_analysis(file_name)

# Candidate text features (title, body, url) and the flair label.
X = topics_data.title
Y = topics_data.body
Z = topics_data.url
F = topics_data.flair
print("Flair Detection using Title as Feature")
train_test(X,F)
print("Flair Detection using Body as Feature")
train_test(Y,F)
print("Flair Detection using URL as Feature")
train_test(Z,F)

# Run the saved model on the current top submissions. One count drives both
# the message and the listing limit so the two cannot disagree.
subreddit = reddit.subreddit('india')
preview_count = 10
print('The flares of first %d headings are:' % preview_count)
for submission in subreddit.top(limit=preview_count):
    print(detect_flair(submission.id))

# Interactive check: find_flair prompts for a URL and classifies it.
for submission in subreddit.top(limit=1):
    print(find_flair(submission.id))
