In [1]:
import requests
import praw
import datetime
from dateutil.relativedelta import relativedelta
import concurrent.futures
import pandas as pd

## Data Collection

In [2]:
reddit = praw.Reddit("bot1")

In [13]:
def get_post_data_for_range(start_date, end_date, subreddit):
    query = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500&before={end_date}&after={start_date}'
    post_data = requests.get(query)
    return post_data.json()['data']
    
def filter_submission(total_submission_dict, post):
    submission = reddit.submission(id=post['id'])
    if (submission.link_flair_text is not None):
        total_submission_dict['date'].append(post['created_utc'])
        total_submission_dict['id'].append(post['id'])
        total_submission_dict['flair'].append(submission.link_flair_text)
        total_submission_dict['title'].append(post['title'])
        total_submission_dict['selftext'].append(post['selftext'])

def get_post_data(start_date, end_date, subreddit):
    total_submission_dict = {
        'date': [], 
        'id': [],
        'flair': [],
        'title': [],
        'selftext': []
    }
    
    post_data = get_post_data_for_range(start_date, end_date, subreddit)
    for x in range (5):
        for submission in post_data:
            filter_submission(total_submission_dict, submission)

        start_date = post_data[-1]['created_utc']
        post_data = get_post_data_for_range(start_date, end_date, subreddit)
    return total_submission_dict
    

# Gadgets Post Classification

In [41]:
start_date = ((datetime.datetime.utcnow() - relativedelta(years=1)))
dates = [int((start_date + relativedelta(months=x)).timestamp()) for x in range(12)]
with concurrent.futures.ThreadPoolExecutor() as executor:
    threads= [executor.submit(get_post_data, dates[month], dates[month+1], 'gadgets') for month in range(0,len(dates),2)]
    post_data_per_month = [thread.result() for thread in threads]

In [None]:
start_date = ((datetime.datetime.utcnow() - relativedelta(days=15))).timestamp()
end_date = datetime.datetime.utcnow().timestamp()
last_15_days_data = get_post_data(1604881881, 1606100329, 'gadgets')

In [None]:
df = pd.DataFrame(last_15_days_data)
df['flair'].value_counts()

In [298]:
df = pd.read_csv('gadges_posts.csv')
df.drop(['selftext'], axis=1, inplace=True)
df.dropna(inplace=True)
df['flair'].value_counts()

VR / AR                 97
Phones                  30
Desktops / Laptops      23
Computer peripherals    14
Medical                 11
Transportation          11
Wearables               11
Home                    10
Gaming                  10
Discussion               9
Music                    7
Watches                  6
Rule 2                   5
TV / Projectors          5
Cameras                  4
Misc                     4
Homemade                 3
Not A Gadget             2
Phone Accessories        2
locked                   1
Rule 5                   1
*SMS-based               1
Drones / UAVs            1
Blogspam                 1
Tablets                  1
Name: flair, dtype: int64

## Data Cleansing

In [299]:
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer
import regex as re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('words')
from nltk.corpus import words

Word = list(set(words.words()))

df = df[df['title'].str.contains('|'.join(Word))]

df.head()

[nltk_data] Downloading package words to C:\Users\Aman
[nltk_data]     Riat\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Unnamed: 0,date,id,flair,title
0,1604891928,jqpvp6,Not A Gadget,Maintenance Robot Walks on a Wind Turbine's Bl...
1,1604896733,jqr3bg,Rule 2,"Back here with, affordable smart watches in th..."
2,1604897235,jqr7io,Music,iFi NEO iDSD: New HiRes Professional Headphone...
3,1604899565,jqrr2a,Phones,moto G8 power lite launched in India | Specifi...
4,1604900214,jqrwa4,Phones,moto G8 power lite launched in India | Specifi...


In [300]:
targets= list(df['flair'].value_counts()[0:5].index)
target_df = df[df['flair'].isin(targets)]
non_target_df = df[~df['flair'].isin(targets)]

In [303]:
#Flair with value 5 means other
from sklearn.preprocessing import LabelEncoder  

le = LabelEncoder()
target_df.loc[:,'flair'] = le.fit_transform(target_df['flair'])
non_target_df.loc[:, 'flair'] = 5
le.inverse_transform([0,1,2,3,4])

array([0, 1, 2, 3, 4], dtype=int64)

In [304]:
#Comparing data split
print(len(target_df), len(non_target_df))

173 91


In [305]:
df = pd.concat([target_df, non_target_df], axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True)

In [306]:
#Tokenize the titles
def tokenize(x):
    if not x:
        x = ''
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)
df.loc[:,'tokens'] = target_df['title'].map(tokenize)

In [307]:
df.dropna(inplace=True)

In [308]:
#Lemming and stemming to reduce similar based words to a common word
nltk.download('wordnet')
def stemmer(x):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in x])
 
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])
df['lemma'] = df['tokens'].map(lemmatize)
df['stems'] = df['tokens'].map(stemmer)

[nltk_data] Downloading package wordnet to C:\Users\Aman
[nltk_data]     Riat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Model Training

In [309]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [310]:
X = df['lemma']
y = df['flair']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13)

In [311]:
mnb_pipe = Pipeline(steps = [('tf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [312]:
param_grid = {
 'tf__max_features' : [1000, 2000, 3000],
 'tf__stop_words' : ['english', None],
 'tf__ngram_range' : [(1,1),(1,2)],
 'tf__use_idf' : [True, False],
 'mnb__alpha' : [0.1, 0.5, 1]
}

In [313]:
gscv_mnb = GridSearchCV(pipe_mnnb, param_grid,cv=5,n_jobs=-1)
gscv_mnb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': [0.1, 0.5, 1],
                         'tf__max_features': [1000, 2000, 3000],
                         'tf__ngram_range': [(1, 1), (1, 2)],
                         'tf__stop_words': ['english', None],
                         'tf__use_idf': [True, False]})

In [314]:
y_pred = gs_mnnb.predict(X_test)
gscv_mnb.score(X_test, y_test)

0.8181818181818182

In [315]:
gscv_mnb.best_params_

{'mnb__alpha': 0.1,
 'tf__max_features': 1000,
 'tf__ngram_range': (1, 1),
 'tf__stop_words': 'english',
 'tf__use_idf': False}

In [316]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 0,  2,  0,  0,  1],
       [ 0,  3,  0,  0,  0],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  0,  5,  2],
       [ 0,  0,  0,  0, 28]], dtype=int64)

# Geegees Post Classification