In [114]:
import requests
import praw
import datetime
from dateutil.relativedelta import relativedelta
import concurrent.futures
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

## Data Collection

In [3]:
reddit = praw.Reddit("bot1")

In [101]:
def get_post_data_for_range(start_date, end_date, subreddit):
    query = f'https://api.pushshift.io/reddit/search/submission/?subreddit={subreddit}&size=500&before={end_date}&after={start_date}'
    post_data = requests.get(query)
    return post_data.json()['data']
    
def filter_submission(total_submission_dict, post):
    submission = reddit.submission(id=post['id'])
    if (submission.link_flair_text is not None):
        total_submission_dict['date'].append(post['created_utc'])
        total_submission_dict['id'].append(post['id'])
        total_submission_dict['flair'].append(submission.link_flair_text)
        total_submission_dict['title'].append(post['title'])
        total_submission_dict['selftext'].append(post['selftext'])

def get_post_data(start_date, end_date, subreddit):
    total_submission_dict = {
        'date': [], 
        'id': [],
        'flair': [],
        'title': [],
        'selftext': []
    }
    
    post_data = get_post_data_for_range(start_date, end_date, subreddit)
    while len(post_data) > 0:
        for submission in post_data:
            filter_submission(total_submission_dict, submission)

        start_date = post_data[-1]['created_utc']
        post_data = get_post_data_for_range(start_date, end_date, subreddit)
    return total_submission_dict
    

# Geegees Post Classification

In [None]:
start_date = ((datetime.datetime.utcnow() - relativedelta(years=1)))
dates = [int((start_date + relativedelta(months=x)).timestamp()) for x in range(12)]
with concurrent.futures.ThreadPoolExecutor() as executor:
    threads= [executor.submit(get_post_data, dates[month], dates[month+1], 'geegees') for month in range(0,len(dates),2)]
    post_data_per_month = [thread.result() for thread in threads]

In [169]:
#Data was collected from cell above but at an earlier date
df = pd.read_csv('geegees_data.csv')
df.drop(['selftext'], axis=1, inplace=True)
df.dropna(inplace=True)
df.head()

Unnamed: 0,date,id,flair,title
0,1576089234,e9b0ez,Image/Screenshot,"campus blues, good luck on your finals 🙌🏼"
1,1576487097,ebcs7g,Shitpost,A few more days everyone.
2,1576507658,ebg18x,Discussion,Prof locked the door on an exam
3,1576559386,ebrb2u,Shitpost,That time of the year again
4,1580349688,evxpb0,Image/Screenshot,"There, fixed it after the recent scientology a..."


## Data Cleansing

In [170]:
from nltk.tokenize import RegexpTokenizer
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer
import regex as re
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('words')
from nltk.corpus import words

#Making sure titles are in english
Word = list(set(words.words()))
df = df[df['title'].str.contains('|'.join(Word))]

df.head()


[nltk_data] Downloading package words to C:\Users\Aman
[nltk_data]     Riat\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


Unnamed: 0,date,id,flair,title
0,1576089234,e9b0ez,Image/Screenshot,"campus blues, good luck on your finals 🙌🏼"
1,1576487097,ebcs7g,Shitpost,A few more days everyone.
2,1576507658,ebg18x,Discussion,Prof locked the door on an exam
3,1576559386,ebrb2u,Shitpost,That time of the year again
4,1580349688,evxpb0,Image/Screenshot,"There, fixed it after the recent scientology a..."


In [171]:
df['flair'] = (df['flair'] == 'Shitpost').astype(int)
df['flair'].value_counts()

0    353
1     53
Name: flair, dtype: int64

### Resample to remove bias towards non target class

In [172]:
count_class_0, count_class_1 = df['flair'].value_counts()

# Divide by class
df_class_0 = df[df['flair'] == 0]
df_class_1 = df[df['flair'] == 1]
df_class_0 = df_class_0[0:93]
df = pd.concat([df_class_0, df_class_1], axis=0)

df['flair'].value_counts()

0    93
1    53
Name: flair, dtype: int64

In [173]:
#Tokenize the titles
def tokenize(x):
    if not x:
        x = ''
    tokenizer = RegexpTokenizer(r'\w+')
    return tokenizer.tokenize(x)
df.loc[:,'tokens'] = df['title'].map(tokenize)

In [174]:
#Lemming and stemming to reduce similar based words to a common word
nltk.download('wordnet')
def stemmer(x):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in x])
 
def lemmatize(x):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in x])

[nltk_data] Downloading package wordnet to C:\Users\Aman
[nltk_data]     Riat\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [175]:
df['lemma'] = df['tokens'].map(lemmatize)
df['stems'] = df['tokens'].map(stemmer)

## Model Training

In [176]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [177]:
X = df['lemma']
y = df['flair']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 25)

In [178]:
bnb_pipe = Pipeline(steps = [('tf', TfidfVectorizer()), ('mnb', BernoulliNB())])

In [179]:
param_grid = {
 'tf__max_features' : [1000, 2000, 3000],
 'tf__stop_words' : ['english', None],
 'tf__ngram_range' : [(1,1),(1,2)],
 'tf__use_idf' : [True, False],
 'mnb__alpha' : [0.1, 0.5, 1]
}

In [180]:
gscv_mnb = GridSearchCV(bnb_pipe, param_grid,cv=5,n_jobs=-1)
gscv_mnb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('mnb', BernoulliNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': [0.1, 0.5, 1],
                         'tf__max_features': [1000, 2000, 3000],
                         'tf__ngram_range': [(1, 1), (1, 2)],
                         'tf__stop_words': ['english', None],
                         'tf__use_idf': [True, False]})

#### Not terrible but could be better

In [186]:
y_pred = gscv_mnb.predict(X_test)
gscv_mnb.score(X_test, y_test)

0.7027027027027027

In [187]:
gscv_mnb.best_params_

{'mnb__alpha': 0.5,
 'tf__max_features': 1000,
 'tf__ngram_range': (1, 2),
 'tf__stop_words': 'english',
 'tf__use_idf': True}

#### A lot of false negatives

In [189]:
metrics.confusion_matrix(y_test, y_pred)

array([[24,  0],
       [11,  2]], dtype=int64)

## Transformer Model Test

I'll be trying to use a transformer model in this section. Although I don't expect it to work well (due to the lack of data), why not try it.

In [94]:
df = pd.read_csv('geegees_data.csv')
df.drop(['selftext'], axis=1, inplace=True)
df.dropna(inplace=True)
df = df[df['title'].str.contains('|'.join(Word))]
df['flair'] = (df['flair'] == 'Shitpost').astype(int)

In [95]:

count_class_0, count_class_1 = df['flair'].value_counts()

# Divide by class
df_class_0 = df[df['flair'] == 0]
df_class_1 = df[df['flair'] == 1]
df_class_0 = df_class_0[0:93]
df = pd.concat([df_class_0, df_class_1], axis=0)

In [96]:
transformer_df= pd.DataFrame({
    'text': df['title'].replace(r'\n', ' ', regex=True),
    'label':df['flair']
})

In [97]:
train, test = train_test_split(transformer_df, test_size=0.2)

In [98]:
from simpletransformers.classification import ClassificationModel


# Create a TransformerModel
model = ClassificationModel('roberta', 'roberta-base', use_cuda=False, args={'overwrite_output_dir': True})

# Train the model
model.train_model(train)

# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test, acc=accuracy_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=116.0), HTML(value='')))




HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=15.0), HTML(value='')))











HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30.0), HTML(value='')))




HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=4.0), HTML(value='')))




  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


##### Not great as expected

In [190]:
result

{'mcc': 0.0,
 'tp': 0,
 'tn': 17,
 'fp': 0,
 'fn': 13,
 'acc': 0.5666666666666667,
 'eval_loss': 0.6821510642766953}

# Gadgets Post Classification

Data collected for the last 15 days due to large amount of data that is usually posted within a year

In [None]:
start_date = ((datetime.datetime.utcnow() - relativedelta(days=15))).timestamp()
end_date = datetime.datetime.utcnow().timestamp()
last_15_days_data = get_post_data(1604881881, 1606100329, 'gadgets')

In [None]:
df = pd.DataFrame(last_15_days_data)

In [116]:
df = pd.read_csv('gadgets_data.csv')
df['flair'].value_counts()

VR / AR                 97
Phones                  30
Desktops / Laptops      23
Computer peripherals    14
Wearables               11
Medical                 11
Transportation          11
Gaming                  10
Home                    10
Discussion               9
Music                    7
Watches                  6
Rule 2                   5
TV / Projectors          5
Cameras                  4
Misc                     4
Homemade                 3
Not A Gadget             2
Phone Accessories        2
Blogspam                 1
*SMS-based               1
Drones / UAVs            1
Rule 5                   1
Tablets                  1
locked                   1
Name: flair, dtype: int64

In [117]:
df = pd.read_csv('gadgets_data.csv')
df.drop(['selftext'], axis=1, inplace=True)
df.dropna(inplace=True)
df['flair'].value_counts()

VR / AR                 97
Phones                  30
Desktops / Laptops      23
Computer peripherals    14
Wearables               11
Medical                 11
Transportation          11
Gaming                  10
Home                    10
Discussion               9
Music                    7
Watches                  6
Rule 2                   5
TV / Projectors          5
Cameras                  4
Misc                     4
Homemade                 3
Not A Gadget             2
Phone Accessories        2
Blogspam                 1
*SMS-based               1
Drones / UAVs            1
Rule 5                   1
Tablets                  1
locked                   1
Name: flair, dtype: int64

## Data Cleansing

In [118]:
Word = list(set(words.words()))
df = df[df['title'].str.contains('|'.join(Word))]

df.head()

Unnamed: 0,date,id,flair,title
0,1604891928,jqpvp6,Not A Gadget,Maintenance Robot Walks on a Wind Turbine's Bl...
1,1604896733,jqr3bg,Rule 2,"Back here with, affordable smart watches in th..."
2,1604897235,jqr7io,Music,iFi NEO iDSD: New HiRes Professional Headphone...
3,1604899565,jqrr2a,Phones,moto G8 power lite launched in India | Specifi...
4,1604900214,jqrwa4,Phones,moto G8 power lite launched in India | Specifi...


In [119]:
targets= list(df['flair'].value_counts()[0:5].index)
target_df = df[df['flair'].isin(targets)]
non_target_df = df[~df['flair'].isin(targets)]

In [120]:
#Flair with value 5 means other
from sklearn.preprocessing import LabelEncoder  

le = LabelEncoder()
target_df.loc[:,'flair'] = le.fit_transform(target_df['flair'])
non_target_df.loc[:, 'flair'] = 5
le.inverse_transform([0,1,2,3,4])

array(['Computer peripherals', 'Desktops / Laptops', 'Medical', 'Phones',
       'VR / AR'], dtype=object)

In [110]:
#Comparing data split
print(len(target_df), len(non_target_df))

173 91


In [111]:
df = pd.concat([target_df, non_target_df], axis=0, join='outer', ignore_index=False, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True)

In [112]:
#Lemming and stemming to reduce similar based words to a common word
df.loc[:,'tokens'] = target_df['title'].map(tokenize)
df.dropna(inplace=True)
df['lemma'] = df['tokens'].map(lemmatize)
df['stems'] = df['tokens'].map(stemmer)

## Model Training

In [113]:
from sklearn.naive_bayes import MultinomialNB
X = df['lemma']
y = df['flair']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 13)

In [311]:
mnb_pipe = Pipeline(steps = [('tf', TfidfVectorizer()), ('mnb', MultinomialNB())])

In [312]:
param_grid = {
 'tf__max_features' : [1000, 2000, 3000],
 'tf__stop_words' : ['english', None],
 'tf__ngram_range' : [(1,1),(1,2)],
 'tf__use_idf' : [True, False],
 'mnb__alpha' : [0.1, 0.5, 1]
}

In [313]:
gscv_mnb = GridSearchCV(pipe_mnnb, param_grid,cv=5,n_jobs=-1)
gscv_mnb.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('tf', TfidfVectorizer()),
                                       ('mnb', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'mnb__alpha': [0.1, 0.5, 1],
                         'tf__max_features': [1000, 2000, 3000],
                         'tf__ngram_range': [(1, 1), (1, 2)],
                         'tf__stop_words': ['english', None],
                         'tf__use_idf': [True, False]})

In [314]:
y_pred = gs_mnnb.predict(X_test)
gscv_mnb.score(X_test, y_test)

0.8181818181818182

In [315]:
gscv_mnb.best_params_

{'mnb__alpha': 0.1,
 'tf__max_features': 1000,
 'tf__ngram_range': (1, 1),
 'tf__stop_words': 'english',
 'tf__use_idf': False}

In [316]:
metrics.confusion_matrix(y_test, y_pred)

array([[ 0,  2,  0,  0,  1],
       [ 0,  3,  0,  0,  0],
       [ 0,  0,  0,  1,  2],
       [ 0,  0,  0,  5,  2],
       [ 0,  0,  0,  0, 28]], dtype=int64)