### Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## preprocessing - NLP
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

## workflow
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

## models
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

## metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score, recall_score, precision_score, f1_score, confusion_matrix

In [2]:
# Execute the shared workflow notebook so the helpers it defines (e.g. na_only) are available here.
# NOTE(review): `%run` has no import syntax; the trailing "import na_only, ..." tokens are treated
# as extra arguments to the run, not as a name filter — confirm whether they can be dropped.
%run 00_Workflow_Functions.ipynb import na_only, api_call, data_wrangling

In [3]:
# Load the saved submissions CSV and report its (rows, columns).
submissions_path = '../datasets/submissions_data.csv'
subs = pd.read_csv(submissions_path)
subs.shape

(2178, 10)

In [4]:
# Peek at the first rows (head() shows 5 rows by default).
subs.head()

Unnamed: 0,author,author_fullname,created_utc,selftext,title,subreddit,is_video,num_comments,score,upvote_ratio
0,Cutesifer_101,t2_7zsy6pvt,1651176966,The damage wasn’t that bad other than the fact...,I spilled monster energy,lifehacks,False,0,1,1.0
1,Subtotalpoet,t2_dcvoz53,1651176676,,"Wife forgot ur favorite ice cream? Improvise, ...",lifehacks,False,0,1,1.0
2,amintowords,t2_racie,1651175651,Set an alarm for an hour after you're meant to...,How to remember to take tablets on time,lifehacks,False,0,1,1.0
3,rokokslot87,t2_kv1oregh,1651174857,,SLOT ONLINE MENANG BESAR | SLOT DEPOSIT PULSA,lifehacks,False,0,1,1.0
4,Distinct_Expert_7648,t2_mczxpj82,1651174438,[removed],Infertility Clinic in Pune,lifehacks,False,0,1,1.0


In [5]:
# Peek at the last rows (tail() shows 5 rows by default).
subs.tail()

Unnamed: 0,author,author_fullname,created_utc,selftext,title,subreddit,is_video,num_comments,score,upvote_ratio
2173,Bloodstone2012,t2_jt1jo,1650646779,,LPT: it is healthier to skip a meal and just d...,LifeProTips,False,1,1,1.0
2174,cross_peach,t2_30cjlc75,1650646116,,LPT: You can access several online courses and...,LifeProTips,False,1,1,1.0
2175,jmincorporated,t2_16tlrj,1650645631,,LPT - when you receive a medical bill always c...,LifeProTips,False,1,1,1.0
2176,the_women_era,t2_m8qtr3lc,1650645336,[removed],People Will Always Judge Your Actions. No Matt...,LifeProTips,False,1,1,1.0
2177,patchaclus,t2_9411lb63,1650645171,,LPT: you can set rules on your email to redire...,LifeProTips,False,1,1,1.0


In [6]:
# Column dtypes and non-null counts; used to spot columns with missing values.
subs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2178 entries, 0 to 2177
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   author           2178 non-null   object 
 1   author_fullname  2178 non-null   object 
 2   created_utc      2178 non-null   int64  
 3   selftext         1298 non-null   object 
 4   title            2178 non-null   object 
 5   subreddit        2178 non-null   object 
 6   is_video         2178 non-null   bool   
 7   num_comments     2178 non-null   int64  
 8   score            2178 non-null   int64  
 9   upvote_ratio     2178 non-null   float64
dtypes: bool(1), float64(1), int64(3), object(5)
memory usage: 155.4+ KB


In [7]:
# na_only is a helper loaded from the workflow notebook; judging by its output it reports
# NA counts only for columns that contain NAs — TODO confirm against its definition.
na_only(subs)

selftext    880
dtype: int64

When we wrangled the data, no NAs were reported. However, after exporting the data to CSV and re-importing it, NAs are now present in `selftext` — most likely because empty strings are read back as NaN by `pd.read_csv`. Also, there appears to be a particular value, `[removed]`, that indicates a submission was deleted and its text redacted. We will need to drop all of this data for NLP, as it's not useful for us.

### Dropping erroneous data

In addition to `[removed]`, let's remove selftext posts that are fewer than 7 words long (a subjective choice). We do this because a post of only a few words may indicate that the post is referencing a multimedia file or hyperlink.

In [54]:
# Collect [row position, text] pairs for posts whose selftext has fewer than
# MIN_WORDS whitespace-separated words, so we can inspect what would be dropped.
# Note: np.array stores both columns as strings because the pairs mix int and str.
MIN_WORDS = 7  # subjective cut-off for "too short to be a real text post"
text_filter = np.array([
    [i, text]
    for i, text in enumerate(subs['selftext'])
    # selftext still contains NaN (a float) at this point in the notebook; NaN has no
    # .split(), so skip anything that is not a string instead of raising AttributeError.
    if isinstance(text, str) and len(text.split()) < MIN_WORDS
])

In [55]:
# Tally how often each short selftext occurs (column 1 of text_filter holds the text).
short_texts = text_filter[:, 1]
pd.DataFrame(short_texts).value_counts()

[removed]                                                             716
Looking at you google fi                                                2
/storage/emulated/0/Download/IMG_20220327_220539366_103259.jpg          1
That’s where the juicy stuff is.                                        1
https://www.youtube.com/watch?v=Qwscb3QIVSg?t=                          1
[https://youtu.be/Ho53oi4nUBQ](https://youtu.be/Ho53oi4nUBQ)            1
[body odor blog](https://blogsbyq.com/2022/04/16/onion-body-odor/)      1
Use a little lip balm                                                   1
Trying to read small print                                              1
Source: I work in hotels                                                1
Go watch this https://youtube.com/c/ThatVeganTeacherYouTube             1
See above                                                               1
Reduced falls = increased longevity.                                    1
Ladies love it.                       

In [7]:
# Share of posts per subreddit before any rows are dropped.
subreddit_labels = subs['subreddit']
subreddit_labels.value_counts(normalize=True)

lifehacks      0.502755
LifeProTips    0.497245
Name: subreddit, dtype: float64

Prior to dropping NAs, we have a nearly perfectly balanced proportion between our two classes.

In [8]:
# Drop posts with no selftext. Restricting dropna to that column keeps a missing
# value in any other column from silently removing otherwise usable rows.
# NOTE(review): `[removed]` placeholders are strings, not NaN, so they are still
# present after this step.
subs = subs.dropna(subset=['selftext'])
subs.shape

(1298, 10)

In [9]:
# Share of posts per subreddit among the rows that remain.
remaining_labels = subs['subreddit']
remaining_labels.value_counts(normalize=True)

LifeProTips    0.644838
lifehacks      0.355162
Name: subreddit, dtype: float64

After dropping NAs, `LifeProTips` has become our majority class.

### Train and Test Splits

In [10]:
# Features: the post body only (kept as a one-column DataFrame). Target: the subreddit name.
X = subs[['selftext']]
y = subs['subreddit']

# Stratify on y so both splits keep the class proportions; the seed is fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=14,
)

In [11]:
# Sizes of the train and test feature frames.
X_train_shape, X_test_shape = X_train.shape, X_test.shape
print(X_train_shape, X_test_shape)

(973, 1) (325, 1)


In [12]:
# Sizes of the train and test label vectors (should match the feature frames row-for-row).
y_train_shape, y_test_shape = y_train.shape, y_test.shape
print(y_train_shape, y_test_shape)

(973,) (325,)


In [13]:
# Class proportions in the test split, to compare against the full data set.
test_class_share = y_test.value_counts(normalize=True)
test_class_share

LifeProTips    0.646154
lifehacks      0.353846
Name: subreddit, dtype: float64

Our stratification worked well.

### Preprocessing

Transforming our response into labels of 1 and 0, where 1 is `lifehacks` and 0 is `LifeProTips`.

In [14]:
# Encoder that maps the subreddit names to integer class labels.
le = LabelEncoder()

In [15]:
# Learn the class-to-integer mapping from the training labels only,
# then apply that same mapping to both splits.
le.fit(y_train)
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [16]:
# Proportion of each encoded class in the training labels.
encoded_train_labels = pd.DataFrame(y_train_encoded)
encoded_train_labels.value_counts(normalize=True)

0    0.644399
1    0.355601
dtype: float64

In [17]:
# Check the training features for missing values (na_only is loaded from the workflow notebook).
na_only(X_train)

0

In [18]:
# Check the test features for missing values (na_only is loaded from the workflow notebook).
na_only(X_test)

0

No NAs present

In [36]:
# Defensive clean-up: with no NAs present (as checked above) these calls are no-ops.
X_train = X_train.dropna()
X_test = X_test.dropna()

# Rows are dropped from X only; if anything were actually removed, X would no longer
# line up with the label vectors, so fail loudly instead of training on misaligned data.
assert len(X_train) == len(y_train_encoded), "X_train and y_train_encoded are misaligned"
assert len(X_test) == len(y_test_encoded), "X_test and y_test_encoded are misaligned"

In [37]:
# Re-check the feature frame sizes after the dropna step.
X_train_shape, X_test_shape = X_train.shape, X_test.shape
print(X_train_shape, X_test_shape)

(973, 1) (325, 1)
