In [1]:
# Imports
import pandas as pd
import numpy as np
from IPython.display import display

import nltk
from nltk.corpus import stopwords
import re
import string

# Fetch the NLTK resources used below (stop-word list and WordNet data)
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# Instantiate stopWords (English stop-word list from the NLTK corpus)
stopWords = stopwords.words("english")

# Instantiate wordnet lemmatizer (shared by the lemmatize() helper)
wn = nltk.WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
"""
Desc: Lemmatization
Input: text (string) - text to be lemmatized
Output: text (string) - lemmatized text
"""
def lemmatize(text):
    """
    Desc: Lemmatize every space-separated token of a string
    Input: text (string) - text to be lemmatized; any non-string value
           (e.g. None or a float NaN standing in for a missing post)
           is returned unchanged
    Output: text (string) - lemmatized tokens joined back with single spaces
    """
    # pandas represents missing text as float NaN, which has no .split();
    # pass such values through instead of raising inside Series.apply
    if not isinstance(text, str):
        return text

    # Split on a literal space (not on arbitrary whitespace) so the token
    # layout of the input is kept exactly, then lemmatize token by token
    return ' '.join(wn.lemmatize(word) for word in text.split(' '))

In [3]:
# Load data (low_memory=False makes pandas read the file in one pass
# instead of chunk-by-chunk dtype inference)
df = pd.read_csv(
    "./preproc_data/clean_text_all_subreddits.csv",
    low_memory=False,
)

# Check: preview the first rows
display(df.head())

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,post_length,clean_text
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,3021,recently diagnosed need talk others diagnosed ...
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,751,really annoyed familys drunk friends family ho...
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,1713,medication journey current disappointment wan...
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,988,wearables rem sleep detected gaming sleeping h...
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,123,picking friend carpool nye party texted way dr...


In [5]:
# Text Preprocessing (Keep stopwords, don't lemmatize)
# Ordered (pattern, replacement) regex steps. Order matters: urls must be
# removed before punctuation is stripped, and whitespace is collapsed last.
kept_steps = [
    # Remove urls. The scheme is matched explicitly; the previous pattern
    # r'http*\S+' only quantified the final "p" and therefore deleted any
    # token fragment starting with "htt".
    (r'https?://\S+', ' '),
    # Remove multi-character symbols (\n, \t, \r)
    (r'[\n\r\t]', ' '),
    # Remove numeric values
    (r'[0-9]', ' '),
    # Reduce repeated letters: any character repeated 3+ times is cut to 2
    (r'(.)\1{2,}', r'\1\1'),
    # Remove punctuation
    ('[%s]' % re.escape(string.punctuation), ' '),
    # Remove extra blank spaces
    (r'\s{2,}', ' '),
]

# Lower case
kept_text = df["text"].str.lower()

# Remove unicode characters (emojis, etc.) by dropping every non-ascii byte
kept_text = kept_text.str.encode('ascii', 'ignore').str.decode('utf-8')

for pattern, replacement in kept_steps:
    kept_text = kept_text.str.replace(pattern, replacement, regex=True)

# Strip the single blank that the replacements above can leave at either end
df["stopwords_kept"] = kept_text.str.strip()

# Check
df.head()

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,post_length,clean_text,stopwords_kept
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,3021,recently diagnosed need talk others diagnosed ...,recently diagnosed need to talk to others who ...
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,751,really annoyed familys drunk friends family ho...,really annoyed at my familys drunk friends so ...
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,1713,medication journey current disappointment wan...,the medication journey a current disappointmen...
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,988,wearables rem sleep detected gaming sleeping h...,wearables rem sleep detected while gaming not ...
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,123,picking friend carpool nye party texted way dr...,picking a friend up to carpool to a nye party ...


In [6]:
# Text Preprocessing (Remove stop words, don't lemmatize)
# Stop-word pattern: built once and reused for both removal passes.
# Every word is escaped so it can never be interpreted as regex syntax.
pat = r'\b(?:{})\b'.format('|'.join(re.escape(word) for word in stopWords))

# Ordered (pattern, replacement) regex steps. Order matters: urls must be
# removed before punctuation is stripped, and whitespace is collapsed last.
removed_steps = [
    # Remove urls. The scheme is matched explicitly; the previous pattern
    # r'http*\S+' only quantified the final "p" and therefore deleted any
    # token fragment starting with "htt".
    (r'https?://\S+', ' '),
    # Remove multi-character symbols (\n, \t, \r)
    (r'[\n\r\t]', ' '),
    # Remove numeric values
    (r'[0-9]', ' '),
    # Reduce repeated letters: any character repeated 3+ times is cut to 2
    (r'(.)\1{2,}', r'\1\1'),
    # Remove stop words
    (pat, ''),
    # Remove punctuation
    ('[%s]' % re.escape(string.punctuation), ' '),
    # Remove stop words again (in case stop word was next to punctuation)
    (pat, ''),
    # Remove extra blank spaces
    (r'\s{2,}', ' '),
]

# Lower case
removed_text = df["text"].str.lower()

# Remove unicode characters (emojis, etc.) by dropping every non-ascii byte
removed_text = removed_text.str.encode('ascii', 'ignore').str.decode('utf-8')

for pattern, replacement in removed_steps:
    removed_text = removed_text.str.replace(pattern, replacement, regex=True)

# Strip the blank left at either end (e.g. when the first word was a stop
# word); otherwise splitting on ' ' later yields empty tokens
df["stopwords_removed"] = removed_text.str.strip()

# Check
df.head()

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,post_length,clean_text,stopwords_kept,stopwords_removed
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,3021,recently diagnosed need talk others diagnosed ...,recently diagnosed need to talk to others who ...,recently diagnosed need talk others diagnosed ...
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,751,really annoyed familys drunk friends family ho...,really annoyed at my familys drunk friends so ...,really annoyed familys drunk friends family ho...
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,1713,medication journey current disappointment wan...,the medication journey a current disappointmen...,medication journey current disappointment wan...
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,988,wearables rem sleep detected gaming sleeping h...,wearables rem sleep detected while gaming not ...,wearables rem sleep detected gaming sleeping h...
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,123,picking friend carpool nye party texted way dr...,picking a friend up to carpool to a nye party ...,picking friend carpool nye party texted way dr...


In [7]:
# lemmatize: run the lemmatize() helper over every stop-word-free post
df["lemmatized"] = df["stopwords_removed"].map(lemmatize)

# Save to csv (without the row index)
# NOTE(review): this is the same path the data was loaded from, so the
# input file is overwritten in place - confirm that this is intended.
df.to_csv("./preproc_data/clean_text_all_subreddits.csv", index=False)

In [8]:
# Drop clean text column
# errors="ignore": the final cell of this notebook rewrites the csv it was
# loaded from without this column, so on a second full run the column is
# already gone; skip it quietly instead of raising KeyError.
df = df.drop(columns=["clean_text"], errors="ignore")

In [10]:
# Load labels: only the post id and its label are read from the file
labels = pd.read_csv(
    "./preproc_data/clean_text_all_subreddits_d2v.csv",
    usecols=["id", "label"],
    low_memory=False,
)

# Check: preview the first rows
display(labels.head())

Unnamed: 0,id,label
0,abd11x,0
1,abd7q9,0
2,abda0t,0
3,abdd13,0
4,abdj4w,0


In [11]:
# Merge with df
# Join explicitly on the post id. Without `on`, pandas joins on every column
# the two frames happen to share, so the key silently changes as soon as df
# gains another column that also exists in labels.
# A "label" column already present in df (the csv saved at the end of this
# notebook contains one) is dropped first so the merge always yields a single
# "label" column rather than label_x / label_y.
df = df.drop(columns=["label"], errors="ignore").merge(labels, on="id", how="left")

# Display
display(df.head())

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,post_length,stopwords_kept,stopwords_removed,lemmatized,label
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,3021,recently diagnosed need to talk to others who ...,recently diagnosed need talk others diagnosed ...,recently diagnosed need talk others diagnosed ...,0.0
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,751,really annoyed at my familys drunk friends so ...,really annoyed familys drunk friends family ho...,really annoyed family drunk friend family host...,0.0
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,1713,the medication journey a current disappointmen...,medication journey current disappointment wan...,medication journey current disappointment wan...,0.0
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,988,wearables rem sleep detected while gaming not ...,wearables rem sleep detected gaming sleeping h...,wearable rem sleep detected gaming sleeping hy...,0.0
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,123,picking a friend up to carpool to a nye party ...,picking friend carpool nye party texted way dr...,picking friend carpool nye party texted way dr...,0.0


In [13]:
# Drop NaNs in label (posts that found no match in the labels file)
df = df.dropna(subset=["label"])

# Reset index
# drop=True discards the old row labels. Without it they are inserted as an
# extra "index" column, which is then written to the csv below and piles up
# ("index", "level_0", ...) every time the notebook is run again.
df = df.reset_index(drop=True)

#  Change label to int (the left merge made it float because of the NaNs)
df["label"] = df["label"].astype(int)

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits.csv", index=False)