In [1]:
# Imports
import pandas as pd
import numpy as np
import os
from IPython.display import display

# Text preprocessing
import nltk
from nltk.corpus import stopwords
import re
import string
nltk.download('stopwords')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
"""
Desc: Text preprocessing of social media text using 
      typical text cleaning methods such as lowercasing, removal of mentions, etc.
Input: text (string) - text to be cleaned
Output: text (string) - cleaned text
"""
def prepareText(text):
    
    # Instantiate stopWords
    #stopWords = stopwords.words("english")
    
    # Instantiate wordnet lemmatizer
    #wn = nltk.WordNetLemmatizer()
    
    # Lower case
    text = text.lower()

    # Remove all stop words
    #text = ' '.join([word for word in text.split(' ') if word not in stopWords])

    # Remove unicode characters (emojis, etc.)
    text = text.encode('ascii', 'ignore').decode()

    # Remove urls
    text = re.sub(r'http*\S+', ' ', text)

    # Remove multi-character symbols (\n, \t, \r)
    text = re.sub(r'[\n\r\t]', ' ', text)

    # Remove numeric values
    text = re.sub(r'[0-9]', ' ', text)

    # Remove punctuation
    #text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Remove extra blank spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Lemmatize the text
    #text = ' '.join([wn.lemmatize(word) for word in text.split(' ')])
    
    return text

In [3]:
"""
Desc: Text preprocessing of social media text using 
      typical text cleaning methods such as lowercasing, removal of mentions, etc.
Input: text (string) - text to be cleaned
Output: text (string) - cleaned text
"""
def prepareText2(text):
    
    # Instantiate stopWords
    stopWords = stopwords.words("english")
    
    # Instantiate wordnet lemmatizer
    wn = nltk.WordNetLemmatizer()
    
    # Lower case
    text = text.lower()

    # Remove all stop words
    text = ' '.join([word for word in text.split(' ') if word not in stopWords])

    # Remove unicode characters (emojis, etc.)
    text = text.encode('ascii', 'ignore').decode()

    # Remove urls
    text = re.sub(r'http*\S+', ' ', text)

    # Remove multi-character symbols (\n, \t, \r)
    text = re.sub(r'[\n\r\t]', ' ', text)

    # Remove numeric values
    text = re.sub(r'[0-9]', ' ', text)

    # Remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Remove extra blank spaces
    text = re.sub(r'\s{2,}', ' ', text)

    # Lemmatize the text
    text = ' '.join([wn.lemmatize(word) for word in text.split(' ')])
    
    return text

In [4]:
# If the preproc_data forlder does not exist, create it
if not os.path.exists("./preproc_data/"):
    os.mkdir("./preproc_data/")

# Read the CSV files into a list
try:
    # List of csv files
    csvs = [f.name for f in os.scandir("./clean_data/") if f.name.endswith(".csv")]
    
    # Remove hidden directories
    csvs = [f for f in csvs if not f.startswith('.')]
    
    # Append directory as prefix to strings in list
    csvs = ['./clean_data/' + f for f in csvs]
    
    print(csvs)
except:
    print("The clean_data folder does not exist")

['./clean_data/clean_ADHD_2019_submission_data.csv', './clean_data/clean_ADHD_2020_submission_data.csv', './clean_data/clean_Anxiety_2019_submission_data.csv', './clean_data/clean_Anxiety_2020_submission_data.csv', './clean_data/clean_depression_help_2020_submission_data.csv', './clean_data/clean_mentalhealth_L3YR_submission_data.csv', './clean_data/clean_overcoming_2020_submission_data.csv', './clean_data/clean_sad_2020_submission_data.csv']


In [5]:
# Empty list to store dfs
files = []

# Apply basic preprocessing to each csv file
for c in csvs:
    print(c)
    files.append(pd.read_csv(c))

# Merge all files into one df
df = pd.concat(files, ignore_index=True)

df

./clean_data/clean_ADHD_2019_submission_data.csv
./clean_data/clean_ADHD_2020_submission_data.csv
./clean_data/clean_Anxiety_2019_submission_data.csv
./clean_data/clean_Anxiety_2020_submission_data.csv
./clean_data/clean_depression_help_2020_submission_data.csv
./clean_data/clean_mentalhealth_L3YR_submission_data.csv
./clean_data/clean_overcoming_2020_submission_data.csv
./clean_data/clean_sad_2020_submission_data.csv


Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,
...,...,...,...,...,...,...,...,...
492162,sad,31/12/2020,RussianPower69,ko0jbp,3.0,1,Me sad Me has no one to talk to on New Year,Depression/Sadness
492163,sad,31/12/2020,prettygirlolivia,ko0lec,8.0,1,Worst New Years Eve Ever My depression has hit...,
492164,sad,31/12/2020,DirtyLizard0032,ko0rtl,2.0,1,Check out my sad song,Depression/Sadness
492165,sad,31/12/2020,Music-SunsetGirl490,ko0skv,3.0,1,Zoom Wedding Tomorrow! So here is the story. L...,Loneliness


In [6]:
# Create a new column with clean text
df["clean_text_seq"] = df["text"].apply(prepareText) # Without lemmatizing, keep stopwords and punctuation
df["clean_text"] = df["text"].apply(prepareText2)

# Drop any rows with NaN in clean_text or clean_text_seq
df = df.dropna(subset=['clean_text'])
df = df.dropna(subset=['clean_text_seq'])

# Drop text column and re-arrange columns
df = df[["subreddit", "date", "author", "id", "num_comments", "score", 
         "clean_text_seq", "clean_text", "link_flair_text"]]

# Display
df

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,clean_text_seq,clean_text
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,"recently diagnosed, need to talk to others who...",recently diagnosed need talk others diagnosed ...
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,really annoyed at my familys drunk friends so ...,really annoyed family drunk friend family host...
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,the medication journey: a current disappointme...,medication journey current disappointment want...
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,"wearables, rem sleep detected while gaming not...",wearable rem sleep detected gaming sleeping hy...
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,picking a friend up to carpool to a nye party....,picking friend carpool nye party texted on way...
...,...,...,...,...,...,...,...,...,...,...
492162,sad,31/12/2020,RussianPower69,ko0jbp,3.0,1,Me sad Me has no one to talk to on New Year,Depression/Sadness,me sad me has no one to talk to on new year,sad one talk new year
492163,sad,31/12/2020,prettygirlolivia,ko0lec,8.0,1,Worst New Years Eve Ever My depression has hit...,,worst new years eve ever my depression has hit...,worst new year eve ever depression hit bad las...
492164,sad,31/12/2020,DirtyLizard0032,ko0rtl,2.0,1,Check out my sad song,Depression/Sadness,check out my sad song,check sad song
492165,sad,31/12/2020,Music-SunsetGirl490,ko0skv,3.0,1,Zoom Wedding Tomorrow! So here is the story. L...,Loneliness,zoom wedding tomorrow! so here is the story. l...,zoom wedding tomorrow story let call groom bob...


In [9]:
# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits.csv", index=False)

       subreddit        date                author      id  num_comments  \
0           ADHD  01/01/2019         DirtJunkie133  abd11x          13.0   
1           ADHD  01/01/2019  Lin_the_pillow_artis  abd7q9           5.0   
2           ADHD  01/01/2019         Fleetfeathers  abda0t          12.0   
3           ADHD  01/01/2019         UnleashedDebs  abdd13           4.0   
4           ADHD  01/01/2019              liluglee  abdj4w           1.0   
...          ...         ...                   ...     ...           ...   
492162       sad  31/12/2020        RussianPower69  ko0jbp           3.0   
492163       sad  31/12/2020      prettygirlolivia  ko0lec           8.0   
492164       sad  31/12/2020       DirtyLizard0032  ko0rtl           2.0   
492165       sad  31/12/2020   Music-SunsetGirl490  ko0skv           3.0   
492166       sad  31/12/2020            Monte_1997  ko0vrb           2.0   

        score                                     clean_text_seq  \
0           1  rece

In [15]:
# List unique link_flair_text
for flair in df["link_flair_text"].unique():
    print(flair)

print(len(df["link_flair_text"].unique()))

nan
Rant/Vent
Questions/Advice/Support
Announcement
Success/Celebration
Tips/Suggestions
Accountability
Articles/Information
Reminder
Weeklies
Moderator Approved
Obsession Sharing!
🎉
New To Meds
Medication
DAE Questions
Advice Needed
Discussion
Share Your Victories
Needs A Hug/Support
Venting
Work/School
Health
Progress!
Relationship
Family/Relationship
Sleep
Helpful Tips!
Therapy
Travel
I Made This!
Help A Loved One
Pix Post:
New User Introductions
Driving
Book Recommendations
Research Study
Announcements
Introductions &amp; QOTW
Subreddit Challenge
Stickied Post
Positivity
Holiday MegaThread!
INSPIRATION
REQUESTING ADVICE
PROVIDING ADVICE
REQUESTING SUPPORT
PROVIDING SUPPORT
OTHER
RANT
MOTIVATION
STORY
uni gives me the heevie jeebies
OTHER RANT 
is this normal
STORY/ADVICE/MOTIVATION
i dont know
Thanks!
no idea what you asking you annoying fudge #$%^^#
QUESTION
Encouragement
Discord Server
URGENT
VENT
People to talk to??
OTHER (TROLL LOL LOL SATIRE LOL OLO LOLLOLL SATIRE LOL)
I Need 

In [32]:
# Perform sentiment analysis
va = SentimentIntensityAnalyzer()

# Create new column with compound sentiment
df["compound_sent"] = df["clean_text_seq"].apply(lambda x: va.polarity_scores(x)['compound'])

# Display
print(df)

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits.csv", index=False)

       subreddit        date                author      id  num_comments  \
0           ADHD  01/01/2019         DirtJunkie133  abd11x          13.0   
1           ADHD  01/01/2019  Lin_the_pillow_artis  abd7q9           5.0   
2           ADHD  01/01/2019         Fleetfeathers  abda0t          12.0   
3           ADHD  01/01/2019         UnleashedDebs  abdd13           4.0   
4           ADHD  01/01/2019              liluglee  abdj4w           1.0   
...          ...         ...                   ...     ...           ...   
492162       sad  31/12/2020        RussianPower69  ko0jbp           3.0   
492163       sad  31/12/2020      prettygirlolivia  ko0lec           8.0   
492164       sad  31/12/2020       DirtyLizard0032  ko0rtl           2.0   
492165       sad  31/12/2020   Music-SunsetGirl490  ko0skv           3.0   
492166       sad  31/12/2020            Monte_1997  ko0vrb           2.0   

        score                                     clean_text_seq  \
0           1  rece

In [2]:
# Load data
df = pd.read_csv("./preproc_data/clean_text_all_subreddits.csv")

# Display
df.head()

Unnamed: 0,subreddit,date,author,id,num_comments,score,clean_text_seq,clean_text,link_flair_text,compound_sent
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"recently diagnosed, need to talk to others who...",recently diagnosed need talk others diagnosed ...,,0.9954
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,really annoyed at my familys drunk friends so ...,really annoyed family drunk friend family host...,,-0.9354
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,the medication journey: a current disappointme...,medication journey current disappointment want...,,0.9233
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"wearables, rem sleep detected while gaming not...",wearable rem sleep detected gaming sleeping hy...,,0.7738
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,picking a friend up to carpool to a nye party....,picking friend carpool nye party texted on way...,,0.8591


In [3]:
# Perform TF-IDF Count Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["clean_text"])

print(vectorizer.get_feature_names_out())

['aa' 'aaa' 'aaaa' ... 'zzzzzzzzt' 'zzzzzzzzzzzz'
 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz']


In [4]:
# Check shapes
print(X.shape)
print(df.shape)

# Convert X to a df
wc = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())

# Create a row of sums
wc.loc['Total'] = wc.sum()

# Check
display(wc.tail())

# Sort wc by values in Total row
wc.sort_values(by="Total", axis=1, ascending=False, inplace=True)

# Drop Total row and select first 2000 columns
wc = wc.iloc[:-1, :2000]

# Check
display(wc.head())

(492126, 150199)
(492126, 10)


Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaa,aaaaaaaa,aaaaaaaaa,aaaaaaaaaa,aaaaaaaaaaa,...,zzzquils,zzzs,zzzt,zzzz,zzzzhdjfjdjgjdhsfhd,zzzzz,zzzzzzz,zzzzzzzzt,zzzzzzzzzzzz,zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz
492122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492124,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Total,42.28926,9.694867,5.985684,6.105736,4.962394,3.230329,2.663569,4.9767,1.347266,1.49675,...,0.476926,0.733375,0.05421,0.724784,0.309076,0.484447,0.317893,0.05421,0.612103,0.164345


Unnamed: 0,im,like,feel,it,anxiety,get,know,time,want,really,...,delete,evaluation,psychological,stigma,review,photo,irritated,distant,sleepy,burning
0,0.184747,0.070373,0.0,0.078927,0.0,0.039981,0.020194,0.041989,0.046543,0.045421,...,0.0,0.0,0.0,0.0,0.070511,0.0,0.0,0.0,0.0,0.0
1,0.075903,0.028913,0.0,0.064854,0.0,0.0,0.066374,0.0,0.0,0.037322,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.050459,0.081835,0.0,0.0,0.086002,0.028959,0.0,0.033372,0.032567,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.095247,0.0,0.0,0.0,0.0,0.0,0.0,0.028864,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Concatenate wc with df
df = pd.concat([df, wc], ignore_index=True, axis=1)

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits_BoW.csv", index=False)