In [None]:
# Install dependencies
!pip install textblob

In [9]:
# Imports
import pandas as pd
import numpy as np
import os
from IPython.display import display

# Text preprocessing
import nltk
from nltk.corpus import stopwords
import re
import string
from textblob import TextBlob
from collections import Counter

# Fetch the NLTK resources this notebook relies on (stopword list, WordNet,
# tokenizer model, POS tagger model and the universal tagset mapping)
for nltk_resource in ('stopwords', 'wordnet', 'punkt',
                      'averaged_perceptron_tagger', 'universal_tagset'):
    nltk.download(nltk_resource)

# English stopword list
stopWords = stopwords.words("english")

# WordNet lemmatizer instance shared by the text-preparation helper
wn = nltk.WordNetLemmatizer()

from sklearn.feature_extraction.text import TfidfVectorizer

# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\denni\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\universal_tagset.zip.


In [2]:
def prepareText(text):
    """
    Desc: Spelling correction and lemmatization
    Input: text (string) - text to be corrected and lemmatized
    Output: text (string) - corrected and lemmatized text
    """

    # Spell correction: correct() returns a TextBlob, so convert it back to a plain string
    corrected = str(TextBlob(text).correct())

    # Lemmatize word by word (lemmatize() is called without a POS argument).
    # Splitting on a single space, rather than on any whitespace, keeps the
    # original spacing intact when the words are joined back together.
    return ' '.join(wn.lemmatize(word) for word in corrected.split(' '))

In [2]:
def posCount(text):
    """
    Desc: POS (Parts of Speech) tagging for Nouns, Pronouns, Verbs, Adjectives, and Adverbs
    Input: text (string)
    Output: tuple of 5 counts, in the order (noun, pronoun, verb, adjective, adverb)
    """

    # Tokenize the words in the text
    tokens = nltk.word_tokenize(text)

    # Assign a universal-tagset POS tag to each token
    tagged = nltk.pos_tag(tokens, tagset='universal')

    # Count how often each tag occurs
    counts = Counter(tag for _, tag in tagged)

    # Pick out the tags of interest; a Counter yields 0 for tags that never occurred
    return tuple(counts[tag] for tag in ("NOUN", "PRON", "VERB", "ADJ", "ADV"))

In [12]:
# Read the pre-cleaned posts of all subreddits into a DataFrame
df = pd.read_csv(
    "./preproc_data/clean_text_all_subreddits.csv"
)

# Preview the loaded frame
df

Unnamed: 0,subreddit,date,author,id,num_comments,score,text,link_flair_text,post_length,clean_text
0,ADHD,01/01/2019,DirtJunkie133,abd11x,13.0,1,"Recently diagnosed, need to talk to others who...",,3021,recently diagnosed need talk others diagnosed ...
1,ADHD,01/01/2019,Lin_the_pillow_artis,abd7q9,5.0,1,Really annoyed at my familys drunk friends So ...,,751,really annoyed familys drunk friends family ho...
2,ADHD,01/01/2019,Fleetfeathers,abda0t,12.0,1,The medication journey: a current disappointme...,,1713,medication journey current disappointment wan...
3,ADHD,01/01/2019,UnleashedDebs,abdd13,4.0,1,"Wearables, REM sleep detected while gaming not...",,988,wearables rem sleep detected gaming sleeping h...
4,ADHD,01/01/2019,liluglee,abdj4w,1.0,1,Picking a friend up to carpool to a NYE party....,,123,picking friend carpool nye party texted way dr...
...,...,...,...,...,...,...,...,...,...,...
652198,sad,31/12/2020,RussianPower69,ko0jbp,3.0,1,Me sad Me has no one to talk to on New Year,Depression/Sadness,43,sad one talk new year
652199,sad,31/12/2020,prettygirlolivia,ko0lec,8.0,1,Worst New Years Eve Ever My depression has hit...,,318,worst new years eve ever depression hit bad la...
652200,sad,31/12/2020,DirtyLizard0032,ko0rtl,2.0,1,Check out my sad song,Depression/Sadness,22,check sad song
652201,sad,31/12/2020,Music-SunsetGirl490,ko0skv,3.0,1,Zoom Wedding Tomorrow! So here is the story. L...,Loneliness,486,zoom wedding tomorrow story lets call groom bo...


In [None]:
# Drop any rows with NaN in clean_text BEFORE processing the text: a missing
# value is a float NaN, not a string, so it must not reach prepareText
df = df.dropna(subset=['clean_text'])

# Reset index
df = df.reset_index(drop=True)

# Spell-correct and lemmatize the clean text (overwrites the clean_text column)
df["clean_text"] = df["clean_text"].apply(prepareText)

# Remove duplicate words, keeping the first occurrence of each word in its original order
df["unique_clean_text"] = df["clean_text"].apply(lambda x: " ".join(dict.fromkeys(x.split())))

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits_BoW.csv", index=False)

In [13]:
%%time

# Get 5 most common POS counts
test = df[:100].apply(lambda x: posCount(x['clean_text']), axis=1, result_type = "expand")

test

Wall time: 610 ms


Unnamed: 0,0,1,2,3,4
0,116,0,75,45,26
1,31,0,16,10,7
2,67,0,38,30,10
3,32,0,25,19,9
4,8,0,3,1,0
...,...,...,...,...,...
95,45,0,24,23,8
96,30,0,26,16,7
97,74,0,44,42,13
98,12,0,6,15,2


In [None]:
# Count nouns, pronouns, verbs, adjectives and adverbs in each post's de-duplicated text
df[["noun", "pronoun", "verb", "adjective", "adverb"]] = df.apply(lambda x: posCount(x['unique_clean_text']), axis=1, result_type = "expand")

# Get word count of the de-duplicated clean text
df["word_count"] = df["unique_clean_text"].str.split().str.len()

# Normalize POS counts by the word count.
# The source columns are the ones created above ("adjective" / "adverb");
# the short names are only used for the normalized output columns.
df["norm_noun"] = df["noun"] / df["word_count"]
df["norm_pronoun"] = df["pronoun"] / df["word_count"]
df["norm_verb"] = df["verb"] / df["word_count"]
df["norm_adj"] = df["adjective"] / df["word_count"]
df["norm_adv"] = df["adverb"] / df["word_count"]

# Drop text column and raw POS counts, and re-arrange columns
df = df[["subreddit", "date", "author", "id", "num_comments", "score", 
         "clean_text", "unique_clean_text", "post_length", "word_count", 
         "norm_noun", "norm_pronoun", "norm_verb", "norm_adj", "norm_adv", 
         "link_flair_text"]]

# Display
display(df)

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits_BoW.csv", index=False)

In [None]:
# Collect the distinct link_flair_text values once
flairs = df["link_flair_text"].unique()

# Print every distinct value on its own line
for label in flairs:
    print(label)

# Print how many distinct values there are
print(len(flairs))

In [None]:
# Perform sentiment analysis
va = SentimentIntensityAnalyzer()

# Create new column with the compound sentiment score of each post's clean text
df["compound_sent"] = df["clean_text"].apply(lambda x: va.polarity_scores(x)['compound'])

# Display (rich table rendering, as in the other cells, instead of a plain-text print)
display(df)

# Save to csv
df.to_csv("./preproc_data/clean_text_all_subreddits_BoW.csv", index=False)

In [None]:
# Fit a TF-IDF vectorizer on the clean text and build the document-term matrix
vectorizer = TfidfVectorizer()
corpus = df["clean_text"]
X = vectorizer.fit_transform(corpus)

# Show the learned vocabulary terms
vocabulary = vectorizer.get_feature_names_out()
print(vocabulary)

In [None]:
# Check shapes
print(X.shape)
print(df.shape)

# Vocabulary terms, in the same order as the columns of X
terms = vectorizer.get_feature_names_out()

# Total TF-IDF weight of every term, summed over all documents.
# Summing the sparse matrix directly avoids building a DataFrame with one
# column per vocabulary term just to append a 'Total' row and sort on it.
totals = np.asarray(X.sum(axis=0)).ravel()

# Column positions of the 2000 terms with the largest totals, largest first
top_cols = np.argsort(-totals, kind="stable")[:2000]

# Check
display(pd.Series(totals[top_cols], index=terms[top_cols], name="Total").head())

# Build the frame from the selected columns only (one row per document)
wc = pd.DataFrame.sparse.from_spmatrix(X[:, top_cols], columns=terms[top_cols])

# Check
display(wc.head())

In [None]:
# Append the term columns in wc to the right of the columns in df (rows are matched on index)
# NOTE(review): a term identical to an existing column name (e.g. "date" or "score")
# would presumably produce duplicate column labels — confirm against the vocabulary
df = pd.concat((df, wc), axis="columns")

# Write the combined table to csv
df.to_csv("./preproc_data/clean_text_all_subreddits_BoW.csv", index=False)