In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
import pandas as pd
import numpy as np
import re
import html
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Read the mental-health tweet dataset into a DataFrame.
# The CSV path is relative to the notebook's working directory.
DATA_PATH = "Mental-Health-Twitter.csv"
data = pd.read_csv(DATA_PATH)

In [None]:
# Raw tweet text (X) and its label column (y), pulled out of the loaded frame.
X, y = data["post_text"], data["label"]

In [None]:
# NLTK supplies the English stop-word list used by preprocess_text().
import nltk
# Download the 'stopwords' corpus into the local nltk_data directory so the
# corpus reader imported below has data to load.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Matches one HTML-style tag: '<', one or more non-'>' characters, then '>'.
TAG_RE = re.compile(r'<[^>]+>')


def remove_tags(text):
    '''Return `text` with every <...> tag deleted.

    Tags are dropped outright (nothing is inserted in their place), so the
    text on either side of a tag ends up directly adjacent.
    '''
    # Splitting on the tag pattern yields the non-tag pieces; gluing them
    # back together is the same as substituting each tag with ''.
    pieces = TAG_RE.split(text)
    return ''.join(pieces)

In [None]:
# Corpus-specific tokens this notebook drops from every tweet before modelling.
_NOISE_WORDS = (
    'user', 'yong', 'aleph', 'paytforluckysun', 'joe',
    'wearepayting', 'foryong', 'sos', 'mnwild', 'bbmas',
)
# One alternation instead of ten separate substitutions; each word is only
# matched as a whole word, so removing one can never create another.
_NOISE_WORD_RE = re.compile(r'\b(?:' + '|'.join(_NOISE_WORDS) + r')\b')

# Stop-word pattern, built on first use and then reused for every row.
_STOPWORD_RE = None


def _get_stopword_re():
    '''Return the compiled stop-word regex, building it once from NLTK's English list.'''
    global _STOPWORD_RE
    if _STOPWORD_RE is None:
        words = stopwords.words('english')
        _STOPWORD_RE = re.compile(
            r'\b(' + r'|'.join(re.escape(w) for w in words) + r')\b\s*'
        )
    return _STOPWORD_RE


def preprocess_text(sen):
    '''Clean one tweet, leaving only lowercase a-z words of 2+ characters that are not stopwords.

    Steps, in order: lowercase; drop a leading "rt"; drop @mentions; drop the
    corpus-specific noise words; strip HTML tags; drop http... links; decode
    HTML entities; replace every non-letter with a space; drop single-letter
    words; collapse whitespace; drop English stopwords; trim the ends.

    Parameters
    ----------
    sen : str
        Raw tweet text. ``None`` and float NaN (missing cells) are treated
        as empty text.

    Returns
    -------
    str
        Space-separated cleaned words, with no leading or trailing space.
    '''
    # Missing values would otherwise crash on .lower(); treat them as empty.
    if sen is None or (isinstance(sen, float) and sen != sen):
        return ''

    sentence = sen.lower()

    # Retweet marker (already lowercase at this point).
    sentence = re.sub(r'^rt\s+', '', sentence)

    # @username mentions.
    sentence = re.sub(r'@\w+', '', sentence)

    # Corpus-specific noise words.
    sentence = _NOISE_WORD_RE.sub('', sentence)

    # HTML tags.
    sentence = remove_tags(sentence)

    # URLs/links.
    sentence = re.sub(r"http\S+", "", sentence)

    # HTML entities (e.g. "&amp;" -> "&") so they don't survive as words like "amp".
    sentence = html.unescape(sentence)

    # Punctuation, digits and anything else that is not an ASCII letter.
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single-letter words, e.g. the "s" left over from "mark's". Only letters
    # and spaces remain here, so \b catches them at the start or end of the
    # text and in consecutive runs ("a b c") as well.
    sentence = re.sub(r'\b[a-zA-Z]\b', ' ', sentence)

    # Collapse the whitespace runs created by the removals above.
    sentence = re.sub(r'\s+', ' ', sentence)

    # English stopwords, together with the whitespace that follows them.
    sentence = _get_stopword_re().sub('', sentence)

    return sentence.strip()

In [None]:
# Run the cleaning function over every raw tweet and keep the result as a new column.
data['preprocessed_text'] = data['post_text'].map(preprocess_text)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

def fit_topic_model(texts, n_topics=5, random_state=0):
    """Fit a bag-of-words LDA topic model on a collection of documents.

    Parameters
    ----------
    texts : iterable of str
        Documents to vectorize (here: the preprocessed tweets).
    n_topics : int
        Number of LDA components.
    random_state : int
        Seed passed to LatentDirichletAllocation.

    Returns
    -------
    (vectorizer, doc_term, lda)
        The fitted CountVectorizer, the document-term count matrix it
        produced, and the fitted LatentDirichletAllocation model.
    """
    vectorizer = CountVectorizer()
    doc_term = vectorizer.fit_transform(texts)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=random_state)
    lda.fit(doc_term)
    return vectorizer, doc_term, lda


def print_top_words(lda, feature_names, n_top_words=10):
    """Print, for each topic of a fitted LDA model, its highest-weighted words.

    Topics are printed in index order as "Topic <i>:" followed by a list of
    up to `n_top_words` words, highest weight first.
    """
    for topic_idx, topic in enumerate(lda.components_):
        # argsort is ascending; reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:n_top_words]
        print("Topic %d:" % (topic_idx))
        print([feature_names[i] for i in top_indices])


# Topics for tweets with label 0
df_label0 = data[data['label'] == 0]
vectorizer0, X0, lda0 = fit_topic_model(df_label0['preprocessed_text'])
feature_names0 = vectorizer0.get_feature_names_out()
print_top_words(lda0, feature_names0)


# Topics for tweets with label 1
df_label1 = data[data['label'] == 1]
vectorizer1, X1, lda1 = fit_topic_model(df_label1['preprocessed_text'])
feature_names1 = vectorizer1.get_feature_names_out()
print_top_words(lda1, feature_names1)



Topic 0:
['like', 'get', 'time', 'trump', 'look', 'one', 'know', 'christmas', 'really', 'think']
Topic 1:
['hey', 'thanks', 'oh', 'best', 'follow', 'zayn', 'bestmusicvideo', 'pillowtalk', 'iheartawards', 'never']
Topic 2:
['trump', 'like', 'putin', 'one', 'going', 'us', 'hate', 'via', 'america', 'russia']
Topic 3:
['thank', 'say', 'twitter', 'love', 'following', 'hello', 'anytime', 'fuck', 'know', 'real']
Topic 4:
['new', 'people', 'year', 'see', 'need', 'game', 'one', 'still', 'go', 'lightsaber']
Topic 0:
['depression', 'treatments', 'overcome', 'anxiety', 'health', 'treatment', 'therapy', 'new', 'like', 'help']
Topic 1:
['love', 'good', 'talk', 'like', 'day', 'today', 'get', 'one', 'want', 'see']
Topic 2:
['people', 'positive', 'get', 'via', 'thinking', 'life', 'headache', 'like', 'migraine', 'negative']
Topic 3:
['go', 'one', 'autism', 'please', 'us', 'know', 'love', 'new', 'make', 'need']
Topic 4:
['like', 'know', 'get', 'shit', 'right', 'im', 'still', 'people', 'feel', 'pain']


In [None]:
# Bag-of-words counts over every preprocessed tweet, regardless of label.
vectorizer_all = CountVectorizer()
X_all = vectorizer_all.fit_transform(data['preprocessed_text'])

# 10-topic LDA on the whole corpus; fit() returns the fitted estimator itself.
lda_all = LatentDirichletAllocation(n_components=10, random_state=0).fit(X_all)

# Show the ten highest-weighted words of each topic, strongest first.
feature_names_all = vectorizer_all.get_feature_names_out()
for topic_idx, topic in enumerate(lda_all.components_):
    strongest = topic.argsort()[::-1][:10]
    print("Topic %d:" % (topic_idx))
    print([feature_names_all[word_id] for word_id in strongest])


Topic 0:
['new', 'think', 'would', 'year', 'better', 'like', 'never', 'one', 'people', 'even']
Topic 1:
['please', 'go', 'trump', 'people', 'nanny', 'anxiety', 'asks', 'migraine', 'team', 'via']
Topic 2:
['depression', 'treatments', 'fuck', 'overcome', 'health', 'mental', 'like', 'treatment', 'may', 'one']
Topic 3:
['trump', 'putin', 'man', 'via', 'russia', 'oh', 'like', 'life', 'let', 'get']
Topic 4:
['thanks', 'hey', 'follow', 'see', 'happy', 'could', 'wait', 'want', 'yes', 'wish']
Topic 5:
['like', 'make', 'watch', 'live', 'family', 'miss', 'one', 'even', 'reason', 'first']
Topic 6:
['thank', 'say', 'twitter', 'following', 'hello', 'anytime', 'one', 'like', 'take', 'real']
Topic 7:
['people', 'im', 'talk', 'someone', 'like', 'really', 'get', 'one', 'hate', 'shit']
Topic 8:
['good', 'get', 'best', 'video', 'bestmusicvideo', 'pillowtalk', 'iheartawards', 'zayn', 'well', 'still']
Topic 9:
['know', 'love', 'like', 'much', 'one', 'god', 'need', 'friends', 'time', 'work']
