# Bag of Words
Created by Owen Fava

In [None]:
import csv
import json
import matplotlib.pyplot as plt
import nltk
import os
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Fetch the NLTK resources used by this notebook.
# NOTE(review): NLTK 3.9+ loads the word tokenizer from "punkt_tab" while older
# releases use "punkt"; both are requested so word_tokenize works on either.
# Confirm against the installed NLTK version.
for resource in (
    "averaged_perceptron_tagger",
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
):
    nltk.download(resource)

In [None]:
def get_mental_disorders(dataset_path: str, column: str):
    """Load the dataset and list the distinct labels found in ``column``.

    Args:
        dataset_path: Location of the CSV file, passed to ``pandas.read_csv``.
        column: Name of the column holding the label (e.g. the subreddit).

    Returns:
        A ``(dataset, labels)`` tuple: the full DataFrame, and the distinct
        lower-cased labels in order of first appearance.
    """
    dataset = pd.read_csv(dataset_path)

    # Missing labels are read as NaN (a float), which has no ``.lower()``;
    # drop them before normalising the case.
    labels = dataset[column].dropna().astype(str).str.lower()

    # Lower-casing can make two spellings collide ("Anxiety" / "anxiety"),
    # so de-duplicate afterwards while keeping first-seen order.
    lower_cased_mental_disorders = list(dict.fromkeys(labels))

    return dataset, lower_cased_mental_disorders

# Load the Reddit dataset once; "subreddit" is the column whose distinct
# (lower-cased) values are collected into `mental_disorders`.
dataset, mental_disorders = get_mental_disorders("data/mental_disorders_reddit.csv", "subreddit")
print("Records retrieved: ", len(dataset))
# Show which subreddit labels are present in the file.
print(mental_disorders)

In [None]:
def get_mental_disorder_data(mental_disorder: str, dataset: pd.DataFrame, column: str = "subreddit"):
    """Return the rows of ``dataset`` whose label equals ``mental_disorder``.

    The comparison ignores case on both sides, so ``"Anxiety"`` and
    ``"anxiety"`` select the same rows.

    Args:
        mental_disorder: Label to select (any letter case).
        dataset: Frame containing the label column.
        column: Name of the label column; defaults to ``"subreddit"``.

    Returns:
        The matching rows of ``dataset`` (possibly empty).
    """
    # Normalise the requested label too; otherwise a mixed-case argument
    # would silently match nothing against the lower-cased column.
    wanted = mental_disorder.lower()
    data = dataset.loc[dataset[column].str.lower() == wanted]

    return data

# Slice the full dataset into one frame per subreddit of interest.
anxiety_data = get_mental_disorder_data("anxiety", dataset)
bpd_data = get_mental_disorder_data("bpd", dataset)
depression_data = get_mental_disorder_data("depression", dataset)
mental_illness_data = get_mental_disorder_data("mentalillness", dataset)
schizophrenia_data = get_mental_disorder_data("schizophrenia", dataset)

# Report how many posts each slice holds.
for message, subset in (
    ("Anxiety data retrieved: ", anxiety_data),
    ("BPD data retrieved: ", bpd_data),
    ("Depression data retrieved: ", depression_data),
    ("Mental illness data retrieved: ", mental_illness_data),
    ("Schizophrenia data retrieved: ", schizophrenia_data),
):
    print(message, len(subset))

In [None]:
def combine_data_columns_and_clean(data: pd.DataFrame, columns: list[str], new_column_name: str):
    """Stack several columns of ``data`` into one column and drop missing values.

    The columns are concatenated end to end (every value of the first column,
    then every value of the second, and so on); they are not joined row by row.

    Args:
        data: Frame holding the columns to combine.
        columns: Names of the columns to stack, in order.
        new_column_name: Name of the single column in the result.

    Returns:
        A one-column DataFrame without missing values, indexed 0..n-1.
    """
    combined_data = pd.concat([data[column] for column in columns], ignore_index=True)
    combined_cleaned_data = pd.DataFrame({new_column_name: combined_data}).dropna()

    # dropna() leaves gaps in the index; renumber so positions and labels agree.
    return combined_cleaned_data.reset_index(drop=True)

# Collapse the "title" and "selftext" columns of every slice into a single
# text column named after the subreddit.
anxiety_data = combine_data_columns_and_clean(
    anxiety_data, ["title", "selftext"], "anxiety_combined_title_selftext"
)
bpd_data = combine_data_columns_and_clean(
    bpd_data, ["title", "selftext"], "bpd_combined_title_selftext"
)
depression_data = combine_data_columns_and_clean(
    depression_data, ["title", "selftext"], "depression_combined_title_selftext"
)
mental_illness_data = combine_data_columns_and_clean(
    mental_illness_data, ["title", "selftext"], "mental_illness_combined_title_selftext"
)
schizophrenia_data = combine_data_columns_and_clean(
    schizophrenia_data, ["title", "selftext"], "schizophrenia_combined_title_selftext"
)

# Report the number of text entries kept for each subreddit.
for message, combined in (
    ("Anxiety data retrieved: ", anxiety_data),
    ("BPD data retrieved: ", bpd_data),
    ("Depression data retrieved: ", depression_data),
    ("Mental illness data retrieved: ", mental_illness_data),
    ("Schizophrenia data retrieved: ", schizophrenia_data),
):
    print(message, len(combined))

In [5]:
def tokenize_text(data):
    """Tokenize every entry of ``data`` and return all tokens as one flat list.

    Each entry is split with NLTK's ``word_tokenize``; the tokens of
    consecutive entries are appended in order.
    """
    tokens = []
    for sentence in data:
        tokens.extend(word_tokenize(sentence))

    return tokens

# Replace each per-subreddit frame with a flat list of word tokens built from
# its combined text column. Each name is rebound in turn, so the previous
# object can be released before the next corpus is processed.
anxiety_data = tokenize_text(anxiety_data["anxiety_combined_title_selftext"])
bpd_data = tokenize_text(bpd_data["bpd_combined_title_selftext"])
depression_data = tokenize_text(depression_data["depression_combined_title_selftext"])
mental_illness_data = tokenize_text(mental_illness_data["mental_illness_combined_title_selftext"])
schizophrenia_data = tokenize_text(schizophrenia_data["schizophrenia_combined_title_selftext"])

In [6]:
def case_folding(data, toLower: bool):
    """Return the entries of ``data`` converted to a single letter case.

    Args:
        data: Iterable of strings.
        toLower: When true the entries are lower-cased, otherwise upper-cased.

    Returns:
        A new list with the converted strings, in the original order.
    """
    if toLower:
        return [text.lower() for text in data]

    return [text.upper() for text in data]

# Lower-case every token list (second argument True selects lower case).
# Each name is rebound in turn so the previous list can be released before
# the next corpus is processed.
anxiety_data = case_folding(anxiety_data, True)
bpd_data = case_folding(bpd_data, True)
depression_data = case_folding(depression_data, True)
mental_illness_data = case_folding(mental_illness_data, True)
schizophrenia_data = case_folding(schizophrenia_data, True)

In [7]:
def discard_non_alphabetical_words(data):
    """Keep only the entries of ``data`` made up entirely of letters.

    An entry is kept when its ``isalpha()`` is true, so numbers, punctuation
    and mixed tokens such as ``"a1"`` are dropped. Order is preserved.
    """
    alphabetical = []
    for token in data:
        if token.isalpha():
            alphabetical.append(token)

    return alphabetical

# Drop every token that is not purely alphabetical (numbers, punctuation,
# mixed tokens). Each name is rebound in turn so the previous list can be
# released before the next corpus is processed.
anxiety_data = discard_non_alphabetical_words(anxiety_data)
bpd_data = discard_non_alphabetical_words(bpd_data)
depression_data = discard_non_alphabetical_words(depression_data)
mental_illness_data = discard_non_alphabetical_words(mental_illness_data)
schizophrenia_data = discard_non_alphabetical_words(schizophrenia_data)

In [None]:
# Combined size of the five corpora. When the cells above have been run in
# order, each *_data name holds a token list, so this is a token count.
total_count_of_rows = sum(
    len(tokens)
    for tokens in (
        anxiety_data,
        bpd_data,
        depression_data,
        mental_illness_data,
        schizophrenia_data,
    )
)
print("Total rows: ", total_count_of_rows)

In [None]:
def remove_stop_words(data):
    """Return the entries of ``data`` that are not English stop words.

    Args:
        data: Iterable of tokens (expected to be lower-cased already, since
            the comparison against the stop-word list is exact).

    Returns:
        A new list with the stop words removed, order preserved.
    """
    # Build the stop-word collection once. Calling stopwords.words() inside
    # the filter would rebuild the list and scan it linearly for every token.
    english_stop_words = frozenset(stopwords.words("english"))

    return [text for text in data if text not in english_stop_words]

# def lemmatize_text(data):
#     lemmatizer = WordNetLemmatizer()
#     lemmas = [lemmatizer.lemmatize(text) for text in data]

#     return lemmas

def clean_data_and_export_to_csv(data, csv_file_name: str, column_name: str):
    """Remove stop words from ``data`` and write the rest to a one-column CSV.

    Args:
        data: Iterable of tokens to clean.
        csv_file_name: Output path; ".csv" is appended when it has no
            extension.
        column_name: Header written on the first row of the file.
    """
    removed_stop_words_data = remove_stop_words(data)

    if not os.path.splitext(csv_file_name)[1]:
        csv_file_name = csv_file_name + ".csv"

    # Write UTF-8 explicitly: the default encoding depends on the platform,
    # while pandas.read_csv (used to read the file back) assumes UTF-8.
    with open(csv_file_name, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow([column_name])
        # One token per row.
        writer.writerows([item] for item in removed_stop_words_data)
    
# Clean and export each corpus in turn, printing a progress message as soon
# as the corresponding file has been written.
for tokens, csv_name, column_label, done_message in (
    (anxiety_data, "cleaned_anxiety_data", "anxiety-data", "Anxiety data done"),
    (bpd_data, "cleaned_bpd_data", "bpd-data", "BPD data done"),
    (depression_data, "cleaned_depression_data", "depression-data", "Depression data done"),
    (mental_illness_data, "cleaned_mental_illness_data", "mental-illness-data", "Mental Illness data done"),
    (schizophrenia_data, "cleaned_schizophrenia_data", "schizophrenia-data", "Schizophrenia data done"),
):
    clean_data_and_export_to_csv(tokens, csv_name, column_label)
    print(done_message)

In [None]:
# GET DATA FROM CLEANED CSVS
# Read data from CSV file
def read_from_csv_to_list(csv_file_name: str, column_name: str):
    """Read one column of a CSV file and return its values as a list of strings.

    Args:
        csv_file_name: Path of the file; ".csv" is appended when it has no
            extension.
        column_name: Header of the column to return.

    Returns:
        The column values, in file order, each as a ``str``.
    """
    if not os.path.splitext(csv_file_name)[1]:
        csv_file_name = csv_file_name + ".csv"

    # Read everything as text and disable pandas' default NA markers;
    # otherwise ordinary tokens such as "null" or "nan" come back as float NaN.
    data = pd.read_csv(csv_file_name, dtype=str, keep_default_na=False)

    return data[column_name].tolist()

# Read the cleaned token lists back from the CSV files written earlier.
anxiety_cleaned_data = read_from_csv_to_list("cleaned_anxiety_data", "anxiety-data")
print(f"Retrieved {len(anxiety_cleaned_data)} rows of data for anxiety")

bpd_cleaned_data = read_from_csv_to_list("cleaned_bpd_data", "bpd-data")
print(f"Retrieved {len(bpd_cleaned_data)} rows of data for BPD")

depression_cleaned_data = read_from_csv_to_list("cleaned_depression_data", "depression-data")
print(f"Retrieved {len(depression_cleaned_data)} rows of data for depression")

mental_illness_cleaned_data = read_from_csv_to_list("cleaned_mental_illness_data", "mental-illness-data")
print(f"Retrieved {len(mental_illness_cleaned_data)} rows of data for mental illness")

schizophrenia_cleaned_data = read_from_csv_to_list("cleaned_schizophrenia_data", "schizophrenia-data")
print(f"Retrieved {len(schizophrenia_cleaned_data)} rows of data for schizophrenia")

def lemmatize_text_and_write_frequency_to_csv(data, csv_file_name: str):
    """Lemmatize ``data`` and save a word-frequency table as a CSV file.

    Args:
        data: Iterable of tokens. Each entry is converted with ``str`` before
            lemmatization, so non-string entries are lemmatized as their
            string form.
        csv_file_name: Output path; ".csv" is appended when it has no
            extension.

    Returns:
        The list of lemmas, in the same order as ``data``.
    """
    if not os.path.splitext(csv_file_name)[1]:
        csv_file_name = csv_file_name + ".csv"

    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(str(word)) for word in data]

    # Count the frequency of each lemma.
    word_counts = Counter(lemmatized_text)

    # Write UTF-8 explicitly: the default encoding depends on the platform,
    # while pandas.read_csv (used to read the table back) assumes UTF-8.
    with open(csv_file_name, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Word", "Frequency"])
        # most_common() yields (word, count) pairs by descending count.
        writer.writerows(word_counts.most_common())

    return lemmatized_text

# Lemmatize each cleaned corpus and write its word-frequency table to CSV.
anxiety_lemmatized_data = lemmatize_text_and_write_frequency_to_csv(anxiety_cleaned_data, "anxiety_bag_of_words")
print("Anxiety data lemmatization done")
print(f"Anxiety lemmas: {len(anxiety_lemmatized_data)}")

bpd_lemmatized_data = lemmatize_text_and_write_frequency_to_csv(bpd_cleaned_data, "bpd_bag_of_words")
print("BPD data lemmatization done")
print(f"BPD lemmas: {len(bpd_lemmatized_data)}")

depression_lemmatized_data = lemmatize_text_and_write_frequency_to_csv(depression_cleaned_data, "depression_bag_of_words")
print("Depression data lemmatization done")
print(f"Depression lemmas: {len(depression_lemmatized_data)}")

mental_illness_lemmatized_data = lemmatize_text_and_write_frequency_to_csv(mental_illness_cleaned_data, "mental_illness_bag_of_words")
print("Mental Illness data lemmatization done")
print(f"Mental illness lemmas: {len(mental_illness_lemmatized_data)}")

schizophrenia_lemmatized_data = lemmatize_text_and_write_frequency_to_csv(schizophrenia_cleaned_data, "schizophrenia_bag_of_words")
print("Schizophrenia data lemmatization done")
print(f"Schizophrenia lemmas: {len(schizophrenia_lemmatized_data)}")

In [None]:
def generate_bag_of_words_visual(csv_file_name: str):
    """Render a word cloud from a word-frequency CSV file.

    Args:
        csv_file_name: Path of a CSV with "Word" and "Frequency" columns;
            ".csv" is appended when it has no extension.
    """
    if not os.path.splitext(csv_file_name)[1]:
        csv_file_name = csv_file_name + ".csv"

    # Keep the words as text and disable pandas' default NA markers;
    # otherwise words such as "nan" or "null" are read as float NaN and
    # end up as non-string keys in the frequency dictionary.
    data = pd.read_csv(csv_file_name, dtype={"Word": str}, keep_default_na=False)

    # Map each word to its frequency.
    word_counts = dict(zip(data["Word"], data["Frequency"]))

    # Generate the word cloud from the frequency table.
    wordcloud = WordCloud(width=800, height=800, background_color='white').generate_from_frequencies(word_counts)

    # Show the image alone: no axes, no padding around it.
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

# Draw one word cloud per frequency table, in the same order as above.
for bag_of_words_csv in (
    "anxiety_bag_of_words",
    "bpd_bag_of_words",
    "depression_bag_of_words",
    "mental_illness_bag_of_words",
    "schizophrenia_bag_of_words",
):
    generate_bag_of_words_visual(bag_of_words_csv)