# Bag of Words
Created by Owen Fava

In [None]:
import json
import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

# Fetch the NLTK resources the cells below rely on: tokenizer models, the
# POS tagger, the stop-word list and WordNet.
# NLTK 3.9+ looks up "punkt_tab" and "averaged_perceptron_tagger_eng" instead
# of the older pickled "punkt" / "averaged_perceptron_tagger" packages, so
# both generations are requested to keep the notebook working across versions.
for resource in (
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    "punkt",
    "punkt_tab",
    "stopwords",
    "wordnet",
):
    nltk.download(resource)

In [None]:
def get_and_clean_dataset(mental_health_disorder: str, dataset_path: str, features: list[str]):
    """Load a CSV file and keep only the requested columns, without gaps.

    Args:
        mental_health_disorder: Label of the dataset. It is accepted for the
            caller's benefit but not used by the function body.
        dataset_path: Location of the CSV file, passed to ``pd.read_csv``.
        features: Names of the columns to keep, in the order they should
            appear in the result.

    Returns:
        A DataFrame holding only the ``features`` columns, with every row
        that has a missing value in any of them removed.

    Raises:
        KeyError: If a requested column is not present in the CSV file.
    """
    raw = pd.read_csv(dataset_path)
    selected_columns = {name: raw[name] for name in features}
    return pd.DataFrame(selected_columns).dropna()

# Load the depression sample, keeping only the "title" and "selftext" columns.
cleaned_dataset = get_and_clean_dataset("depression", "data/depression-sample.csv", ["title", "selftext"])
# Preview the first 25 cleaned rows (shown as the cell output in a notebook).
cleaned_dataset.head(25)

In [None]:
def tokenize_text(data):
    """Split every text in *data* into tokens and return one flat list.

    Args:
        data: Iterable of strings; each item is passed to ``word_tokenize``.

    Returns:
        A single list with the tokens of all items, in input order.
    """
    tokens = []
    for sentence in data:
        tokens.extend(word_tokenize(sentence))
    return tokens

# Tokenize both text columns and merge them into one flat token list.
title_data = tokenize_text(cleaned_dataset["title"])
selftext_data = tokenize_text(cleaned_dataset["selftext"])
dataset = title_data + selftext_data
print(dataset)

In [None]:
def case_folding(data, toLower: bool):
    """Convert every string in *data* to a single letter case.

    Args:
        data: Iterable of strings.
        toLower: When true the strings are lower-cased, otherwise they are
            upper-cased.

    Returns:
        A new list with the case-converted strings, in input order.
    """
    return [text.lower() if toLower else text.upper() for text in data]

# Normalise all tokens to lower case so later comparisons ignore letter case.
dataset = case_folding(dataset, toLower=True)
print(dataset)

In [None]:
def discard_non_alphabetical_words(data):
    """Keep only the tokens that consist entirely of alphabetic characters.

    Args:
        data: Iterable of strings.

    Returns:
        A new list with the items for which ``str.isalpha()`` is true, in
        input order. Empty strings, numbers and punctuation are dropped.
    """
    alphabetical = []
    for token in data:
        if token.isalpha():
            alphabetical.append(token)
    return alphabetical

# Token count before filtering, for comparison with the count printed below.
print(len(dataset))
dataset = discard_non_alphabetical_words(dataset)
print(dataset)
# Token count after filtering.
print(len(dataset))

In [None]:
def remove_stop_words(data):
    """Return the tokens of *data* that are not English stop words.

    Args:
        data: Iterable of string tokens. They are compared as-is against the
            NLTK English stop-word list, so they should already be in the
            same letter case as that list.

    Returns:
        A new list with the non-stop-word tokens, in input order.
    """
    # Load the stop-word list once and keep it in a set: fetching it inside
    # the comprehension would reload it for every token and search it linearly.
    english_stop_words = set(stopwords.words("english"))
    return [text for text in data if text not in english_stop_words]

# Token count before stop-word removal, for comparison with the count below.
print(len(dataset))
dataset = remove_stop_words(dataset)
print(dataset)
# Token count after stop-word removal.
print(len(dataset))

In [None]:
def lemmatize_text(data):
    """Lemmatize every token in *data* with NLTK's WordNet lemmatizer.

    Args:
        data: Iterable of string tokens.

    Returns:
        A list with one lemma per input token, in input order. Each token is
        lemmatized with the lemmatizer's default part-of-speech setting.
    """
    wordnet_lemmatizer = WordNetLemmatizer()
    result = []
    for token in data:
        result.append(wordnet_lemmatizer.lemmatize(token))
    return result

def get_lemmas_stats(lemmas):
    """Count how often each lemma occurs.

    Args:
        lemmas: Iterable of hashable terms (strings in this notebook).

    Returns:
        A list of ``{"term": ..., "frequency": ...}`` dictionaries, one per
        distinct term, sorted by descending frequency. Terms with the same
        frequency keep the order in which they first appeared.
    """
    # Function-scope import so the cell works without editing the import cell.
    from collections import Counter

    # Counter tallies in a single pass; most_common() sorts by count with a
    # stable sort, so ties stay in first-seen order.
    return [
        {"term": term, "frequency": frequency}
        for term, frequency in Counter(lemmas).most_common()
    ]

lemmas = lemmatize_text(dataset)
# Build the frequency table once and reuse it for both printouts.
lemma_stats = get_lemmas_stats(lemmas)
print(len(lemma_stats))
print(json.dumps(lemma_stats, indent=2))

In [None]:
def generate_bag_of_words(data):
    """Build a bag-of-words representation of *data* with CountVectorizer.

    Args:
        data: Iterable of strings; each item is vectorized as one document.

    Returns:
        A tuple ``(matrix, feature_names, word_frequencies)`` where
        ``matrix`` is the dense document-term count array, ``feature_names``
        is the vocabulary returned by ``get_feature_names_out()``, and
        ``word_frequencies`` is a list of ``(term, total_count)`` pairs
        sorted by descending total count.
    """
    vectorizer = CountVectorizer()
    document_term_matrix = vectorizer.fit_transform(data)
    feature_names = vectorizer.get_feature_names_out()

    # Column sums of the matrix produced by fit_transform() are the
    # corpus-wide totals, so the data does not need a second transform() pass.
    total_counts = document_term_matrix.sum(axis=0).tolist()[0]
    word_frequencies = sorted(
        zip(feature_names, total_counts),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return document_term_matrix.toarray(), feature_names, word_frequencies

# NOTE(review): `dataset` is a flat list of tokens here, so each token is
# passed to the vectorizer as its own document — confirm this is intended.
bag_of_words_matrix, feature_names, word_frequencies = generate_bag_of_words(dataset)
print("Bag of Words Matrix: \n", bag_of_words_matrix)
print("\nFeature Names: \n", feature_names)
print("\nWord Frequencies: \n", word_frequencies)

In [None]:
def generate_bag_of_words_visual(data):
    """Display a word cloud built from the tokens in *data*.

    Args:
        data: Iterable of strings. The items are joined with single spaces
            and handed to ``WordCloud.generate``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    cloud_text = " ".join(data)

    cloud = WordCloud(width=1000, height=600, background_color="white")
    cloud.generate(cloud_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")  # hide the axes around the image
    plt.show()

# Draw a word cloud from the current token list.
generate_bag_of_words_visual(dataset)

In [None]:
def generate_pos_tagging(data):
    """Tokenize the texts in *data* and tag each token with its part of speech.

    Args:
        data: Iterable of strings, tokenized with ``tokenize_text``.

    Returns:
        The result of ``nltk.pos_tag`` on the flat token list: a list of
        ``(token, tag)`` pairs.
    """
    return nltk.pos_tag(tokenize_text(data))

# Join the two columns with a space so the last word of "title" is not fused
# with the first word of "selftext" before tokenization.
tags = generate_pos_tagging(cleaned_dataset["title"] + " " + cleaned_dataset["selftext"])
print(tags)

In [None]:
def extract_noun_phrases(tags):
    """Collect the noun phrases found in a POS-tagged token sequence.

    Args:
        tags: List of ``(token, tag)`` pairs, as produced by ``nltk.pos_tag``.

    Returns:
        A list of strings, one per chunk labelled ``NP`` by the grammar
        below, with the chunk's words joined by single spaces.
    """
    # NP: optional determiner, any number of adjectives, one or more nouns.
    grammar = "NP: {<DT>?<JJ>*<NN.*>+}"
    chunked = nltk.RegexpParser(grammar).parse(tags)

    return [
        " ".join(word for word, _ in subtree.leaves())
        for subtree in chunked.subtrees()
        if subtree.label() == "NP"
    ]

# Extract and show the noun phrases found in the tagged tokens.
noun_phrases = extract_noun_phrases(tags)
print(noun_phrases)

def extract_key_phrases(tags):
    """Collect the key phrases found in a POS-tagged token sequence.

    The grammar first chunks noun phrases (``NP``) and then builds key
    phrases (``KP``) out of those chunks and the surrounding tags.

    Args:
        tags: List of ``(token, tag)`` pairs, as produced by ``nltk.pos_tag``.

    Returns:
        A list of strings, one per chunk labelled ``KP``, with the chunk's
        words joined by single spaces.
    """
    grammar = """ NP: {<DT>?<JJ>*<NN.*>+} 
                KP: {<JJ.*>*<NP>+<IN>*<VBG>*<JJ.*|NP>*} 
            """
    chunked = nltk.RegexpParser(grammar).parse(tags)

    key_phrases = []
    for subtree in chunked.subtrees():
        if subtree.label() != "KP":
            continue
        key_phrases.append(" ".join(word for word, _ in subtree.leaves()))
    return key_phrases

# Extract and show the key phrases found in the tagged tokens.
key_phrases = extract_key_phrases(tags)
print(key_phrases)

In [None]:
def generate_ngrams(text, ngram_value):
    """Return all n-grams of length *ngram_value* from *text*.

    Args:
        text: Sequence of tokens to slide the window over.
        ngram_value: Number of tokens per n-gram.

    Returns:
        A list with the items yielded by ``nltk.util.ngrams``.
    """
    return list(ngrams(text, ngram_value))

# Show the 2-grams of the current token list.
print("Bigrams: \n")
print(generate_ngrams(dataset, 2))

# Show the 3-grams of the current token list.
print("Trigrams: \n")
print(generate_ngrams(dataset, 3))