# Semantic Relationship Extraction and Feature Engineering

By Christian Spiteri Gauci and Adam Darmanin

## Literature and References

## Method

* Cleaning of data - Remove special characters, punctuation, and extra whitespace
* Tokenise using spaCy
* Lemmatize tokens and remove stopwords
* Extract relationships - using "nsubj", "ROOT", "dobj"
* Clean the extracted relationships - remove any entries that include only special characters or numbers
* Remove rare entries - keep only those that occur more than 3 times within a subreddit
* Remove entries that occur in more than one subreddit 


In [15]:
import pandas as pd
import numpy as np
import spacy
from collections import defaultdict, Counter
import re
import concurrent.futures


import ast
from tqdm import tqdm
import nltk
from nltk.corpus import words as nltk_words


from lib.sanitze_util import clean_text_batch

# Enable the tqdm-backed ``progress_apply`` used on DataFrames below.
tqdm.pandas()

# spaCy pipeline that create_rel_feature uses to read token dependencies.
nlp = spacy.load("en_core_web_sm")

# NLTK resources needed by this notebook: the English word list and the
# VADER sentiment lexicon.
for _resource in ("words", "vader_lexicon"):
    nltk.download(_resource)


english_words = set(nltk_words.words())

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Data Preprocessing

1. Clean stopwords
2. Lemmatize
3. Convert emoticons
4. Expand contractions

In [16]:
df = pd.read_csv("./data/mental_disorders_reddit.csv")

# Work on a copy restricted to the columns this notebook needs.
columns_needed = ["title", "selftext", "subreddit"]
df_subset = df[columns_needed].copy()


def _is_nonblank_text(value):
    """Return True when *value* is a string containing non-whitespace text."""
    return isinstance(value, str) and value.strip() != ""


# Drop anything that is not a full piece of text in this dataset.
# ``DataFrame.applymap`` is deprecated in recent pandas releases, so the
# element-wise check is applied column by column with ``Series.map``.
mask = df_subset.apply(lambda column: column.map(_is_nonblank_text))
# ``.copy()`` makes the filtered frame independent before columns are added.
df_subset = df_subset[mask["title"] & mask["selftext"]].copy()

# Keep the untouched text next to the cleaned version.
df_subset["raw_title"] = df_subset["title"]
df_subset["raw_selftext"] = df_subset["selftext"]

# Every remaining title/selftext is a non-blank string (see mask above), so no
# NaN filling is required before cleaning.
df_subset["title"] = clean_text_batch(df_subset["title"].tolist())
df_subset["selftext"] = clean_text_batch(df_subset["selftext"].tolist())

  mask = df_subset.applymap(lambda x: isinstance(x, str) and x.strip() != "")
  soup = BeautifulSoup(text, "html.parser")
  soup = BeautifulSoup(text, "html.parser")
Cleaning Pipeline Token: 100%|██████████| 668031/668031 [01:30<00:00, 7368.33it/s]
Cleaning Pipeline Token: 100%|██████████| 668031/668031 [21:07<00:00, 526.98it/s] 


# Feature Engineering

1. Semantic relations between words
2. Mental health labels in tokens

In [18]:
# Mental-health NER model borrowed from the research: https://github.com/zhukovanadezhda/psy-ner/tree/main
# Following paper: https://www.researchgate.net/publication/358779855_Deep_Learning-based_Detection_of_Psychiatric_Attributes_from_German_Mental_Health_Records
# see: https://spacy.io/usage/processing-pipelines
# The model is loaded from a local directory, which must exist before this cell runs.
psy_ner = spacy.load("./model/psy_ner")
from nltk.sentiment import SentimentIntensityAnalyzer
from empath import Empath

# Empath lexicon: scores text against the categories listed in EMPATH_CATS.
lexicon = Empath()
# VADER analyser: provides the "compound" polarity score used as the sentiment feature.
sia = SentimentIntensityAnalyzer()

# Entity labels kept by create_psylabel_feature; entities predicted by
# ``psy_ner`` with any other label are ignored.
MH_NER = [
    "ANXIETY DISORDERS",
    "BIPOLAR DISORDERS",
    "DEPRESSIVE DISORDERS",
    "DISRUPTIVE IMPULSE-CONTROL, AND CONDUCT DISORDERS",
    "DISSOCIATIVE DISORDERS",
    "EATING DISORDERS",
    "NEURO-COGNITIVE DISORDERS",
    "NEURO-DEVELOPMENTAL DISORDERS",
    "OBSESSIVE-COMPULSIVE AND RELATED DISORDERS",
    "PERSONALITY DISORDERS",
    "PSYCHEDELIC DRUGS",
    "SCHIZOPHRENIA SPECTRUM AND OTHER PSYCHOTIC DISORDERS",
    "SEXUAL DYSFUNCTIONS",
    "SLEEP-WAKE DISORDERS",
    "SOMATIC SYMPTOM RELATED DISORDERS",
    "SUBSTANCE-RELATED DISORDERS",
    "SYMPTOMS",
    "TRAUMA AND STRESS RELATED DISORDERS",
]


def create_rel_feature(row):
    """Collect subject/object dependency triples from a post.

    Args:
        row: Row-like object exposing ``"title"`` and ``"selftext"``.

    Returns:
        A list of ``(head_text, dependency, token_text)`` tuples for every
        token whose dependency is ``nsubj`` or ``dobj``. Triples from the
        title come first, followed by those from the body. A field that is
        not a string contributes no triples.
    """
    wanted_deps = ("nsubj", "dobj")

    def _triples(text):
        # Non-string values (e.g. NaN) are skipped rather than parsed.
        if not isinstance(text, str):
            return []
        parsed = nlp(text)
        return [
            (tok.head.text, tok.dep_, tok.text)
            for sentence in parsed.sents
            for tok in sentence
            if tok.dep_ in wanted_deps
        ]

    return _triples(row["title"]) + _triples(row["selftext"])


def create_psylabel_feature(row):
    """Extract mental-health entity mentions from a post.

    The title and body are joined with a single space and run through the
    ``psy_ner`` pipeline. Only entities whose label appears in ``MH_NER`` are
    kept.

    Args:
        row: Row-like object exposing string ``"title"`` and ``"selftext"``.

    Returns:
        A dict mapping each detected label to the sorted list of distinct
        entity texts found for it. The dict is empty when nothing matches.
    """

    def _extract_psy_labels(text):
        if not isinstance(text, str):
            return {}

        mentions = {}
        for ent in psy_ner(text).ents:
            if ent.label_ in MH_NER:
                mentions.setdefault(ent.label_, set()).add(ent.text)

        # Sets of strings iterate in an order that can differ between
        # interpreter runs; sorting keeps the cached feature reproducible.
        return {label: sorted(texts) for label, texts in mentions.items()}

    combined_text = row["title"] + " " + row["selftext"]
    return _extract_psy_labels(combined_text)


# See: https://github.com/Ejhfast/empath-client/tree/master
# Empath categories requested from ``lexicon.analyze`` when building the
# "emotional_categories" feature; each one becomes a key in the score dict.
EMPATH_CATS = [
    "help",
    "violence",
    "sleep",
    "medical_emergency",
    "cold",
    "hate",
    "cheerfulness",
    "aggression",
    "envy",
    "anticipation",
    "health",
    "pride",
    "nervousness",
    "weakness",
    "horror",
    "swearing_terms",
    "suffering",
    "sexual",
    "fear",
    "monster",
    "irritability",
    "exasperation",
    "ridicule",
    "neglect",
    "fight",
    "dominant_personality",
    "injury",
    "rage",
    "science",
    "work",
    "optimism",
    "warmth",
    "sadness",
    "emotional",
    "joy",
    "shame",
    "torment",
    "anger",
    "strength",
    "ugliness",
    "pain",
    "negative_emotion",
    "positive_emotion",
]


def create_sentiment_feature(row):
    """Return the VADER compound sentiment score of a post.

    The title and body are joined with a single space before scoring.

    Args:
        row: Row-like object exposing string ``"title"`` and ``"selftext"``.

    Returns:
        The ``"compound"`` value reported by ``sia.polarity_scores``, or NaN
        if the analyser returns ``None``.
    """
    combined_text = row["title"] + " " + row["selftext"]
    score = sia.polarity_scores(combined_text)
    # ``np.nan`` rather than ``np.NaN``: the capitalised alias was removed in
    # NumPy 2.0 and would raise AttributeError if this branch were reached.
    return score["compound"] if score is not None else np.nan


def create_emotional_categories_scores_feature(row):
    """Score a post against the Empath categories in ``EMPATH_CATS``.

    The title and body are joined with a single space and analysed with
    ``normalize=True``.

    Args:
        row: Row-like object exposing string ``"title"`` and ``"selftext"``.

    Returns:
        A dict mapping category name to its score rounded to two decimals,
        or an empty dict when the lexicon returns ``None``.
    """
    combined_text = row["title"] + " " + row["selftext"]
    raw_scores = lexicon.analyze(combined_text, categories=EMPATH_CATS, normalize=True)

    # Guard against the analyser yielding no result at all.
    if raw_scores is None:
        return {}

    rounded = {}
    for category, value in raw_scores.items():
        rounded[category] = round(value, 2)
    return rounded


# def create_emotional_categories_scores_feature(row):
#     def _get_empath_sentiment(text):
#         scores = lexicon.analyze(text, categories=EMPATH_CATS, normalize=True)
#         return {category: round(score, 2) for category, score in scores.items()}

#     combined_text = row["title"] + " " + row["selftext"]
#     combined_labels = _get_empath_sentiment(combined_text)
#     return combined_labels


# Feature column -> row-wise builder, applied in this order (one progress bar
# per feature).
feature_builders = {
    "psy_labels": create_psylabel_feature,
    "sentiment": create_sentiment_feature,
    "emotional_categories": create_emotional_categories_scores_feature,
    "semantic_relationships": create_rel_feature,
}
for column_name, builder in feature_builders.items():
    df_subset[column_name] = df_subset.progress_apply(builder, axis=1)


# Cache it for further processing down the line.
df_subset.to_csv("./data/SemanticsRel.csv", index=False)

# Preview the cleaned text next to the engineered features.
preview_columns = ["title", "selftext", *feature_builders]
df_subset[preview_columns].head(15)

100%|██████████| 668031/668031 [22:10<00:00, 502.20it/s]
100%|██████████| 668031/668031 [03:49<00:00, 2905.22it/s]
100%|██████████| 668031/668031 [08:57<00:00, 1242.07it/s] 
100%|██████████| 668031/668031 [1:25:42<00:00, 129.91it/s]


Unnamed: 0,title,selftext,psy_labels,sentiment,emotional_categories,semantic_relationships
0,life pointless,think important life relationship like absolut...,{},0.9501,"{'help': 0.0, 'violence': 0.0, 'sleep': 0.0, '...","[(ask, nsubj, relationship), (ask, dobj, goals)]"
1,cold rage,hello fellow friends bpd spectrum discouraged ...,{'SYMPTOMS': ['anger']},-0.984,"{'help': 0.01, 'violence': 0.01, 'sleep': 0.01...","[(discouraged, nsubj, spectrum), (discouraged,..."
2,know,f20 bf m20 told today said wish better likes n...,{},0.8126,"{'help': 0.0, 'violence': 0.0, 'sleep': 0.0, '...","[(told, nsubj, m20), (said, nsubj, think), (kn..."
3,help opinions advice,okay open things past proud person anymore sto...,{'SYMPTOMS': ['anxiety']},0.9919,"{'help': 0.02, 'violence': 0.02, 'sleep': 0.0,...","[(help, dobj, opinions), (help, dobj, advice),..."
4,help,removed,{},0.4019,"{'help': 0.5, 'violence': 0.0, 'sleep': 0.0, '...",[]
5,ex got diagnosed bpd,going detail diagnosis explains sudden break u...,{},0.9611,"{'help': 0.03, 'violence': 0.03, 'sleep': 0.0,...","[(going, dobj, diagnosis), (explains, dobj, up..."
6,misdiagnosis bpd common asking diagnosed years...,reposting larger sub recommendation people sma...,{'NEURO-DEVELOPMENTAL DISORDERS': ['autism']},0.9231,"{'help': 0.0, 'violence': 0.01, 'sleep': 0.0, ...","[(think, nsubj, bpd), (asking, dobj, years), (..."
7,trouble identifying sexual orientation bpd lik...,grew dating men realized teenager like women k...,{},0.9349,"{'help': 0.0, 'violence': 0.03, 'sleep': 0.0, ...","[(likes, nsubj, trouble), (identifying, dobj, ..."
8,needing advice,posted sub earlier today having trouble believ...,{'SYMPTOMS': ['anxiety']},-0.7003,"{'help': 0.0, 'violence': 0.02, 'sleep': 0.0, ...","[(needing, dobj, advice), (posted, dobj, sub),..."
9,bpd,removed,{},0.0,"{'help': 0.0, 'violence': 0.0, 'sleep': 0.0, '...",[]


# Create the Nodes

Nodes have edges, attributes for MH disorders, weights and sentiment.

In [None]:
# Flatten every extracted relationship into one edge record per row, tagged
# with the post's subreddit and its mental-health labels.
clean_data = []

# Iterate the three needed columns directly instead of ``iterrows`` (which
# builds a Series per row), and give tqdm the total so it can show progress
# and an ETA rather than a bare counter.
post_fields = zip(
    df_subset["subreddit"],
    df_subset["psy_labels"],
    df_subset["semantic_relationships"],
)
for subreddit, labels, relationships in tqdm(post_fields, total=len(df_subset)):
    for rel in relationships:
        if len(rel) != 3:  # Expect (head word, dependency, dependent word)
            print(f"Issue with relationship: {rel}")
            continue
        word1, dep, word2 = rel
        clean_data.append(
            {
                "Subreddit": subreddit,
                "Word1": word1,
                "Dependency": dep,
                "Word2": word2,
                "MHlabels": labels,
            }
        )

# Naming the columns keeps the frame's schema intact even when no relationship
# was extracted, so the later group-by does not fail on missing columns.
clean_df = pd.DataFrame(
    clean_data,
    columns=["Subreddit", "Word1", "Dependency", "Word2", "MHlabels"],
)
clean_df.to_csv("./data/GraphData.csv", index=False)
clean_df.head(15)

668031it [00:28, 23093.33it/s]


Unnamed: 0,Subreddit,Word1,Dependency,Word2,MHlabels
0,BPD,ask,nsubj,relationship,{}
1,BPD,ask,dobj,goals,{}
2,BPD,discouraged,nsubj,spectrum,{'SYMPTOMS': ['anger']}
3,BPD,discouraged,dobj,characteristics,{'SYMPTOMS': ['anger']}
4,BPD,wondering,nsubj,levels,{'SYMPTOMS': ['anger']}
5,BPD,experiencing,dobj,anger,{'SYMPTOMS': ['anger']}
6,BPD,found,nsubj,way,{'SYMPTOMS': ['anger']}
7,BPD,found,dobj,blame,{'SYMPTOMS': ['anger']}
8,BPD,find,nsubj,understanding,{'SYMPTOMS': ['anger']}
9,BPD,tend,nsubj,ones,{'SYMPTOMS': ['anger']}


In [None]:
# Count how often each (subreddit, head word, dependency, dependent word)
# combination was extracted.
relation_keys = ["Word1", "Dependency", "Word2"]
grouped = (
    clean_df.groupby(["Subreddit"] + relation_keys)
    .size()
    .reset_index(name="Occurrence")
)

# A dictionary-based filter on the NLTK word list was considered here but is
# not applied: that list lacks domain terms (e.g. medication names) that
# should be kept.

# Accept only words made up entirely of ASCII letters; this drops numbers,
# punctuation and mixed tokens.
pattern = re.compile(r"^[a-zA-Z]+$")
word1_is_alpha = grouped["Word1"].str.match(pattern)
word2_is_alpha = grouped["Word2"].str.match(pattern)
filtered_relationships = grouped[word1_is_alpha & word2_is_alpha]

# Keep relationships seen more than three times within a subreddit.
is_frequent = filtered_relationships["Occurrence"] > 3
common_relationships = filtered_relationships[is_frequent]


# For each relationship, count the distinct subreddits in which it is common.
relationship_counts = (
    common_relationships.groupby(relation_keys)["Subreddit"]
    .nunique()
    .reset_index()
)

# Relationships that are common in exactly one subreddit.
single_subreddit_relationships = relationship_counts[
    relationship_counts["Subreddit"].eq(1)
]

# Inner join back onto the per-subreddit rows so that only those
# subreddit-specific relationships remain.
# NOTE(review): both frames carry a "Subreddit" column, so the result exposes
# "Subreddit_x" (the subreddit name) and "Subreddit_y" (the count, always 1).
filtered_common_relationships = common_relationships.merge(
    single_subreddit_relationships,
    on=relation_keys,
    how="inner",
)

print(filtered_common_relationships)
filtered_common_relationships.to_csv("SemanticsRelFiltered.csv", index=False)

          Subreddit_x    Word1 Dependency           Word2  Occurrence  \
0             Anxiety     able       dobj         anxiety          10   
1             Anxiety     able       dobj          breath           4   
2             Anxiety     able       dobj         breathe          14   
3             Anxiety     able       dobj           sleep           4   
4             Anxiety     able      nsubj         anxiety          12   
...               ...      ...        ...             ...         ...   
104521  schizophrenia    worms      nsubj            tear           5   
104522  schizophrenia    worse      nsubj  hallucinations           6   
104523  schizophrenia    worse      nsubj          voices           6   
104524  schizophrenia    write       dobj   schizophrenia           4   
104525  schizophrenia  writing       dobj   schizophrenia          12   

        Subreddit_y  
0                 1  
1                 1  
2                 1  
3                 1  
4            

Some entries have a Word1 or Word2 that is not an actual word but a number or a misspelled word. The occurrence threshold (more than 3) could have removed these, but some remained. At this point I was going to use the NLTK library to check for English words; however, upon implementing it, some important words such as 'zyprexa' were eliminated because they are not included in the corpus. These are important entries and should therefore remain, so a cruder filter was implemented instead, which removes only entries containing numbers or special characters.

In [None]:
# Lower-cased NLTK word list, used to check whether a domain term is covered.
english_vocab = {word.lower() for word in nltk.corpus.words.words()}
"zyprexa" in english_vocab

False