# Semantic Relationship extraction

By Christian Spiteri Gauci

## Literature and References

## Method

* Cleaning of data - remove special characters, punctuation, and extra whitespace
* Tokenise using spaCy
* Lemmatize tokens and remove stopwords
* Extract relationships - using the "nsubj", "ROOT", and "dobj" dependency labels
* Clean the extracted relationships - remove any entries that include only special characters or numbers
* Remove entries that occur only rarely - keep those that occur more than 3 times in each subreddit
* Remove entries that occur in more than one subreddit


In [None]:
# !pip install nltk

In [None]:
import pandas as pd
import spacy
from collections import defaultdict, Counter
import re
import ast
import nltk
from nltk.corpus import words as nltk_words
from lib.sanitze_util import clean_text_batch

# Fetch the NLTK "words" corpus so it can be read below
nltk.download("words")

# Vocabulary of English words from the NLTK corpus, held as a set
english_words = {*nltk_words.words()}

# spaCy's small English pipeline, used by the text-processing functions below
nlp = spacy.load("en_core_web_sm")

In [None]:
# Function for text preprocessing
def clean_text(text):
    """Normalise a piece of text and return its lemmatised, stopword-free tokens.

    The text is lower-cased, stripped of every character that is not a letter,
    digit, or whitespace, collapsed to single spaces, and then run through the
    module-level spaCy pipeline ``nlp``.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        list: The lemma of every token that is not a stopword, in document
        order. Empty when the cleaned text is empty.
    """
    # NaN is the only float that is not equal to itself
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)

    # str.lower() returns a new string, so the result has to be kept
    text = text.lower()

    # Remove special characters, punctuation, and extra whitespace
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()

    # Tokenize the text using spaCy
    doc = nlp(text)

    # Lemmatize tokens and remove stopwords
    return [token.lemma_ for token in doc if not token.is_stop]

In [None]:

def extract_relations(text):
    """Extract entity pairs and dependency triples from ``text``.

    The text is parsed with the module-level spaCy pipeline ``nlp``.

    Args:
        text: The text to parse.

    Returns:
        list: First a ``(entity_text, label)`` tuple for every entity whose
        label is one of the mental-health labels listed below, then a
        ``(head_text, dependency, token_text)`` tuple for every token whose
        dependency label is "nsubj", "ROOT", or "dobj".
    """
    entity_labels = [
        "MENTAL_HEALTH_TERM",
        "SYMPTOM",
        "TREATMENT",
    ]
    dependency_labels = ["nsubj", "ROOT", "dobj"]

    parsed = nlp(text)

    # Two-element tuples: named entities carrying one of the wanted labels
    entity_pairs = [
        (entity.text, entity.label_)
        for entity in parsed.ents
        if entity.label_ in entity_labels
    ]

    # Three-element tuples: (head, dependency, token), gathered sentence by sentence
    dependency_triples = [
        (tok.head.text, tok.dep_, tok.text)
        for sentence in parsed.sents
        for tok in sentence
        if tok.dep_ in dependency_labels
    ]

    return entity_pairs + dependency_triples

def create_new_features(row):
    """Return the relations of a post's title followed by those of its body.

    Args:
        row: A mapping-like record providing "title" and "selftext" entries.

    Returns:
        list: The result of ``extract_relations`` on the title, concatenated
        with the result of ``extract_relations`` on the selftext.
    """
    return extract_relations(row["title"]) + extract_relations(row["selftext"])

In [None]:
# Read the CSV file
df = pd.read_csv("./data/mental_disorders_reddit.csv")

# Keep only the columns used in this analysis; copy so that the assignments
# below do not write into a view of `df`
columns_needed = ["title", "selftext", "subreddit"]
df_subset = df[columns_needed].copy()

# Preprocess text columns (title and selftext); missing values are replaced
# with empty strings before they are handed to the batch cleaner
df_subset["title"] = clean_text_batch(df_subset["title"].fillna("").tolist())
df_subset["selftext"] = clean_text_batch(df_subset["selftext"].fillna("").tolist())

# Display a preview of the preprocessed DataFrame
print(df_subset.head())

# Apply the function to create a new feature 'semantic_relationships'
df_subset["semantic_relationships"] = df_subset.apply(create_new_features, axis=1)

# Display the updated DataFrame with the new feature
print(df_subset.head())

# Save for future analysis. The file goes into ./data because the cell that
# reloads it reads "./data/SemanticsRel.csv"; writing to the working
# directory left that cell without its input (or reading a stale copy).
df_subset.to_csv("./data/SemanticsRel.csv", index=False)

In [None]:
# Read the CSV file
df_svo = pd.read_csv("./data/SemanticsRel.csv")

# Create a new DataFrame with specific columns
columns_needed = ["subreddit", "semantic_relationships"]
df_svo = df_svo[columns_needed].copy()

# Convert string representations to actual lists of tuples.
# This has to happen before the emptiness check: in the CSV every list is
# stored as text, and the text "[]" has length 2, so measuring the string
# would never identify an empty list.
df_svo["semantic_relationships"] = df_svo["semantic_relationships"].apply(
    ast.literal_eval
)

# Filter out rows with empty relationships
df_svo = df_svo[df_svo["semantic_relationships"].apply(len) > 0]

print(df_svo.head())
# df_svo.to_csv('svo.csv', index=False)

In [None]:
from tqdm import tqdm

# One flat record per (subreddit, head word, dependency, dependent word)
clean_data = []

for _, post in tqdm(df_svo.iterrows()):
    source_subreddit = post["subreddit"]

    for relation in post["semantic_relationships"]:
        # Only three-element tuples can be unpacked into the record;
        # anything else is reported and skipped
        if len(relation) != 3:
            print(f"Issue with relationship: {relation}")
            continue

        head_word, dependency, dependent_word = relation
        clean_data.append(
            {
                "Subreddit": source_subreddit,
                "Word1": head_word,
                "Dependency": dependency,
                "Word2": dependent_word,
            }
        )

clean_df = pd.DataFrame(clean_data)

print(clean_df)

In [None]:
# Grouping the DataFrame by 'Subreddit', 'Word1', 'Dependency', and 'Word2' and counting occurrences
grouped = (
    clean_df.groupby(["Subreddit", "Word1", "Dependency", "Word2"])
    .size()
    .reset_index(name="Occurrence")
)

# Disabled: filter out entries where Word1 and Word2 are not in the English dictionary
# filtered_relationships = multiple_subreddit_relationships[
#     (multiple_subreddit_relationships['Word1'].isin(english_words)) &
#     (multiple_subreddit_relationships['Word2'].isin(english_words))
# ]

# Regular expression matching words made up of ASCII letters only
# (digits and special characters are rejected)
pattern = re.compile(r"^[a-zA-Z]+$")

# Keep only the entries where both Word1 and Word2 are purely alphabetic
filtered_relationships = grouped[
    (grouped["Word1"].str.match(pattern)) & (grouped["Word2"].str.match(pattern))
]

# Keep relationships occurring more than three times within a subreddit
common_relationships = filtered_relationships[filtered_relationships["Occurrence"] > 3]


# Grouping the DataFrame by 'Word1', 'Dependency', and 'Word2' to count the
# number of distinct subreddits each relationship appears in; the count is
# stored in the 'Subreddit' column of the result
relationship_counts = (
    common_relationships.groupby(["Word1", "Dependency", "Word2"])
    .agg({"Subreddit": "nunique"})
    .reset_index()
)

# Filter relationships occurring in only one subreddit
single_subreddit_relationships = relationship_counts[
    relationship_counts["Subreddit"] == 1
]

# Merge to keep only the relationships occurring in one subreddit from
# common_relationships. Only the key columns of the right-hand frame are
# used: its 'Subreddit' column holds the count (always 1 here), and merging
# it would split the output into 'Subreddit_x' (name) and 'Subreddit_y' (count).
relationship_keys = ["Word1", "Dependency", "Word2"]
filtered_common_relationships = pd.merge(
    common_relationships,
    single_subreddit_relationships[relationship_keys],
    on=relationship_keys,
    how="inner",
)

print(filtered_common_relationships)
filtered_common_relationships.to_csv("SemanticsRelFiltered.csv", index=False)

Some entries have a Word1 or Word2 that is not an actual word but a number or a misspelled word. Although the requirement that a relationship occur more than 3 times could have filtered these out, some remained. At this point I was going to use the NLTK library to check for English words, but upon implementing this, some important words such as 'zyprexa' were eliminated because they are not included in the corpus. However, this is an important entry and should therefore remain, so a cruder filter that removes number-only entries was implemented instead.

In [None]:
# Lower-cased NLTK word list, used to check whether a given term is covered by the corpus
english_vocab = {word.lower() for word in nltk.corpus.words.words()}
"zyprexa" in english_vocab