In [9]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
     - -------------------------------------- 0.1/1.5 MB 656.4 kB/s eta 0:00:03
     --- ------------------------------------ 0.1/1.5 MB 901.1 kB/s eta 0:00:02
     ----- ---------------------------------- 0.2/1.5 MB 1.1 MB/s eta 0:00:02
     -------- ------------------------------- 0.3/1.5 MB 1.4 MB/s eta 0:00:01
     -------- ------------------------------- 0.3/1.5 MB 1.4 MB/s eta 0:00:01
     ------------- -------------------------- 0.5/1.5 MB 1.5 MB/s eta 0:00:01
     -------------- ------------------------- 0.5/1.5 MB 1.6 MB/s eta 0:00:01
     ------------------ --------------------- 0.7/1.5 MB 1.7 MB/s eta 0:00:01
     --------------------- ------------------ 0.8/1.5 MB 1.8 MB/s eta 0:00:01
     --


[notice] A new release of pip is available: 23.2.1 -> 23.3.2
[notice] To update, run: C:\Users\chris\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [11]:
import pandas as pd
import spacy
from collections import defaultdict, Counter
import re
import ast
import nltk
from nltk.corpus import words as nltk_words

# Fetch the NLTK "words" corpus and keep its entries in a set so that
# membership checks are cheap.
nltk.download('words')
english_words = set(nltk_words.words())

# spaCy's small English pipeline; the text-processing helpers in this notebook
# read it through the module-level name `nlp`.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


In [17]:
# Function for text preprocessing
def clean_text(text):
    """Normalise one text value into a list of lemmatised, non-stop-word tokens.

    Parameters
    ----------
    text : Any
        Raw text. Missing values (``None`` or a float NaN) are treated as
        empty text; any other non-string value is converted with ``str()``.

    Returns
    -------
    list of str
        ``lemma_`` of every token spaCy produces for the lower-cased text once
        special characters and punctuation have been removed, skipping tokens
        flagged as stop words. Empty when nothing is left after cleaning.
    """
    # Missing values carry no text. NaN is the only float that is unequal to
    # itself; without this guard it would fail in re.sub or become "nan".
    if text is None or (isinstance(text, float) and text != text):
        return []

    # str.lower() returns a new string, so the result has to be kept.
    text = (text if isinstance(text, str) else str(text)).lower()

    # Remove special characters, punctuation, and extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Nothing left to analyse: skip the spaCy pipeline entirely.
    if not text:
        return []

    # Tokenize with spaCy, then lemmatize and drop stop words
    doc = nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop]

In [23]:
def clean_text_batch(texts):
    """Clean many texts in a single pass through the spaCy pipeline.

    Each text is tokenised by ``nlp.pipe`` (batches of 128). Every token that
    spaCy does not flag as a stop word is stripped, lower-cased and purged of
    characters other than letters, digits and whitespace; tokens that end up
    empty are dropped. The surviving tokens are joined with single spaces.

    Parameters
    ----------
    texts : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in input order.
    """
    def _normalise(token):
        # Surface form -> trimmed, lower-cased, special characters removed.
        return re.sub(r'[^a-zA-Z0-9\s]', '', token.text.strip().lower())

    return [
        ' '.join(
            word
            for word in (_normalise(tok) for tok in parsed if not tok.is_stop)
            if word
        )
        for parsed in nlp.pipe(texts, batch_size=128)
    ]

In [18]:
# Function to extract semantic relationships
def extract_relations(text):
    """Return dependency triples for selected tokens of ``text``.

    The text is parsed with spaCy and every sentence is scanned. A token whose
    dependency label is ``nsubj``, ``ROOT`` or ``dobj`` contributes one tuple
    ``(head text, dependency label, token text)``.

    Parameters
    ----------
    text : str
        Text to parse.

    Returns
    -------
    list of tuple
        The triples in document order.
    """
    wanted_labels = ("nsubj", "ROOT", "dobj")  # Adjust based on what relationships you want
    return [
        (tok.head.text, tok.dep_, tok.text)
        for sentence in nlp(text).sents
        for tok in sentence
        if tok.dep_ in wanted_labels
    ]

# Create new features based on semantic relationships
def create_new_features(row):
    """Gather the dependency triples of a post's title and body.

    Parameters
    ----------
    row : mapping
        Must provide the keys ``'title'`` and ``'selftext'``.

    Returns
    -------
    list
        The triples extracted from the title followed by those extracted from
        the selftext.
    """
    return [
        relation
        for column in ('title', 'selftext')
        for relation in extract_relations(row[column])
    ]


In [24]:
# Load the raw posts and keep a private copy of the three columns used here.
columns_needed = ['title', 'selftext', 'subreddit']
df = pd.read_csv('mental_disorders_reddit.csv')
df_subset = df.loc[:, columns_needed].copy()

# Clean both free-text columns in batches; missing values become '' first so
# that every entry handed to the cleaner is a string.
for text_column in ('title', 'selftext'):
    raw_texts = df_subset[text_column].fillna('').tolist()
    df_subset[text_column] = clean_text_batch(raw_texts)

# Preview of the cleaned text columns
print(df_subset.head())

# One list of dependency triples per post (title and selftext combined)
df_subset['semantic_relationships'] = df_subset.apply(create_new_features, axis=1)

# Preview including the new feature column
print(df_subset.head())

# Persist the result so later cells can start from the file
df_subset.to_csv('SemanticsRel.csv', index=False)

                                 title  \
0  Life is so pointless without others   
1                           Cold rage?   
2                I don’t know who I am   
3              HELP! Opinions! Advice!   
4                                 help   

                                            selftext subreddit  
0  Does anyone else think the most important part...       BPD  
1  Hello fellow friends 😄\n\nI'm on the BPD spect...       BPD  
2  My [F20] bf [M20] told me today (after I said ...       BPD  
3  Okay, I’m about to open up about many things I...       BPD  
4                                          [removed]       BPD  


In [2]:
# Read the CSV file
df_svo = pd.read_csv('SemanticsRel.csv')

# Create a new DataFrame with specific columns
columns_needed = ['subreddit', 'semantic_relationships']
df_svo = df_svo[columns_needed].copy()

# Convert string representations to actual lists of tuples.
# This has to happen BEFORE the emptiness filter: the CSV stores every list as
# text, and the text "[]" has length 2, so a length check on the raw strings
# never removes anything.
df_svo['semantic_relationships'] = df_svo['semantic_relationships'].apply(ast.literal_eval)

# Filter out rows with empty relationships (now a real list-length check)
df_svo = df_svo[df_svo['semantic_relationships'].apply(len) > 0]

print(df_svo.head())
# df_svo.to_csv('svo.csv', index=False)

  subreddit                             semantic_relationships
0       BPD  [(pointless, ROOT, pointless), (think, ROOT, t...
1       BPD  [(rage, ROOT, rage), (discouraged, nsubj, spec...
2       BPD  [(know, ROOT, know), (told, nsubj, m20), (told...
3       BPD  [(help, ROOT, help), (help, dobj, opinions), (...
4       BPD     [(help, ROOT, help), (removed, ROOT, removed)]


In [4]:
clean_data = []

# Walk the two columns in lockstep and flatten every relationship into its own
# record: one row per (subreddit, head word, dependency label, token).
for subreddit, relationships in zip(df_svo['subreddit'], df_svo['semantic_relationships']):
    for rel in relationships:
        if len(rel) != 3:
            # Anything that is not a three-element tuple is reported and skipped
            print(f"Issue with relationship: {rel}")
            continue
        head_word, dependency, dependent_word = rel
        clean_data.append({
            'Subreddit': subreddit,
            'Word1': head_word,
            'Dependency': dependency,
            'Word2': dependent_word,
        })

clean_df = pd.DataFrame(clean_data)
print(clean_df)

             Subreddit      Word1 Dependency         Word2
0                  BPD  pointless       ROOT     pointless
1                  BPD      think       ROOT         think
2                  BPD      think       dobj  relationship
3                  BPD        ask      nsubj     therapist
4                  BPD    imagine      nsubj         goals
...                ...        ...        ...           ...
9979456  mentalillness      think      nsubj           run
9979457  mentalillness     having       dobj         think
9979458  mentalillness       kill       dobj       uranium
9979459  mentalillness  construct       dobj            fu
9979460  mentalillness       fuck       ROOT          fuck

[9979461 rows x 4 columns]


In [24]:
# Count how often each (Subreddit, Word1, Dependency, Word2) combination occurs
group_keys = ['Subreddit', 'Word1', 'Dependency', 'Word2']
grouped = clean_df.groupby(group_keys).size().reset_index(name='Occurrence')

# A dictionary-based filter on the NLTK word list was tried here and dropped;
# the purely alphabetic pattern below is used instead.
pattern = re.compile(r'^[a-zA-Z]+$')

# Keep only rows whose Word1 AND Word2 consist solely of ASCII letters
word1_is_alpha = grouped['Word1'].str.match(pattern)
word2_is_alpha = grouped['Word2'].str.match(pattern)
filtered_relationships = grouped[word1_is_alpha & word2_is_alpha]

# Keep relationships seen more than three times within a subreddit
is_frequent = filtered_relationships['Occurrence'] > 3
common_relationships = filtered_relationships.loc[is_frequent]

# For every (Word1, Dependency, Word2) triple, count in how many distinct
# subreddits it is frequent
triple_keys = ['Word1', 'Dependency', 'Word2']
relationship_counts = (
    common_relationships
    .groupby(triple_keys)['Subreddit']
    .nunique()
    .reset_index()
)

# Triples that are frequent in exactly one subreddit
appears_once = relationship_counts['Subreddit'] == 1
single_subreddit_relationships = relationship_counts.loc[appears_once]

# Inner merge keeps only those subreddit-specific triples. Both frames carry a
# 'Subreddit' column, so the result holds Subreddit_x (the subreddit name) and
# Subreddit_y (the distinct-subreddit count, which is 1 for every kept row).
filtered_common_relationships = common_relationships.merge(
    single_subreddit_relationships,
    on=triple_keys,
    how='inner',
)

print(filtered_common_relationships)
filtered_common_relationships.to_csv('SemanticsRelFiltered.csv', index=False)









# # Count occurrences of SVO relationships within each subreddit
# subreddit_counts = defaultdict(Counter)
# for idx, row in df_svo.iterrows():
#     subreddit = row['subreddit']
#     for svo in row['semantic_relationships']:
#         subreddit_counts[subreddit][svo] += 1

# # Filter SVO relationships with count > 3 within each subreddit
# frequent_relationships = {}
# for subreddit, svo_counter in subreddit_counts.items():
#     frequent_relationships[subreddit] = {
#         svo: count for svo, count in svo_counter.items() if count > 3
#     }

# # Display frequent relationships per subreddit
# for subreddit, relationships in frequent_relationships.items():
#     print(f"Subreddit: {subreddit}")
#     for svo, count in relationships.items():
#         print(f"SVO: {svo}, Count: {count}")

          Subreddit_x    Word1 Dependency           Word2  Occurrence  \
0             Anxiety  abdomen       ROOT         abdomen           5   
1             Anxiety     abit       ROOT            abit           4   
2             Anxiety     able       dobj         anxiety          12   
3             Anxiety     able       dobj          breath           6   
4             Anxiety     able       dobj         breathe          14   
...               ...      ...        ...             ...         ...   
109713  schizophrenia    worms      nsubj            skin           5   
109714  schizophrenia    worse      nsubj  hallucinations           6   
109715  schizophrenia    worse      nsubj          voices           5   
109716  schizophrenia  writing       dobj   schizophrenia          13   
109717  schizophrenia  written       dobj           words           4   

        Subreddit_y  
0                 1  
1                 1  
2                 1  
3                 1  
4            

Some entries have a `Word1` or `Word2` that is not an actual word but a number or a misspelled word. Although the requirement that a relationship occur more than three times could have filtered these out, some still remained. At this point I intended to use the NLTK library to check for English words, but upon implementing this, some important words such as 'zyprexa' were eliminated because they are not included in the corpus. These are important entries and should therefore remain. Instead, a cruder filter was implemented, which removes only entries that contain digits.

In [19]:
# Lower-cased vocabulary built from the NLTK "words" corpus
english_vocab = {w.lower() for w in nltk.corpus.words.words()}

# Probe the spelling the analysis notes refer to ('zyprexa'). The previous probe
# used 'zyprexia', which would be False whatever the corpus contains.
'zyprexa' in english_vocab

False