In [10]:
import ast
import re
from collections import defaultdict, Counter

import pandas as pd
import spacy


# Load the English spaCy model (`en_core_web_sm`) once and bind it to the
# module-level name `nlp`; the text-processing functions in this notebook
# call this shared object rather than loading the model themselves.
nlp = spacy.load("en_core_web_sm")

In [17]:
# Function for text preprocessing 
def clean_text(text):
    """Normalize one piece of text and return its lemmatized, non-stop-word tokens.

    The text is lowercased, stripped of every character that is not an ASCII
    letter, digit or whitespace, whitespace-collapsed, and then passed through
    the shared spaCy pipeline ``nlp``.

    Args:
        text: The value to clean. ``None`` and float NaN (a missing value) are
            treated as empty text; any other non-string value is converted
            with ``str()`` before cleaning.

    Returns:
        list[str]: The lemma of every token spaCy does not flag as a stop word.
    """
    # A missing value carries no text. NaN is the only float that is not equal
    # to itself, which lets us detect it without importing anything extra.
    if text is None or (isinstance(text, float) and text != text):
        text = ''
    elif not isinstance(text, str):
        text = str(text)

    # str.lower() returns a new string, so the result has to be assigned back.
    text = text.lower()

    # Remove special characters and punctuation, then collapse extra whitespace
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize the text using spaCy
    doc = nlp(text)

    # Lemmatize tokens and remove stopwords
    return [token.lemma_ for token in doc if not token.is_stop]

In [23]:
def clean_text_batch(texts):
    """Clean many texts in one pass through the shared spaCy pipeline.

    Each text is tokenized via ``nlp.pipe`` (batches of 128). For every token
    that is not a stop word, the token text is stripped, lowercased and has
    all characters other than ASCII letters, digits and whitespace removed;
    tokens that end up empty are dropped.

    Args:
        texts: Iterable of strings to clean.

    Returns:
        list[str]: One space-joined string of cleaned tokens per input text,
        in input order.
    """
    results = []

    for parsed in nlp.pipe(texts, batch_size=128):
        # Normalize every non-stop-word token of this document
        normalized = (
            re.sub(r'[^a-zA-Z0-9\s]', '', tok.text.strip().lower())
            for tok in parsed
            if not tok.is_stop
        )
        # Drop tokens that became empty, then rebuild the document text
        results.append(' '.join(word for word in normalized if word))

    return results

In [18]:
# Function to extract semantic relationships
def extract_relations(text):
    """Collect selected dependency relations from a text.

    The text is parsed with the shared spaCy pipeline ``nlp`` and every token
    whose dependency label is ``nsubj``, ``ROOT`` or ``dobj`` contributes one
    tuple.

    Args:
        text: The string to parse.

    Returns:
        list[tuple[str, str, str]]: ``(head text, dependency label, token
        text)`` tuples in sentence and token order.
    """
    # Adjust this set of labels based on which relationships you want
    wanted_deps = ("nsubj", "ROOT", "dobj")
    return [
        (tok.head.text, tok.dep_, tok.text)
        for sentence in nlp(text).sents
        for tok in sentence
        if tok.dep_ in wanted_deps
    ]

# Create new features based on semantic relationships
def create_new_features(row):
    """Build the semantic-relationship feature for one row.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with ``'title'`` and
            ``'selftext'`` entries.

    Returns:
        list: The relations extracted from the title followed by those
        extracted from the self text.
    """
    # Title relations come first, then the body's relations
    return extract_relations(row['title']) + extract_relations(row['selftext'])


In [24]:
# Load the raw Reddit posts from disk
df = pd.read_csv('mental_disorders_reddit.csv')

# Keep only the columns this analysis uses, as an independent copy
columns_needed = ['title', 'selftext', 'subreddit']
df_subset = df.loc[:, columns_needed].copy()

# Preprocess both free-text columns; missing values become empty strings
# before they are handed to the batch cleaner
df_subset = df_subset.assign(
    title=clean_text_batch(df_subset['title'].fillna('').tolist()),
    selftext=clean_text_batch(df_subset['selftext'].fillna('').tolist()),
)

# Display a preview of the preprocessed DataFrame
print(df_subset.head())

# Derive the new 'semantic_relationships' feature row by row
df_subset['semantic_relationships'] = df_subset.apply(create_new_features, axis=1)

# Display the updated DataFrame with the new feature
print(df_subset.head())

# Persist the result as CSV for future analysis
df_subset.to_csv('SemanticsRel.csv', index=False)

                                 title  \
0  Life is so pointless without others   
1                           Cold rage?   
2                I don’t know who I am   
3              HELP! Opinions! Advice!   
4                                 help   

                                            selftext subreddit  
0  Does anyone else think the most important part...       BPD  
1  Hello fellow friends 😄\n\nI'm on the BPD spect...       BPD  
2  My [F20] bf [M20] told me today (after I said ...       BPD  
3  Okay, I’m about to open up about many things I...       BPD  
4                                          [removed]       BPD  


In [9]:
def _parse_relationships(raw):
    """Turn one CSV cell back into a list of relationship tuples.

    Writing a column of Python lists to CSV stores each list as its string
    representation, so reading the file back yields strings such as ``"[]"``.
    Missing or blank cells are treated as having no relationships.
    """
    if not isinstance(raw, str) or not raw.strip():
        return []
    # literal_eval only accepts Python literals, so it is safe on file content
    # and raises if a cell is malformed instead of silently continuing.
    return ast.literal_eval(raw)


# Read the CSV file
df_svo = pd.read_csv('SemanticsRel.csv')

# Create a new DataFrame with specific columns
columns_needed = ['subreddit', 'semantic_relationships']
df_svo = df_svo[columns_needed].copy()

# Restore real lists before measuring them: on the raw strings, len("[]") is 2,
# so the emptiness filter below would never remove anything.
df_svo['semantic_relationships'] = df_svo['semantic_relationships'].apply(_parse_relationships)

# Filter out rows with empty relationships
df_svo = df_svo[df_svo['semantic_relationships'].apply(len) > 0]

print(df_svo.head())
# df_svo.to_csv('svo.csv', index=False)

# TODO: additional filtering is still needed

# # Count occurrences of SVO relationships within each subreddit
# subreddit_counts = defaultdict(Counter)
# for idx, row in df_svo.iterrows():
#     subreddit = row['subreddit']
#     for svo in row['semantic_relationships']:
#         subreddit_counts[subreddit][svo] += 1

# # Filter SVO relationships with count > 3 within each subreddit
# frequent_relationships = {}
# for subreddit, svo_counter in subreddit_counts.items():
#     frequent_relationships[subreddit] = {
#         svo: count for svo, count in svo_counter.items() if count > 3
#     }

# # Display frequent relationships per subreddit
# for subreddit, relationships in frequent_relationships.items():
#     print(f"Subreddit: {subreddit}")
#     for svo, count in relationships.items():
#         print(f"SVO: {svo}, Count: {count}")

            subreddit                             semantic_relationships
0                 BPD  [('is', 'nsubj', 'life'), ('is', 'ROOT', 'is')...
1                 BPD  [('rage', 'ROOT', 'rage'), ("'m", 'nsubj', 'fr...
2                 BPD  [('know', 'nsubj', 'i'), ('know', 'ROOT', 'kno...
3                 BPD  [('help', 'ROOT', 'help'), ('opinions', 'ROOT'...
4                 BPD  [('help', 'ROOT', 'help'), ('removed', 'ROOT',...
...               ...                                                ...
701782  mentalillness  [('go', 'nsubj', 'you'), ('go', 'ROOT', 'go'),...
701783  mentalillness  [('am', 'nsubj', 'i'), ('am', 'ROOT', 'am'), (...
701784  mentalillness  [('look', 'nsubj', 'i'), ('look', 'ROOT', 'loo...
701785  mentalillness  [('is', 'nsubj', '|'), ('is', 'ROOT', 'is'), (...
701786  mentalillness  [('motherfucker', 'ROOT', 'motherfucker'), ('h...

[701787 rows x 2 columns]
