# New data creation notebook

## Import

In [15]:
import re
from string import punctuation

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import textstat
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from tqdm import tqdm

## Load Data

In [2]:
posts_df = pd.read_csv('data\\all_posts-merged.csv')
comments_df = pd.read_csv('data\\all_comments-merged.csv')
users_df = pd.read_csv('data\\user_data-merged.csv')

In [3]:
display(posts_df.head(1))
display(comments_df.head(1))
display(users_df.head(1))

Unnamed: 0,subreddit,username,name,title,text,is_original_content,num_comments,score,upvote_ratio,date
0,AskReddit,elevate-digital,t3_1fxlnxv,"Guys with extremely loud vehicles, why do you ...",,False,8421,20664,0.87,2024-10-06 17:20:58


Unnamed: 0,subreddit,username,body,post_title,score,num_replies,is_submitter,id,parent_id,stickied,date
0,AskReddit,thedudear,I used to own an insanely loud vehicle. A Pont...,"Guys with extremely loud vehicles, why do you ...",2045,6,False,lqpnfpr,t3_1fxlnxv,False,2024-10-07 01:35:59


Unnamed: 0,username,link_karma,comment_karma,account_age,is_verified
0,yakfsh1,19830.0,88232.0,1440.0,True


In [None]:
users_with_none_link_karma = users_df[users_df['link_karma'].isna()]

display(users_with_none_link_karma)

Unnamed: 0,username,link_karma,comment_karma,account_age,is_verified
248,chiskss_,,,,
402,Only_Lover8724,,,,
458,INeedARemoteJrCSJob,,,,
474,Bussy_Stank,,,,
489,MidnightEye02,,,,
...,...,...,...,...,...
194586,battlerazzle01,,,,
194587,Brickolator,,,,
194588,Financial-Play-7562,,,,
194672,Lingering_Dorkness,,,,


In [5]:
comments_df = comments_df.dropna(subset=['body'])

In [6]:
posts_df.columns, comments_df.columns, users_df.columns

(Index(['subreddit', 'username', 'name', 'title', 'text', 'is_original_content',
        'num_comments', 'score', 'upvote_ratio', 'date'],
       dtype='object'),
 Index(['subreddit', 'username', 'body', 'post_title', 'score', 'num_replies',
        'is_submitter', 'id', 'parent_id', 'stickied', 'date'],
       dtype='object'),
 Index(['username', 'link_karma', 'comment_karma', 'account_age',
        'is_verified'],
       dtype='object'))

In [7]:
posts_df.shape, comments_df.shape, users_df.shape

((1500, 10), (358545, 11), (194750, 5))

### Vector

In [8]:
def clean_text(text):
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

def add_tfidf_vectors(comments_df):
    comments_df['cleaned_body'] = comments_df['body'].apply(clean_text)
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(comments_df['cleaned_body'])
    tfidf_matrix = normalize(tfidf_matrix)
    tfidf_vectors = [vec for vec in tfidf_matrix]
    comments_df['vector'] = tfidf_vectors
    
    return comments_df

In [9]:
comments_df = add_tfidf_vectors(comments_df)

## Feature creation

In [10]:
new_features = pd.DataFrame(users_df['username'].copy())

### Cosine Similarity

In [35]:
def remove_zwj(comment):
    zwj = '\u200d'
    return comment.replace(zwj, '')


def is_weird_comment(comments):
    if all(len(comment) <= 1 or all(char in punctuation for char in comment) for comment in comments):
        return True
    if all(len(emoji.emoji_list(comment)) == 1 and emoji.emoji_list(comment)[0]['emoji'] == comment for comment in comments):
        return True
    if all(all(emoji.is_emoji(char) for char in comment) for comment in comments):
        return True
    if all(len(emoji.emoji_list(comment)) == len([char for char in comment if emoji.is_emoji(char)]) for comment in comments):
        return True
    if all(all(char in emoji.EMOJI_DATA for char in remove_zwj(comment)) for comment in comments):
        return True

    return False


def calculate_avg_cosine_similarity(comments_df):
    vectorizer = TfidfVectorizer()
    grouped_comments = comments_df.groupby('username')['body'].apply(list)
    avg_cosine_similarities = {}

    for username, comments in tqdm(grouped_comments.items(), desc="Processing cosine similarities of comments", total=len(grouped_comments)):
        
        if is_weird_comment(comments) and len(comments) > 1:
            avg_cosine_similarity = 1.0
        elif len(comments) > 1:
            tfidf_matrix = vectorizer.fit_transform(comments)
            cosine_sim_matrix = cosine_similarity(tfidf_matrix)
            avg_cosine_similarity = (cosine_sim_matrix.sum() - len(comments)) / (len(comments) * (len(comments) - 1))
        else:
            avg_cosine_similarity = None

        avg_cosine_similarities[username] = avg_cosine_similarity

    return avg_cosine_similarities


In [36]:
avg_cosine_similarities = calculate_avg_cosine_similarity(comments_df)
avg_cosine_similarities_df = pd.DataFrame(list(avg_cosine_similarities.items()), columns=['username', 'avg_cosine_similarity'])

display(avg_cosine_similarities_df.head())

Processing cosine similarities of comments: 100%|██████████| 180960/180960 [06:33<00:00, 460.16it/s] 


Unnamed: 0,username,avg_cosine_similarity
0,------------------GL,
1,--------rook,1.0
2,------____--------,
3,----_____----,
4,----ryan----,


In [31]:
new_features = new_features.merge(avg_cosine_similarities_df, on='username', how='left')

In [32]:
new_features['avg_cosine_similarity'].isna().sum() / len(new_features['avg_cosine_similarity'])

0.6665827984595636

### All Users similarity

In [None]:
def calculate_all_users_similarity(comments_df):
    grouped_comments = comments_df.groupby('username')['body'].apply(lambda x: ' '.join(x)).reset_index()
    all_users_similarities = {}

    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(grouped_comments['body'])

    for i, row in tqdm(grouped_comments.iterrows(), desc="Processing cosine similarities of comments", total=len(grouped_comments)):
        username = row['username']
        user_vector = tfidf_matrix[i]

        if is_weird_comment([row['body']]):
            avg_cosine_similarity = 1.0
        else:
            cosine_sim_matrix = cosine_similarity(user_vector, tfidf_matrix)
            avg_cosine_similarity = (cosine_sim_matrix.sum() - 1) / (len(grouped_comments) - 1)

        all_users_similarities[username] = avg_cosine_similarity

    return all_users_similarities

### AVG Length

In [None]:
def add_comment_length_metrics(df, users):
    new_df = pd.DataFrame(users.copy())
    grouped_comments = df.groupby('username')['body'].apply(list)
    
    avg_lengths = {}
    max_lengths = {}
    min_lengths = {}

    for username, comments in tqdm(grouped_comments.items(), desc="Processing comments length", total=len(grouped_comments)):
        comment_lengths = [len(comment) for comment in comments]
        avg_lengths[username] = sum(comment_lengths) / len(comment_lengths)
        max_lengths[username] = max(comment_lengths)
        min_lengths[username] = min(comment_lengths)

    new_df['avg_comment_length'] = df['username'].map(avg_lengths)
    new_df['max_comment_length'] = df['username'].map(max_lengths)
    new_df['min_comment_length'] = df['username'].map(min_lengths)

    return new_df

In [None]:
comments_length_df = add_comment_length_metrics(comments_df, users_df['username'])

Processing comments length: 100%|██████████| 180960/180960 [00:00<00:00, 322005.98it/s]


In [None]:
new_features = new_features.merge(comments_length_df, on='username', how='left')
display(new_features.head())

Unnamed: 0,username,avg_comment_length,max_comment_length,min_comment_length
0,yakfsh1,382.0,382.0,382.0
1,Tsquare43,62.0,62.0,62.0
2,SeaSpeakToMe,75.0,81.0,67.0
3,hurtingxliving,39.0,39.0,39.0
4,Minute-Photo6916,15.0,15.0,15.0


### Comment to post ratio

In [10]:
def add_comment_post_ratio(df, comments_df, posts_df):
    comments_per_user = comments_df.groupby('username').size().reset_index(name='num_comments')
    posts_per_user = posts_df.groupby('username').size().reset_index(name='num_posts')
    user_stats = comments_per_user.merge(posts_per_user, on='username', how='outer').fillna(0)
    user_stats['comment_post_ratio'] = user_stats.apply(
        lambda row: 0 if row['num_comments'] == 0 else (1 if row['num_posts'] == 0 else row['num_comments'] / row['num_posts']),
        axis=1
    )
    df = df.merge(user_stats[['username', 'comment_post_ratio']], on='username', how='left')
    
    return df

new_features = add_comment_post_ratio(new_features, comments_df, posts_df)

In [11]:
new_features.head()

Unnamed: 0,username,comment_post_ratio_x,comment_post_ratio_y
0,yakfsh1,0.0,1.0
1,Tsquare43,0.0,1.0
2,SeaSpeakToMe,0.0,1.0
3,hurtingxliving,0.0,1.0
4,Minute-Photo6916,0.0,1.0


### Thread Average Depth

In [8]:
def add_average_thread_depth(df, comments_df):
    comments_df_copy = comments_df.copy()
    parent_lookup = dict(zip(comments_df_copy['id'], comments_df_copy['parent_id']))

    def calculate_depth(comment_id, parent_lookup):
        depth = 0

        while comment_id in parent_lookup and parent_lookup[comment_id].startswith("t1_"):
            parent_id = parent_lookup[comment_id]
            comment_id = parent_id
            depth += 1
        return depth

    comments_df_copy['depth'] = comments_df_copy['id'].apply(lambda x: calculate_depth(x, parent_lookup))
    avg_depth_per_user = comments_df_copy.groupby('username')['depth'].mean().reset_index(name='avg_thread_depth')
    df = df.merge(avg_depth_per_user, on='username', how='left')
    
    return df


In [9]:
new_features = add_average_thread_depth(new_features, comments_df)

display(new_features.head())

Unnamed: 0,username,avg_thread_depth
0,yakfsh1,0.5
1,Tsquare43,0.666667
2,SeaSpeakToMe,1.0
3,hurtingxliving,0.0
4,Minute-Photo6916,1.0


### Parent-child similarity

In [None]:
def add_average_similarity(df, comments_df):
    comments_df_copy = comments_df.copy()
    parent_lookup = dict(zip(comments_df_copy['id'], comments_df_copy['parent_id']))
    comment_vectors = dict(zip(comments_df_copy['id'], comments_df_copy['vector']))

    def calculate_highest_similarity(comment_id, parent_lookup, comment_vectors):
        max_similarity = 0
        current_id = comment_id
        
        while current_id in parent_lookup and parent_lookup[current_id].startswith("t1_"):
            parent_id = parent_lookup[current_id]
            if parent_id in comment_vectors:
                similarity = cosine_similarity(comment_vectors[comment_id], comment_vectors[parent_id])[0, 0]
                max_similarity = max(max_similarity, similarity)
            current_id = parent_id
        
        return max_similarity

    comments_df_copy['similarity'] = comments_df_copy['id'].apply(
        lambda x: calculate_highest_similarity(x, parent_lookup, comment_vectors)
    )

    user_similarity = comments_df_copy.groupby('username')['similarity'].mean().reset_index()
    user_similarity.columns = ['username', 'average_similarity']
    df = df.merge(user_similarity, on='username', how='left')
    
    return df


In [15]:
new_features = add_average_similarity(new_features, comments_df)
display(new_features.head())

Unnamed: 0,username,average_similarity
0,yakfsh1,0.0
1,Tsquare43,0.0
2,SeaSpeakToMe,0.0
3,hurtingxliving,0.0
4,Minute-Photo6916,0.0


### Type-Token Ratio - This is a basic measure of lexical diversity.

In [18]:
def calculate_ttr(text):
    tokens = word_tokenize(text)
    num_tokens = len(tokens)
    num_types = len(set(tokens))
    if num_tokens == 0:
        return 0
    return num_types / num_tokens

def add_average_ttr(df, comments_df):
    comments_df['ttr'] = comments_df['cleaned_body'].apply(calculate_ttr)
    avg_ttr_per_user = comments_df.groupby('username')['ttr'].mean().reset_index(name='avg_ttr')
    df = df.merge(avg_ttr_per_user, on='username', how='left')
    
    return df

In [19]:
new_features = add_average_ttr(new_features, comments_df)

display(new_features.head())

Unnamed: 0,username,average_similarity,avg_ttr
0,yakfsh1,0.0,0.931034
1,Tsquare43,0.0,0.927318
2,SeaSpeakToMe,0.0,1.0
3,hurtingxliving,0.0,0.888889
4,Minute-Photo6916,0.0,1.0


### Language Readability

In [22]:
def calculate_flesch_kincaid_grade(text):
    return textstat.flesch_kincaid_grade(text)

def add_average_flesch_kincaid_grade(df, comments_df):
    comments_df['flesch_kincaid_grade'] = comments_df['cleaned_body'].apply(calculate_flesch_kincaid_grade)
    avg_grade_per_user = comments_df.groupby('username')['flesch_kincaid_grade'].mean().reset_index(name='avg_flesch_kincaid_grade')
    df = df.merge(avg_grade_per_user, on='username', how='left')
    
    return df


In [23]:
new_features = add_average_flesch_kincaid_grade(new_features, comments_df)

display(new_features.head())

Unnamed: 0,username,average_similarity,avg_ttr,avg_flesch_kincaid_grade
0,yakfsh1,0.0,0.931034,12.1
1,Tsquare43,0.0,0.927318,6.9
2,SeaSpeakToMe,0.0,1.0,1.3
3,hurtingxliving,0.0,0.888889,12.6
4,Minute-Photo6916,0.0,1.0,-2.3


### Overlapping N-grams

In [31]:
def get_ngrams(text, n=2):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    analyzer = vectorizer.build_analyzer()
    return set(analyzer(text))

def calculate_overlap(comments, n=2):
    if len(comments) < 2:
        return 0.0
    
    all_ngrams = [get_ngrams(comment, n) for comment in comments]
    overlap_count = 0
    total_count = 0
    
    for i in range(len(all_ngrams)):
        for j in range(i + 1, len(all_ngrams)):
            overlap_count += len(all_ngrams[i].intersection(all_ngrams[j]))
            total_count += len(all_ngrams[i].union(all_ngrams[j]))
    
    if total_count == 0:
        return 0.0
    
    return overlap_count / total_count

def add_ngram_overlap(df, comments_df, n=2):
    grouped_comments = comments_df.groupby('username')['cleaned_body'].apply(list)
    
    overlap_ratios = {}
    
    for username, comments in tqdm(grouped_comments.items(), desc="Calculating n-gram overlap", total=len(grouped_comments)):
        overlap_ratios[username] = calculate_overlap(comments, n)
    
    overlap_df = pd.DataFrame(list(overlap_ratios.items()), columns=['username', 'ngram_overlap'])
    
    df = df.merge(overlap_df, on='username', how='left')
    
    return df

In [32]:
new_features = add_ngram_overlap(new_features, comments_df, n=2)

display(new_features.head())

Calculating n-gram overlap: 100%|██████████| 180960/180960 [00:35<00:00, 5116.68it/s] 


Unnamed: 0,username,average_similarity,avg_ttr,avg_flesch_kincaid_grade,avg_cosine_similarity,ngram_overlap
0,yakfsh1,0.0,0.931034,12.1,1.0,0.0
1,Tsquare43,0.0,0.927318,6.9,1.0,0.0
2,SeaSpeakToMe,0.0,1.0,1.3,,0.0
3,hurtingxliving,0.0,0.888889,12.6,,0.0
4,Minute-Photo6916,0.0,1.0,-2.3,,0.0
