In [1]:
%load_ext autoreload
%autoreload 2

# Analysis Chico

## Import the data

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import json
from pathlib import Path


subs_of_interest = ['AmItheAsshole', 'confessions', 'tifu']

def import_subreddit(subreddit):
    location_name = Path().cwd() / 'datasets' / f'{subreddit}.csv'
    return pd.read_csv(location_name)

subreddit_df = pd.concat([import_subreddit(subreddit) for subreddit in subs_of_interest], ignore_index=True)

def open_json_sub(subreddit_name):
    path_to_sub = Path().cwd() / 'data' / f'{subreddit_name}.json'
    with open(path_to_sub, 'r') as f:
        subreddit_dict = json.load(f)
    return subreddit_dict

subreddit_dict = {subreddit: open_json_sub(subreddit) for subreddit in subs_of_interest}


In [15]:
from utils.analysis import *
from utils.chico_analysis import *

results = {}

for sub in subs_of_interest:    
    # posts = scraper.get_subreddit_posts(sub, limit=100, cache=True)
    # posts_df = create_posts_dataframe(subreddit_dict)
    
    tfidf_results = tfidf_df(subreddit_df[subreddit_df['subreddit'] == sub], include_selftext=True)
    # tfidf_results = tfidf_analyze_subreddit(posts)
    tf_idf_scores = get_mean_tfidf(
        tfidf_matrix=tfidf_results['tfidf_matrix'],
        feature_names=tfidf_results['feature_names'],
        return_df=True
    )
    results[sub] = {"posts_df":subreddit_df[subreddit_df['subreddit'] == sub],
                    "tfidf_results":tfidf_results,
                    "tf_idf_scores":tf_idf_scores}



In [16]:
# Extract the vocabulary for each subreddit
vocabularies = {sub: set(results[sub]['tfidf_results']['feature_names']) for sub in subs_of_interest}

# Get the intersection of the vocabularies
common_vocab = set.intersection(*vocabularies.values())

# Report analytics
print(f"Number of common terms across all subreddits: {len(common_vocab)}")
print(f"Common terms: {', '.join(list(common_vocab)[:10])}...")  # Display first 10 common terms

# Calculate Jaccard similarity for each pair of subreddits
for sub1 in subs_of_interest:
    for sub2 in subs_of_interest:
        if sub1 > sub2:
            intersection = vocabularies[sub1].intersection(vocabularies[sub2])
            union = vocabularies[sub1].union(vocabularies[sub2])
            jaccard_similarity = len(intersection) / len(union)
            print(f"Jaccard similarity between {sub1} and {sub2}: {jaccard_similarity:.3f}")

# Report unique terms for each subreddit
for sub in subs_of_interest:
    unique_terms = vocabularies[sub] - common_vocab
    print(f"Number of unique terms in {sub}: {len(unique_terms)}")
    print(f"Unique terms in {sub}: {', '.join(list(unique_terms)[:10])}...")  # Display first 10 unique terms

Number of common terms across all subreddits: 637
Common terms: anyway, might, shower, experience, pay, possible, different, due, possibly, proceed...
Jaccard similarity between confessions and AmItheAsshole: 0.553
Jaccard similarity between tifu and AmItheAsshole: 0.592
Jaccard similarity between tifu and confessions: 0.572
Number of unique terms in AmItheAsshole: 363
Unique terms in AmItheAsshole: spending, grandparent, household, intention, express, lily, shock, complain, stuck, fiancé...
Number of unique terms in confessions: 363
Unique terms in confessions: protect, express, fear, poor, manipulate, stuck, moan, mentally, ocd, shes...
Number of unique terms in tifu: 363
Unique terms in tifu: assignment, shock, poor, tifu, complain, texted, awake, teeth, delete, coworker...


In [14]:
common_vocab

{'anyway',
 'might',
 'shower',
 'pay',
 'experience',
 'possible',
 'express',
 'different',
 'due',
 'fear',
 'poor',
 'possibly',
 'proceed',
 'shock',
 'take',
 'tifu',
 'complain',
 'stuck',
 'paint',
 'fiancé',
 'mentally',
 'texted',
 'side',
 'hot',
 'group',
 'reddit',
 'act',
 'celebrate',
 'cool',
 'money',
 'know',
 'worse',
 'coworker',
 'asshole',
 'choice',
 'four',
 'mention',
 'alcohol',
 'usually',
 'smoke',
 'bit',
 'hole',
 'point',
 'suggest',
 'sister',
 'whatever',
 'read',
 'date',
 'dead',
 'opinion',
 'arm',
 'figure',
 'bike',
 'stuff',
 'however',
 'type',
 'always',
 'weekend',
 'sound',
 'sometimes',
 'let',
 'day',
 'car',
 'first',
 'also',
 'brush',
 'girlfriend',
 'title',
 'asleep',
 'christmas',
 'head',
 'along',
 'cheri',
 'result',
 'main',
 'tear',
 'confront',
 'flight',
 'heat',
 'mood',
 'hand',
 'thats',
 'hurt',
 'song',
 'chance',
 'three',
 'air',
 'feel',
 'place',
 'dinner',
 'exactly',
 'friday',
 'may',
 'halloween',
 'want',
 'choose'