# Imports

In [None]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import ast

In [None]:
import sys
ROOT = '../'
sys.path.append(ROOT)  # Add the root folder to the sys.path

# Import the modules
from config import *
from models.fasttext import fasttext_lang_model

# Reload the configuration
from importlib import reload
reload(sys.modules['config'])
reload(sys.modules['models.fasttext'])

# Import the reloaded modules
from config import *
from models.fasttext import fasttext_lang_model

# Submissions

## Merge submissions chunks

In [None]:
# Load all the cleaned submissions datasets and concatenate them into one large dataset
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions')
SUBMISSIONS_CHUNKS_DIR = os.path.join(SUBMISSIONS_DIR, 'chunks')

# Set the chunk size (number of rows to read at a time)
chunk_size = 1000000 # rows at a time 

In [None]:
## 1. Find the rows that contain only urls 
def find_rows_with_only_urls(chunk):
    """Drop the submissions whose 'text' or 'title' starts with 'http'.

    Args:
        chunk: DataFrame with at least the 'text' and 'title' columns.

    Returns:
        A new DataFrame without the matching rows and with a fresh 0..n-1
        index. The input DataFrame is not modified.
    """
    # Cast to str first: missing values (NaN/None) and non-string columns would
    # otherwise yield NaN in the mask or make the .str accessor fail.
    text_is_url = chunk['text'].astype(str).str.startswith('http')
    title_is_url = chunk['title'].astype(str).str.startswith('http')

    rows_to_remove = chunk[text_is_url | title_is_url]
    print(f'Number of rows that contain only urls: {len(rows_to_remove)}')

    # Remove from the dataset the rows in 'rows_to_remove'
    chunk = chunk.drop(rows_to_remove.index)

    # Reset index
    chunk.reset_index(drop=True, inplace=True)
    return chunk

In [None]:
# Create a file where to append chunks
# Open the first chunk and save the header
chunk_path = os.path.join(SUBMISSIONS_CHUNKS_DIR, os.listdir(SUBMISSIONS_CHUNKS_DIR)[0])
submissions = pd.read_csv(chunk_path, nrows=0)
submissions.to_parquet(os.path.join(SUBMISSIONS_DIR, 'submissions.parquet'), index=False)

In [None]:
# Clean all the chunks and write them to a single parquet file.
# NOTE: DataFrame.to_parquet has no append mode (mode='a' / header=False are
# to_csv options), so the cleaned chunks are collected and written once.
count_number_of_rows = 0
cleaned_chunks = []
# sorted() makes the row order reproducible across runs/filesystems
for f in tqdm(sorted(os.listdir(SUBMISSIONS_CHUNKS_DIR))):
    chunk_path = os.path.join(SUBMISSIONS_CHUNKS_DIR, f)
    # Read every column as text so that all chunks share one schema; numeric
    # columns (e.g. 'score') are converted later with pd.to_numeric.
    for chunk in pd.read_csv(chunk_path, chunksize=chunk_size, dtype=str):
        if chunk.empty:
            continue

        chunk = find_rows_with_only_urls(chunk)
        cleaned_chunks.append(chunk)

        # Count the number of rows
        count_number_of_rows += len(chunk)

# Save the merged dataset (skipped when every chunk was empty, so the
# header-only file written by the previous cell is kept)
if cleaned_chunks:
    pd.concat(cleaned_chunks, ignore_index=True).to_parquet(
        os.path.join(SUBMISSIONS_DIR, 'submissions.parquet'), index=False
    )

# Free memory
del cleaned_chunks
del submissions 

print(f'Number of rows in the final dataset: {count_number_of_rows}')

## Filter submissions by language

In [None]:
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)

submissions = pd.read_parquet(dataset_path)
submissions.head()

In [None]:
# Load the fasttext model
fasttext_model = fasttext_lang_model()


def _is_english(value):
    """Return True when `value` is a string that the fasttext model labels 'en'.

    Missing values (NaN/None) are not strings and are treated as not English
    instead of raising on `.replace`.
    """
    if not isinstance(value, str):
        return False
    # Remove "\n" from the text before predicting
    return fasttext_model.predict_lang(value.replace("\n", " ")) == 'en'


# A submission is kept when its title is English or, failing that, when its
# text is English (the text is only evaluated if the title is not English).
english_mask = pd.Series(
    [
        _is_english(title) or _is_english(text)
        for title, text in tqdm(zip(submissions['title'], submissions['text']), total=len(submissions))
    ],
    index=submissions.index,
    dtype=bool,
)

print("Generating the DataFrame...")

# Select only the rows where the language is 'en' (columns, dtypes and the
# original index are preserved, also when no row matches)
submissions_en = submissions[english_mask].copy()

In [None]:
submissions_en

In [None]:
# Save the datasets
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions_en.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)
submissions_en.to_parquet(dataset_path, index=False)

## Check data

In [None]:
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)

submissions = pd.read_parquet(dataset_path)
submissions

In [None]:
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions_en.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)

submissions_en = pd.read_parquet(dataset_path)
submissions_en

In [None]:
# Show the rows that contain 'comments' as a text in the 'code' column.
# na=False keeps missing codes out of the mask (a NaN in a boolean mask raises),
# and .copy() makes the selections independent frames that can be edited safely.
rows_to_fix = submissions[submissions['code'].str.contains('comments', case=False, na=False)].copy()
rows_to_fix_en = submissions_en[submissions_en['code'].str.contains('comments', case=False, na=False)].copy()

print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')
print(f'Number of rows that contain "comments" in the "code" column (en): {len(rows_to_fix_en)}')

In [None]:
rows_to_fix

In [None]:
rows_to_fix_en

In [None]:
# Example: 
# - Link: https://www.reddit.com/r/paos_comments/comments/3c0p67/comment_id_csr0spc_posted_at_20150703_103921/
# - Code: 3c0p67
# From 'link' column, extract the 'code' and replace the 'code' column with the extracted code.
# expand=False returns a Series (not a one-column DataFrame); assign() builds a
# new frame instead of writing into a filtered slice of the original dataset.
rows_to_fix = rows_to_fix.assign(
    code=rows_to_fix['link'].str.extract(r'comments/comments/(\w+)/', expand=False)
)
rows_to_fix_en = rows_to_fix_en.assign(
    code=rows_to_fix_en['link'].str.extract(r'comments/comments/(\w+)/', expand=False)
)


In [None]:
rows_to_fix

In [None]:
rows_to_fix_en

In [None]:
# Substitute the fixed codes in the original datasets.
# Only the 'code' column differs in rows_to_fix, so a single index-aligned
# assignment replaces the row-by-row loop.
submissions.loc[rows_to_fix.index, 'code'] = rows_to_fix['code']
submissions_en.loc[rows_to_fix_en.index, 'code'] = rows_to_fix_en['code']

# Re-check the rows that contain 'comments' as a text in the 'code' column
# (na=False: a link that did not match the pattern leaves a missing code)
rows_to_fix = submissions[submissions['code'].str.contains('comments', case=False, na=False)]
rows_to_fix_en = submissions_en[submissions_en['code'].str.contains('comments', case=False, na=False)]

print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')
print(f'Number of rows that contain "comments" in the "code" column (en): {len(rows_to_fix_en)}')


In [None]:
# Save the datasets
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)
submissions.to_parquet(dataset_path, index=False)

In [None]:
# Save the datasets
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions_en.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)
submissions_en.to_parquet(dataset_path, index=False)

# Comments

## Merge comments chunks

In [None]:
# Load submissions dataset
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = 'submissions_en.parquet'
submissions = pd.read_parquet(os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME))

In [None]:
# Load all the cleaned comments datasets and concatenate them into one large dataset
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_CHUNKS_DIR = os.path.join(COMMENTS_DIR, 'chunks/')

# Set the chunk size (number of rows to read at a time)
chunk_size = 1000000 # rows at a time 

In [None]:
## 1. Clean 'body' column by removing the rows whose text starts with a url
def find_rows_with_only_urls(chunk):
    """Drop the comments whose 'body' starts with 'http'.

    Args:
        chunk: DataFrame with at least the 'body' column.

    Returns:
        A new DataFrame without the matching rows and with a fresh 0..n-1
        index. The input DataFrame is not modified.
    """
    # Cast to str first: a missing body would put NaN in the mask, and
    # indexing with a mask that contains NaN raises a ValueError.
    body_is_url = chunk['body'].astype(str).str.startswith('http')
    rows_to_remove = chunk[body_is_url]

    # Remove from the dataset the rows in 'rows_to_remove'
    chunk = chunk.drop(rows_to_remove.index)

    # Reset index
    chunk.reset_index(drop=True, inplace=True)

    return chunk

In [None]:
## 2. Clean the rows that contains: author, score, created, link, body, code as a text
def find_rows_with_author_score_created_link_body_code(chunk):
    """Drop stray header rows, i.e. rows where a column holds its own name.

    A row is removed when, for any of the columns author, score, created,
    link, body or code, the cell value equals the column name. Matching by
    equality (instead of substring) keeps legitimate comments that merely
    mention words such as "body", "code" or "link".

    Args:
        chunk: DataFrame with the columns listed above.

    Returns:
        A new DataFrame without the header rows and with a fresh 0..n-1 index.
        The input DataFrame is not modified.
    """
    header_columns = ['author', 'score', 'created', 'link', 'body', 'code']

    is_header_row = pd.Series(False, index=chunk.index)
    for column in header_columns:
        # astype(str) makes the comparison work for numeric columns
        # (e.g. an integer 'score') and for missing values.
        is_header_row |= chunk[column].astype(str).str.strip().eq(column)

    rows_to_remove = chunk[is_header_row]

    print(f'Number of rows that contain: author, score, created, link, body, code as a text: {len(rows_to_remove)}')

    # Remove from the dataset the rows in 'rows_to_remove'
    chunk = chunk.drop(rows_to_remove.index)

    # Reset index
    chunk.reset_index(drop=True, inplace=True)

    return chunk

In [None]:
# Create a file where to append chunks
# Open the first chunk and save the header
chunk_path = os.path.join(COMMENTS_CHUNKS_DIR, os.listdir(COMMENTS_CHUNKS_DIR)[0])
comments = pd.read_csv(chunk_path, nrows=0)
# comments.to_csv(os.path.join(COMMENTS_DIR, 'comments.csv'), index=False)
comments.to_parquet(os.path.join(COMMENTS_DIR, 'comments.parquet'), index=False)

In [None]:
# Clean all the chunks and write them to a single parquet file.
# NOTE: DataFrame.to_parquet has no append mode (mode='a' / header=False are
# to_csv options), so the cleaned chunks are collected and written once.
count_number_of_rows = 0
cleaned_chunks = []
# sorted() makes the row order reproducible across runs/filesystems
for f in tqdm(sorted(os.listdir(COMMENTS_CHUNKS_DIR))):
    chunk_path = os.path.join(COMMENTS_CHUNKS_DIR, f)
    # Read every column as text so that all chunks share one schema and the
    # string-based cleaning below works; 'score' is converted later with
    # pd.to_numeric.
    for chunk in pd.read_csv(chunk_path, chunksize=chunk_size, dtype=str):
        if chunk.empty:
            continue

        chunk = find_rows_with_only_urls(chunk)
        chunk = find_rows_with_author_score_created_link_body_code(chunk)
        cleaned_chunks.append(chunk)

        # Count the number of rows
        count_number_of_rows += len(chunk)

# Save the merged dataset (skipped when every chunk was empty, so the
# header-only file written by the previous cell is kept)
if cleaned_chunks:
    pd.concat(cleaned_chunks, ignore_index=True).to_parquet(
        os.path.join(COMMENTS_DIR, 'comments.parquet'), index=False
    )

# Free memory
del cleaned_chunks
del submissions

print(f'Number of rows in the final dataset: {count_number_of_rows}')

## Filter comments by language

In [None]:
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = 'comments.parquet'
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)

comments = pd.read_parquet(dataset_path)
comments

In [None]:
# Load the fasttext model
fasttext_model = fasttext_lang_model()


def _is_english(value):
    """Return True when `value` is a string that the fasttext model labels 'en'.

    Missing values (NaN/None) are not strings and are treated as not English
    instead of raising on `.replace`.
    """
    if not isinstance(value, str):
        return False
    # Remove "\n" from the text before predicting
    return fasttext_model.predict_lang(value.replace("\n", " ")) == 'en'


# Predict the language of every comment body
english_mask = pd.Series(
    [_is_english(body) for body in tqdm(comments['body'], total=len(comments))],
    index=comments.index,
    dtype=bool,
)

print("Generating the DataFrame...")

# Select only the rows where the language is 'en' (columns, dtypes and the
# original index are preserved, also when no row matches)
comments_en = comments[english_mask].copy()

In [None]:
comments_en

In [None]:
# Save the datasets
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = "comments_en.parquet"
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)
comments_en.to_parquet(dataset_path, index=False)

## Check data

### Comments

In [None]:
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = 'comments.parquet'
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)

comments = pd.read_parquet(dataset_path)
comments

In [None]:
# Show the rows that contains 'comments' as a text in the 'code' column
rows_to_fix = comments[comments['code'].str.contains('comments', case=False)]
print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')

In [None]:
rows_to_fix.iloc[0]['link']

In [None]:
# Example: 
# - Link: https://www.reddit.com/r/paos_comments/comments/3c0p67/comment_id_csr0spc_posted_at_20150703_103921/
# - Code: 3c0p67
# From 'link' column, extract the 'code' and replace the 'code' column with the extracted code.
# expand=False returns a Series (not a one-column DataFrame); assign() builds a
# new frame instead of writing into a filtered slice of `comments`.
rows_to_fix = rows_to_fix.assign(
    code=rows_to_fix['link'].str.extract(r'comments/comments/(\w+)/', expand=False)
)

In [None]:
rows_to_fix

In [None]:
# Indices (in `comments`) of the rows whose code has been fixed
indices = rows_to_fix.index

# Only the 'code' column differs in rows_to_fix: update just that column with
# an index-aligned assignment, so the dtypes of the other columns are untouched
comments.loc[indices, 'code'] = rows_to_fix['code']

# Re-check the rows that contain 'comments' as a text in the 'code' column
# (na=False: a link that did not match the pattern leaves a missing code)
rows_to_fix = comments[comments['code'].str.contains('comments', case=False, na=False)]

print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')

In [None]:
# Load submissions dataset
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = 'submissions_en.parquet'
submissions = pd.read_parquet(os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME))

print(f'Number of rows in the comments dataset: {len(comments)}')

# Re-check if the code contained in 'code' column is contained in the 'code' column of the submissions dataset
comments = comments[comments['code'].isin(submissions['code'])]

print(f'Number of rows in the comments dataset after removing the rows that are not in the submissions dataset: {len(comments)}')

In [None]:
# Save the datasets
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = "comments.parquet"
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)
comments.to_parquet(dataset_path, index=False)

### Comments_en

In [None]:
# Load comments dataset
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = 'comments_en.parquet'
comments_en = pd.read_parquet(os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME))
comments_en

In [None]:
# Show the rows that contains 'comments' as a text in the 'code' column
rows_to_fix = comments_en[comments_en['code'].str.contains('comments', case=False)]
print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')

In [None]:
rows_to_fix.iloc[0]['link']

In [None]:
# Example: 
# - Link: https://www.reddit.com/r/paos_comments/comments/3c0p67/comment_id_csr0spc_posted_at_20150703_103921/
# - Code: 3c0p67
# From 'link' column, extract the 'code' and replace the 'code' column with the extracted code.
# expand=False returns a Series (not a one-column DataFrame); assign() builds a
# new frame instead of writing into a filtered slice of `comments_en`.
rows_to_fix = rows_to_fix.assign(
    code=rows_to_fix['link'].str.extract(r'comments/comments/(\w+)/', expand=False)
)

In [None]:
rows_to_fix

In [None]:
# Indices (in `comments_en`) of the rows whose code has been fixed
indices = rows_to_fix.index

# Only the 'code' column differs in rows_to_fix: update just that column with
# an index-aligned assignment, so the dtypes of the other columns are untouched
comments_en.loc[indices, 'code'] = rows_to_fix['code']

# Re-check the rows that contain 'comments' as a text in the 'code' column
# (na=False: a link that did not match the pattern leaves a missing code)
rows_to_fix = comments_en[comments_en['code'].str.contains('comments', case=False, na=False)]

print(f'Number of rows that contain "comments" in the "code" column: {len(rows_to_fix)}')

In [None]:
# Load submissions dataset
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = 'submissions_en.parquet'
submissions = pd.read_parquet(os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME))

print(f'Number of rows in the comments_en dataset: {len(comments_en)}')

# Re-check if the code contained in 'code' column is contained in the 'code' column of the submissions dataset
comments_en = comments_en[comments_en['code'].isin(submissions['code'])]

print(f'Number of rows in the comments_en dataset after removing the rows that are not in the submissions dataset: {len(comments_en)}')

In [None]:
# Save the datasets
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = "comments_en.parquet"
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)
comments_en.to_parquet(dataset_path, index=False)

# Merge data

In [None]:
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
SUBMISSIONS_DATASET_NAME = "submissions_en.parquet"
dataset_path = os.path.join(SUBMISSIONS_DIR, SUBMISSIONS_DATASET_NAME)

submissions_en = pd.read_parquet(dataset_path)
submissions_en

In [None]:
COMMENTS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/comments/')
COMMENTS_DATASET_NAME = 'comments_en.parquet'
dataset_path = os.path.join(COMMENTS_DIR, COMMENTS_DATASET_NAME)

comments_en = pd.read_parquet(dataset_path)
comments_en

## Select submissions that have at least x comments

In [None]:
# Build a two-column table: submission 'code' -> how many comments reference it
comments_count = (
    comments_en['code']
    .value_counts()
    .rename_axis('code')
    .reset_index(name='comments_count')
)
print(f'Number of   submissions that have comments: {len(comments_count)}')

# Keep only the submissions that reach the minimum number of comments
MIN_COMMENTS = 10
has_enough_comments = comments_count['comments_count'] >= MIN_COMMENTS
submissions_codes = comments_count[has_enough_comments]
print(f'Number of submissions with {MIN_COMMENTS} or more comments: {submissions_codes.shape[0]}')
submissions_codes

In [None]:
# From submissions_en, select only the rows where the 'code' column is in submissions_codes
submissions_with_at_least_x_comments = submissions_en[submissions_en['code'].isin(submissions_codes['code'])].reset_index(drop=True)
submissions_with_at_least_x_comments

## Select submissions that have a score between x and y

In [None]:
def plot_score_distribution(score_list, min_score, max_score):
    """Show one log-scale histogram of the scores per fixed score range.

    The ranges are [min_score, 10], [10, 100], [100, 1000], [1000, 10000] and
    [10000, max_score]. Each figure draws the histogram of the whole
    `score_list` and restricts the x axis to the range; the title reports how
    many scores fall inside it (both bounds included).

    Args:
        score_list: iterable of numeric scores.
        min_score: lower bound of the first range.
        max_score: upper bound of the last range.
    """
    score_ranges = [(min_score, 10), (10, 100), (100, 1000), (1000, 10000), (10000, max_score)]
    for lower, upper in score_ranges:
        in_range = sum(1 for score in score_list if lower <= score <= upper)

        plt.figure(figsize=(20, 6))  # wide figure so the bins stay readable
        plt.hist(score_list, bins=10000)
        plt.xlabel('Score')
        plt.ylabel('Frequency')
        plt.title(f'Distribution of Submissions Score [{lower}, {upper}] -> {in_range} submissions')
        plt.xlim([lower, upper])
        plt.yscale('log')
        plt.show()

In [None]:
# Turn all the scores into integers.
# pd.to_numeric also handles negative scores (str.isdigit() rejects "-5") and
# missing values (NaN has no .isdigit()); unparsable scores are dropped.
submissions_score_list = (
    pd.to_numeric(submissions_with_at_least_x_comments['score'], errors='coerce')
    .dropna()
    .astype(int)
    .tolist()
)
print(f"Min score: {min(submissions_score_list)}, Max score: {max(submissions_score_list)}")

In [None]:
plot_score_distribution(submissions_score_list, min(submissions_score_list), max(submissions_score_list))

In [None]:
# Score window used to keep a submission (both bounds included)
MIN_SCORE = 10
MAX_SCORE = max(submissions_score_list)

# Make the 'score' column a nullable integer column (unparsable values become <NA>)
numeric_scores = pd.to_numeric(submissions_with_at_least_x_comments['score'], errors='coerce')
submissions_with_at_least_x_comments['score'] = numeric_scores.astype('Int64')

# Keep the submissions whose score lies inside [MIN_SCORE, MAX_SCORE]
score_in_window = submissions_with_at_least_x_comments['score'].between(MIN_SCORE, MAX_SCORE)
submissions_between_scores = submissions_with_at_least_x_comments[score_in_window].reset_index(drop=True)
print(f'Number of submissions with a score between {MIN_SCORE} and {MAX_SCORE}: {len(submissions_between_scores)}')
submissions_between_scores

## Select comments that belong to the filtered submissions

In [None]:
filtered_comments = comments_en[comments_en['code'].isin(submissions_between_scores['code'])].reset_index(drop=True)
filtered_comments

## Select comments that have a score between x and y

In [None]:
# Convert score to integer
filtered_comments['score'] = pd.to_numeric(filtered_comments['score'], errors='coerce').astype('Int64')

# Convert NAType to 0
filtered_comments['score'] = filtered_comments['score'].fillna(0)

In [None]:
# Turn all the scores into integers.
# pd.to_numeric also handles negative scores (str.isdigit() rejects "-5") and
# missing values (NaN/<NA> have no .isdigit()); unparsable scores are dropped.
comments_score_list = (
    pd.to_numeric(filtered_comments['score'], errors='coerce')
    .dropna()
    .astype(int)
    .tolist()
)
print(f"Min score: {min(comments_score_list)}, Max score: {max(comments_score_list)}")

In [None]:
plot_score_distribution(comments_score_list, min(comments_score_list), max(comments_score_list))

In [None]:
# Select the comments that have a score from x to y
MIN_SCORE = 10
MAX_SCORE = max(comments_score_list)

# Select the submissions that have a score between min_score and max_score
comments_between_scores = filtered_comments[
    (filtered_comments['score'] >= MIN_SCORE) & 
    (filtered_comments['score'] <= MAX_SCORE)
    ].reset_index(drop=True)
print(f'Number of comments with a score between {MIN_SCORE} and {MAX_SCORE}: {len(comments_between_scores)}')
comments_between_scores

In [None]:
# From submissions_between_scores, select only the rows that have the 'code' in comments_between_scores
submissions_filtered = submissions_between_scores[submissions_between_scores['code'].isin(comments_between_scores['code'])].reset_index(drop=True)
submissions_filtered

## Create a list of comments for each submission

In [None]:
# Add a new "comments" column to a copy of the submissions_filtered DataFrame
submissions_filtered_with_comments_en = submissions_filtered.copy()

# Group the comments by submission code once, instead of scanning the whole
# comments DataFrame for every submission. Each group keeps the original row
# order and all the columns, so the lists match a per-code boolean filter.
comments_by_code = {
    code: group.values.tolist()
    for code, group in comments_between_scores.groupby('code', sort=False)
}

# Store, for each submission, the string representation of its list of
# comments (an empty list when the submission has none). Local names are used
# so the `comments` DataFrame defined earlier in the notebook is not overwritten.
submissions_filtered_with_comments_en['comments'] = [
    str(comments_by_code.get(code, []))
    for code in tqdm(submissions_filtered_with_comments_en['code'], total=len(submissions_filtered_with_comments_en))
]
submissions_filtered_with_comments_en

In [None]:
# Save the dataset
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
output_file = os.path.join(SUBMISSIONS_DIR, 'submissions_filtered_with_comments_en.parquet')
submissions_filtered_with_comments_en.to_parquet(output_file, index=False)

# Submissions filtered with comments

In [None]:
# Load submissions with comments dataset
SUBMISSIONS_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit/submissions/')
output_file = os.path.join(SUBMISSIONS_DIR, 'submissions_filtered_with_comments_en.parquet')
submissions_filtered_with_comments_en = pd.read_parquet(output_file)
submissions_filtered_with_comments_en

In [None]:
# Check nan values
print(submissions_filtered_with_comments_en.isnull().sum())

In [None]:
# Show the rows with nan values
submissions_filtered_with_comments_en[submissions_filtered_with_comments_en.isnull().any(axis=1)]

In [None]:
# Remove row
submissions_filtered_with_comments_en.dropna(inplace=True)

In [None]:
# Check nan values
print(submissions_filtered_with_comments_en.isnull().sum())

In [None]:
# Save the data to a CSV file
output_file = os.path.join(SUBMISSIONS_DIR, 'submissions_filtered_with_comments_en.parquet')
submissions_filtered_with_comments_en.to_parquet(output_file, index=False)

# Generate daily and hourly reddit dataset
 

In [None]:
# Load submissions with comments dataset
SUBMISSIONS_DIR = os.path.join(ROOT, SOCIAL_DATASET_PATH, 'reddit/submissions/')
output_file = os.path.join(SUBMISSIONS_DIR, 'submissions_filtered_with_comments_en.parquet')
submissions_filtered_with_comments_en = pd.read_parquet(output_file)
submissions_filtered_with_comments_en

In [None]:
# Retrieve the start and end date of the dataset
start_date = submissions_filtered_with_comments_en['created'].min().split()[0]
end_date = submissions_filtered_with_comments_en['created'].max().split()[0]

In [None]:
# Generate two temp dataset (daily and hourly) with just the timestamp column that start from START_DATE and end at END_DATE
# Daily dataset has the format: 2024-03-16
# Hourly dataset has the format: 2024-03-16 20:00:00

reddit_daily = pd.date_range(start=start_date, end=end_date, freq='D').to_frame(index=False, name='timestamp')
reddit_daily['timestamp'] = reddit_daily['timestamp'].dt.date

# For the hourly dataset, generate 2 coluns: timestamp_begin and timestamp_end where timestamp_end is timestamp_begin + 1 hour
# reddit_hourly = pd.date_range(start=START_DATE, end=END_DATE, freq='h').to_frame(index=False, name='timestamp')
# reddit_hourly['timestamp_begin'] = reddit_hourly['timestamp']
# reddit_hourly['timestamp_end'] = reddit_hourly['timestamp'] + pd.Timedelta(hours=1)
# reddit_hourly.drop(columns=['timestamp'], inplace=True)

# Turn the timestamp column into string
reddit_daily['timestamp'] = reddit_daily['timestamp'].astype(str)
# reddit_hourly['timestamp_begin'] = reddit_hourly['timestamp_begin'].astype(str)
# reddit_hourly['timestamp_end'] = reddit_hourly['timestamp_end'].astype(str)

# Add 'reddit' column
reddit_daily['reddit'] = None
# reddit_hourly['reddit'] = None

In [None]:
reddit_daily

In [None]:
# reddit_hourly

## Daily

In [None]:
# Generate reddit_daily
reddit_daily_copy = reddit_daily.copy()

# 'created' strings are matched by prefix, like the comment timestamps below
# (the date is the first token of 'created', see start_date/end_date above)
created_column = submissions_filtered_with_comments_en['created']

# Iterate over the daily dataset
for index, row in tqdm(reddit_daily_copy.iterrows(), total=len(reddit_daily_copy)):
    # Get current timestamp (a 'YYYY-MM-DD' string)
    curr_timestamp = row['timestamp']

    # Select the submissions that have been created during the curr_timestamp
    # Example:
        # if curr_timestamp is 2018-01-01, then select the submission items that have been published from 2018-01-01 00:00:00 to 2018-01-01 23:59:59
    day_submissions = submissions_filtered_with_comments_en[
        created_column.str.startswith(curr_timestamp, na=False)
    ]

    # No submissions for this day: save an empty list (as a string, which is
    # the marker the following cells compare against)
    if len(day_submissions) == 0:
        reddit_daily_copy.at[index, 'reddit'] = str([])
        continue

    updated_submissions = []
    for _, day_submission in day_submissions.iterrows():
        # The comments are stored as the string representation of a list of rows
        all_comments = ast.literal_eval(day_submission['comments'])

        # Keep the comments created during the curr_timestamp.
        # NOTE(review): index 2 is assumed to be the comment 'created' field — confirm
        # against the column order of the comments dataset.
        day_comments = [comment for comment in all_comments if comment[2].startswith(curr_timestamp)]
        if len(day_comments) == 0:
            day_comments = str([])

        # Replace the comments list with the filtered one (on a copy of the row)
        record = day_submission.copy()
        record['comments'] = day_comments

        # Append the submission to the reddit list without column names
        updated_submissions.append(list(record))

    # Store the list of submissions in the reddit column
    reddit_daily_copy.at[index, 'reddit'] = updated_submissions
reddit_daily_copy

In [None]:
# Count the number of NOT empty rows (different from '[]') in the reddit column
not_empty_rows = reddit_daily_copy[reddit_daily_copy['reddit'] != '[]']
print(f'Number of rows that are not empty: {len(not_empty_rows)}')
not_empty_rows

In [None]:
# Show an example
row = not_empty_rows['reddit'][0] # [x] x: row
print(f"Row has {len(row)} submissions")
row

In [None]:
submission = row[25]
print(f"Submission has {len(submission[12])} comments")
submission[12]

In [None]:
# Count the number of empty rows (equal to '['empty']') in the reddit column
empty_rows = reddit_daily_copy[reddit_daily_copy['reddit'] == '[]']
print("Total number of '[]' occurrences in the reddit column:", empty_rows.shape[0])
empty_rows

In [None]:
# # Count the number of '['empty']' in the comment list of each reddit submissions
# empty_rows = []
# # Select the submissions that don't have '['empty']' in the reddit column
# submissions = reddit_daily_copy[reddit_daily_copy['reddit'].str.contains(r'\[\'empty\'\]') == False]

# # Iterate over the submissions
# for i, row in submissions.iterrows():
#     try:
#         submissions = ast.literal_eval(row["reddit"])
#         for j, s in enumerate(submissions): 
#             comments = ast.literal_eval(s[12])
#             if comments == "['empty']":
#                 empty_rows.append(s)
#     except:
#         print(f'Error at index {i}, {j}')
#         print(s)
#         print(comments)

# empty_rows = pd.DataFrame(empty_rows)
# print("Total number of submissions without any comments ('['empty']' occurrences in the comment list of each reddit submissions):", empty_rows.shape[0])
# empty_rows

In [None]:
# Set the timestamp as the index
reddit_daily_copy = reddit_daily_copy.set_index('timestamp', drop=False)
reddit_daily = reddit_daily_copy
reddit_daily

## Hourly

In [None]:
# # Generate reddit_hourly
# reddit_hourly_copy = reddit_hourly.copy()

# # Iterate over the hourly dataset
# for index, row in tqdm(reddit_hourly_copy.iterrows(), total=len(reddit_hourly_copy)):
#     # Get the timestamp
#     timestamp_begin = row['timestamp_begin']
#     timestamp_end = row['timestamp_end']
#     # Select the submission rows items that have been posted between timestamp_begin and timestamp_end, save them as a list
#     # Example: if timestamp_begin is 2018-01-01 00:00:00 and timestamp_end is 2018-01-01 01:00:00
#     # select the submission items that have been published from 2018-01-01 00:00:00 to 2018-01-01 00:59:59
#     # submissions = [submission1, submission2, submission3]
#     filtered_submissions = submissions_filtered_with_comments_en[
#         (submissions_filtered_with_comments_en['created'] >= timestamp_begin) & 
#         (submissions_filtered_with_comments_en['created'] < timestamp_end)
#     ]
    
#     # Check if there are any submissions
#     if len(filtered_submissions) == 0:
#         reddit_hourly_copy.at[index, 'reddit'] = str([])
#     else:
#         updated_submissions = []
#         # For each filtered submission, select all the comments that have been posted between timestamp_begin and timestamp_end
#         for i, submission in filtered_submissions.iterrows():
#             filtered_comments = []
#             # Select all the comments of the submission
#             comments = ast.literal_eval(submission['comments'])

#             # Check if there are any comments
#             if len(comments) != 0:
#                 # Select the comments that have been posted between timestamp_begin and timestamp_end, save them as a list
#                 # Example: if timestamp_begin is 2018-01-01 00:00:00 and timestamp_end is 2018-01-01 01:00:00
#                 # select the comments that have been published from 2018-01-01 00:00:00 to 2018-01-01 00:59:59
#                 # comments = [comment1, comment2, comment3]
#                 for comment in comments:
#                     if comment[2] >= timestamp_begin and comment[2] < timestamp_end:
#                         filtered_comments.append(comment)
#                 if len(filtered_comments) == 0:
#                     filtered_comments = str([])
#             else:
#                 filtered_comments = str([])

#             # Replace the comments list with the new comments list
#             submission['comments'] = filtered_comments

#             # Append the submission to the reddit list without columns
#             updated_submissions.append(list(submission))

#         # Append the news list to the reddit column
#         reddit_hourly_copy.at[index, 'reddit'] = updated_submissions
# reddit_hourly_copy

In [None]:
# # Count the number of NOT empty rows(different from '[]') in the cointredditelegraph column
# not_empty_rows = reddit_hourly_copy[reddit_hourly_copy['reddit'] != '[]']
# print("Total number of NOT '[]' occurrences in the reddit column:", not_empty_rows.shape[0])
# not_empty_rows

In [None]:
# # Show an example
# row = not_empty_rows['reddit'][5] # [x] x: row
# print(f"Row has {len(row)} submissions")
# row

In [None]:
# submission = row[0]
# print(f"Submission has {len(submission[12])} comments")
# submission[12]

In [None]:
# # Count the number of empty rows (equal to '[]') in the reddit column
# empty_rows = reddit_hourly_copy[reddit_hourly_copy['reddit'] == '[]']
# print("Total number of '[]' occurrences in the reddit column:", empty_rows.shape[0])
# empty_rows

In [None]:
# # Count the number of '['empty']' in the comment list of each reddit submissions
# empty_rows = []
# # Select the submissions that don't have '['empty']' in the reddit column
# submissions = reddit_hourly_copy[reddit_hourly_copy['reddit'].str.contains(r'\[\'empty\'\]') == False]

# # Iterate over the submissions
# for i, row in submissions.iterrows():
#     try:
#         submissions = ast.literal_eval(row["reddit"])
#         for j, s in enumerate(submissions): 
#             comments = ast.literal_eval(s[12])
#             if comments == "['empty']":
#                 empty_rows.append(s)
#     except:
#         print(f'Error at index {i}, {j}')
#         print(s)
#         print(comments)

# empty_rows = pd.DataFrame(empty_rows)
# print("Total number of submissions without any comments ('['empty']' occurrences in the comment list of each reddit submissions):", empty_rows.shape[0])
# empty_rows

In [None]:
# # Set the timestamp as the index
# reddit_hourly_copy = reddit_hourly_copy.set_index('timestamp_begin', drop=False)
# reddit_hourly = reddit_hourly_copy
# reddit_hourly

## Save datasets

In [None]:
# Save the datasets
REDDIT_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit')
reddit_daily.to_csv(os.path.join(REDDIT_DIR, "reddit_daily_grouped.csv"), index=False)

# Save the datasets
# REDDIT_DIR = os.path.join(ROOT ,SOCIAL_DATASET_PATH, 'reddit')
# reddit_hourly.to_csv(os.path.join(REDDIT_DIR, "reddit_hourly_grouped.csv"), index=False)