In [634]:
#README FIRST: Data Cleaning, Munging and EDA File for Project Three
import os
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_rows', 5) #Ensure this is here to avoiding elongating the file

### Loading in All Data

In [635]:
# Directory path
directory_path = '../data'

# Create an empty list to store DataFrames
dfs_list = []

# Loop through list
for i in range(1, 53):
    # Construct the file path based on the pattern
    file_path = os.path.join(directory_path, f'df_d{i}.csv')
    
    # Check if the file exists before attempting to read it
    if os.path.exists(file_path):
        # Read CSV into DataFrame
        df = pd.read_csv(file_path)
        # Append DataFrame to the list
        dfs_list.append(df)

# Concatenate DataFrames in the list
df_dem = pd.concat(dfs_list).reset_index(drop=True)

# Create an empty list to store DataFrames
dfs_list_rep = []

# Loop through list
for i in range(1, 53):
    # Construct the file path based on the pattern
    file_path_r = os.path.join(directory_path, f'df_r{i}.csv')
    
    # Check if the file exists before attempting to read it
    if os.path.exists(file_path_r):
        # Read CSV into DataFrame
        df_rep = pd.read_csv(file_path_r)
        # Append DataFrame to the list
        dfs_list_rep.append(df_rep)

# Concatenate DataFrames in the list
df_rep = pd.concat(dfs_list_rep).reset_index(drop=True)

#### NOTE:  The drop_duplicates() method removes duplicate rows. Use the subset parameter if only some specified columns should be considered when looking for duplicates.


### Drop duplicates

In [636]:
df_dem = df_dem.drop_duplicates(inplace=False, ignore_index=True)

In [637]:
df_rep = df_rep.drop_duplicates(inplace=False, ignore_index=True)

### Drop Superfluous Columns

In [638]:
columns_to_drop = ['selftext', 'created_utc','name','0']

# for df_dem
df_dem = df_dem.drop(columns=columns_to_drop)

# for df_rep
df_rep = df_rep.drop(columns=columns_to_drop)

### Visual inspection

In [639]:
df_dem

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,democrats,
1,Biden's campaign pushes abortion rights in bat...,democrats,
...,...,...,...
2783,WHAT JOE BIDEN HAS DONE,democrats,
2784,Calls for southern states to mobilize troops t...,democrats,


In [640]:
df_rep

Unnamed: 0,title,subreddit
0,Texas Refuses To Comply With Federal Governmen...,Republican
1,"Florida Bans State, Federal Funding For DEI At...",Republican
...,...,...
2635,Pro Palestinian radical caught mid tirade verb...,Republican
2636,Texas AG is “saying the Department of Homeland...,Republican


### Combining Dataframes 

In [641]:
df_both = pd.concat([df_dem, df_rep]).reset_index(drop=True)
df_both

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,democrats,
1,Biden's campaign pushes abortion rights in bat...,democrats,
...,...,...,...
5420,Pro Palestinian radical caught mid tirade verb...,Republican,
5421,Texas AG is “saying the Department of Homeland...,Republican,


### Dummifying the `subreddit` Column

In [642]:
# Democrats, denoted 1, are taken to be the positive class 

df_both['subreddit'] = np.where(df_both['subreddit'] == 'democrats', 1, 0)

In [643]:
# pd.set_option('display.max_rows', None)
df_both

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,1,
1,Biden's campaign pushes abortion rights in bat...,1,
...,...,...,...
5420,Pro Palestinian radical caught mid tirade verb...,0,
5421,Texas AG is “saying the Department of Homeland...,0,


In [644]:
# Used for moving comments to titles

#### The following steps are to remove comments from these dataframes after a few previous attempts at including them within the data gathering process. 

In [645]:
# Step 1: Create a mask for rows where both 'title' and 'comments' are occupied
mask = (pd.notna(df_both['title'])) & (pd.notna(df_both['comments']))

# Step 2: Replace 'title' with 'comments' for the selected rows
df_both.loc[mask, 'title'] = df_both.loc[mask, 'comments']

# Step 3: Optional - Empty the 'comments' column
df_both['comments'] = ''

# Display the DataFrame
df_both


Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,1,
1,Biden's campaign pushes abortion rights in bat...,1,
...,...,...,...
5420,Pro Palestinian radical caught mid tirade verb...,0,
5421,Texas AG is “saying the Department of Homeland...,0,


### Must drop the moderator autogenerated rows.

In [646]:
# Create a mask to identify rows with the moderator text blanking the comment
mask = df_both['title'].str.contains("This is a place for Republicans to discuss issues with other Republicans.")

# Replace NaN values in the mask with False
mask = mask.fillna(False)

# Use the boolean mask to select rows that meet the condition
df_filtered = df_both[~mask]

# Drop rows with NaN values in the 'title' column
df_filtered.dropna(subset=['title'], inplace=True)

# Reset index if needed
df_filtered.reset_index(drop=True, inplace=True)

In [647]:
# Display the resulting DataFrame
df_filtered

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,1,
1,Biden's campaign pushes abortion rights in bat...,1,
...,...,...,...
4227,Pro Palestinian radical caught mid tirade verb...,0,
4228,Texas AG is “saying the Department of Homeland...,0,


In [648]:
# Drop NaN values in the 'title' column
# df_both['title'].dropna(inplace=True)

# mask = df_both['title'].str.contains("This is a place for Republicans to discuss issues with other Republicans.")

# # Use the boolean mask to drop rows that meet the condition
# df_both = df_both[~mask]

# # Reset index if needed
# df_both.reset_index(drop=True, inplace=True)

In [649]:
df_both

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,1,
1,Biden's campaign pushes abortion rights in bat...,1,
...,...,...,...
5420,Pro Palestinian radical caught mid tirade verb...,0,
5421,Texas AG is “saying the Department of Homeland...,0,


In [650]:
# Remove the comments from `democrats` subreddit
mask = df_filtered['title'].str.contains("{'body':")

# Invert the mask to select rows without the specified tag
df_filtered = df_filtered[~mask]

In [651]:
df_filtered.shape

(3478, 3)

In [652]:
df_filtered

Unnamed: 0,title,subreddit,comments
0,Georgia Prosecutors Had Tense Exchange on Raci...,1,
1,Biden's campaign pushes abortion rights in bat...,1,
...,...,...,...
4227,Pro Palestinian radical caught mid tirade verb...,0,
4228,Texas AG is “saying the Department of Homeland...,0,


### Export and save newly formatted Dataframe as `.csv` file

### As before in the previous file, change tag to reflect the scrapes being loaded 

In [653]:
df_filtered.to_csv('../data/final_df_to53.csv', index=False) # includes 1-52