## Read and structure all Reddit data

In [1]:
# Import libraries
import pandas as pd

In [2]:
path_df_3_filtered = "data/df_3_filtered.csv"
path_df_2_filtered = "data/df_2_filtered.csv"
path_df_1_filtered = "data/df_1_filtered.csv"
path_df_filtered = "data/df_filtered.csv"

In [3]:
df = pd.read_csv(path_df_filtered)
df_1 = pd.read_csv(path_df_1_filtered)
df_2 = pd.read_csv(path_df_2_filtered)
df_3 = pd.read_csv(path_df_3_filtered)

In [4]:
# Combine two dataframes
df_all_reddit = pd.concat([df, df_1, df_2, df_3])
df_all_reddit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281963 entries, 0 to 58928
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   subreddit           281963 non-null  object
 1   keyword             281963 non-null  object
 2   title               281963 non-null  object
 3   text                223460 non-null  object
 4   time_posted         281963 non-null  object
 5   url                 281963 non-null  object
 6   number_of_comments  281963 non-null  int64 
 7   score               281963 non-null  int64 
 8   author              281963 non-null  object
dtypes: int64(2), object(7)
memory usage: 21.5+ MB


In [6]:
df_all_reddit = df_all_reddit.convert_dtypes()
df_all_reddit.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281963 entries, 0 to 58928
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   subreddit           281963 non-null  string
 1   keyword             281963 non-null  string
 2   title               281963 non-null  string
 3   text                223460 non-null  string
 4   time_posted         281963 non-null  string
 5   url                 281963 non-null  string
 6   number_of_comments  281963 non-null  Int64 
 7   score               281963 non-null  Int64 
 8   author              281963 non-null  string
dtypes: Int64(2), string(7)
memory usage: 22.0 MB


In [8]:
# Discard entries containing NaN date
df_reddit_clean = df_all_reddit.dropna(subset=["time_posted"])
df_reddit_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 281963 entries, 0 to 58928
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   subreddit           281963 non-null  string
 1   keyword             281963 non-null  string
 2   title               281963 non-null  string
 3   text                223460 non-null  string
 4   time_posted         281963 non-null  string
 5   url                 281963 non-null  string
 6   number_of_comments  281963 non-null  Int64 
 7   score               281963 non-null  Int64 
 8   author              281963 non-null  string
dtypes: Int64(2), string(7)
memory usage: 22.0 MB


In [9]:
# Discard unused columns
df_reddit_clean = df_reddit_clean.drop(columns=["url"])
df_reddit_clean.count()

subreddit             281963
keyword               281963
title                 281963
text                  223460
time_posted           281963
number_of_comments    281963
score                 281963
author                281963
dtype: int64

In [10]:
# Convert data types
df_reddit_clean["date"] = pd.to_datetime(df_reddit_clean["time_posted"], errors="coerce")
df_reddit_clean.head()

Unnamed: 0,subreddit,keyword,title,text,time_posted,number_of_comments,score,author,date
0,CryptoCurrency,Bitcoin,Bitcoin Set to Become More Dominant Even as BT...,,2024-04-30 12:30:17,2,4,kirtash93,2024-04-30 12:30:17
1,CryptoCurrency,Bitcoin,Hong Kong Welcomes Spot Bitcoin and Ethereum E...,,2024-04-30 11:51:32,4,10,asso,2024-04-30 11:51:32
2,CryptoCurrency,Bitcoin,"Except solely HODLING BTC, diversification is ...",TLDR: Buying bitcoin is the always the best op...,2024-04-30 11:42:26,4,4,DecentralizeCosmos,2024-04-30 11:42:26
3,CryptoCurrency,Bitcoin,"Bitcoin, Ethereum spot ETFs start trading in H...",,2024-04-30 09:42:01,11,14,0xJonnyDee,2024-04-30 09:42:01
4,CryptoCurrency,Bitcoin,"MicroStrategy Adds 122 BTC for $7.8M, Now Hold...",,2024-04-30 08:08:48,37,81,OcelotWarm8822,2024-04-30 08:08:48


## Write clean data into a parquet file

In [11]:
df_reddit_clean.to_parquet("data/reddit_clean.parquet")