In [1]:
#pip install python-dotenv

In [2]:
import pandas as pd
import os
from dotenv import load_dotenv
import json
import re
import requests
from extract_reddit_data import RedditAPI
import numpy as np

In [3]:
reddit_api = RedditAPI()
cryptocurrency = reddit_api.get_crypto_curr_data()
cryptomarkets = reddit_api.get_crypto_markets_data()
bitcoin = reddit_api.get_bitcoin_data()
eth= reddit_api.get_eth_data()
binance = reddit_api.get_binance_data()
solana = reddit_api.get_solana_data()
ripple = reddit_api.get_ripple_data()
cardano = reddit_api.get_cardano_data()
tronix = reddit_api.get_tronix_data()
chainlink = reddit_api.get_chainlink_data()

## understand the Databases 


In [5]:
def print_dataframe_shape(df, name):
    rows, columns = df.shape
    print(f"{name} database has {rows} rows and {columns} columns.")

print_dataframe_shape(cryptocurrency, "Cryptocurrency")
print_dataframe_shape(cryptomarkets, "Crypto Markets")
print_dataframe_shape(bitcoin, "Bitcoin")
print_dataframe_shape(eth, "Ethereum")
print_dataframe_shape(binance, "Binance")
print_dataframe_shape(solana, "Solana")
print_dataframe_shape(ripple, "Ripple")
print_dataframe_shape(cardano, "Cardano")
print_dataframe_shape(tronix, "Tronix")
print_dataframe_shape(chainlink, "Chainlink")

Cryptocurrency database has 102 rows and 9 columns.
Crypto Markets database has 101 rows and 9 columns.
Bitcoin database has 102 rows and 9 columns.
Ethereum database has 100 rows and 9 columns.
Binance database has 101 rows and 9 columns.
Solana database has 102 rows and 9 columns.
Ripple database has 102 rows and 9 columns.
Cardano database has 102 rows and 9 columns.
Tronix database has 100 rows and 9 columns.
Chainlink database has 100 rows and 9 columns.


In [6]:
def dataframe_description(df, name):
    print(f"{name} description:")
    info = df.describe()
    return info

# Print description for each DataFrame
print(dataframe_description(cryptocurrency, "Cryptocurrency"))
print(dataframe_description(cryptomarkets, "Crypto Markets"))
print(dataframe_description(bitcoin, "Bitcoin"))
print(dataframe_description(eth, "Ethereum"))
print(dataframe_description(binance, "Binance"))
print(dataframe_description(solana, "Solana"))
print(dataframe_description(ripple, "Ripple"))
print(dataframe_description(cardano, "Cardano"))
print(dataframe_description(tronix, "Tronix"))
print(dataframe_description(chainlink, "Chainlink"))

Cryptocurrency description:
       upvote_ratio          ups  downs        score
count    102.000000   102.000000  102.0   102.000000
mean       0.628431   108.382353    0.0   108.382353
std        0.237407   338.261076    0.0   338.261076
min        0.100000     0.000000    0.0     0.000000
25%        0.450000     0.000000    0.0     0.000000
50%        0.655000    13.000000    0.0    13.000000
75%        0.847500    52.750000    0.0    52.750000
max        0.950000  2831.000000    0.0  2831.000000
Crypto Markets description:
       upvote_ratio         ups  downs       score
count    101.000000  101.000000  101.0  101.000000
mean       0.722376    4.722772    0.0    4.722772
std        0.228434   10.624612    0.0   10.624612
min        0.250000    0.000000    0.0    0.000000
25%        0.600000    1.000000    0.0    1.000000
50%        0.670000    1.000000    0.0    1.000000
75%        1.000000    4.000000    0.0    4.000000
max        1.000000   80.000000    0.0   80.000000
Bitcoin 

In [7]:
def missing_values(df, name):
    print(f"{name} number of missing values:")
    missing_values = df.isna().sum()
    print(missing_values)
    return missing_values

# Checking missing values for each DataFrame
print(missing_values(cryptocurrency, "Cryptocurrency"))
print(missing_values(cryptomarkets, "Crypto Markets"))
print(missing_values(bitcoin, "Bitcoin"))
print(missing_values(eth, "Ethereum"))
print(missing_values(binance, "Binance"))
print(missing_values(solana, "Solana"))
print(missing_values(ripple, "Ripple"))
print(missing_values(cardano, "Cardano"))
print(missing_values(tronix, "Tronix"))
print(missing_values(chainlink, "Chainlink"))

Cryptocurrency number of missing values:
approved_date    102
thread_id          0
subreddit          0
title              0
body               0
upvote_ratio       0
ups                0
downs              0
score              0
dtype: int64
approved_date    102
thread_id          0
subreddit          0
title              0
body               0
upvote_ratio       0
ups                0
downs              0
score              0
dtype: int64
Crypto Markets number of missing values:
approved_date    101
thread_id          0
subreddit          0
title              0
body               0
upvote_ratio       0
ups                0
downs              0
score              0
dtype: int64
approved_date    101
thread_id          0
subreddit          0
title              0
body               0
upvote_ratio       0
ups                0
downs              0
score              0
dtype: int64
Bitcoin number of missing values:
approved_date    102
thread_id          0
subreddit          0
title        

In [8]:
def clean_body_column(df, name):
    print(f"Cleaning 'body' column for {name} database.")
    # Replace empty strings with NaN
    df['body'].replace('', np.nan, inplace=True)
    # Drop rows with NaN values in the 'body' column
    df.dropna(subset=['body'], inplace=True)
    print(f"Number of rows after cleaning: {len(df)}")
    return df

# Cleaning 'body' column for each DataFrame
cryptocurrency = clean_body_column(cryptocurrency, "Cryptocurrency")
cryptomarkets = clean_body_column(cryptomarkets, "Crypto Markets")
bitcoin = clean_body_column(bitcoin, "Bitcoin")
eth = clean_body_column(eth, "Ethereum")
binance = clean_body_column(binance, "Binance")
solana = clean_body_column(solana, "Solana")
ripple = clean_body_column(ripple, "Ripple")
cardano = clean_body_column(cardano, "Cardano")
tronix = clean_body_column(tronix, "Tronix")
chainlink = clean_body_column(chainlink, "Chainlink")

Cleaning 'body' column for Cryptocurrency database.
Number of rows after cleaning: 46
Cleaning 'body' column for Crypto Markets database.
Number of rows after cleaning: 55
Cleaning 'body' column for Bitcoin database.
Number of rows after cleaning: 67
Cleaning 'body' column for Ethereum database.
Number of rows after cleaning: 72
Cleaning 'body' column for Binance database.
Number of rows after cleaning: 84
Cleaning 'body' column for Solana database.
Number of rows after cleaning: 97
Cleaning 'body' column for Ripple database.
Number of rows after cleaning: 90
Cleaning 'body' column for Cardano database.
Number of rows after cleaning: 56
Cleaning 'body' column for Tronix database.
Number of rows after cleaning: 76
Cleaning 'body' column for Chainlink database.
Number of rows after cleaning: 65


In [9]:
def drop_approved_date_column(df, name):
    if 'approved_date' in df.columns:
        df = df.drop(columns=['approved_date'])
        print(f"'approved_date' column removed from {name} database.")
    else:
        print(f"No 'approved_date' column found in {name} database.")
    return df

# Dropping 'approved_date' column from each DataFrame
cryptocurrency = drop_approved_date_column(cryptocurrency, "Cryptocurrency")
cryptomarkets = drop_approved_date_column(cryptomarkets, "Crypto Markets")
bitcoin = drop_approved_date_column(bitcoin, "Bitcoin")
eth = drop_approved_date_column(eth, "Ethereum")
binance = drop_approved_date_column(binance, "Binance")
solana = drop_approved_date_column(solana, "Solana")
ripple = drop_approved_date_column(ripple, "Ripple")
cardano = drop_approved_date_column(cardano, "Cardano")
tronix = drop_approved_date_column(tronix, "Tronix")
chainlink = drop_approved_date_column(chainlink, "Chainlink")

'approved_date' column removed from Cryptocurrency database.
'approved_date' column removed from Crypto Markets database.
'approved_date' column removed from Bitcoin database.
'approved_date' column removed from Ethereum database.
'approved_date' column removed from Binance database.
'approved_date' column removed from Solana database.
'approved_date' column removed from Ripple database.
'approved_date' column removed from Cardano database.
'approved_date' column removed from Tronix database.
'approved_date' column removed from Chainlink database.


In [12]:
def duplicated_dataframe(df, name):
    duplicate = df.duplicated().sum()
    print(f"{name} has {duplicate} duplicate values.")

# Checking duplicate for each DataFrame
duplicated_dataframe(cryptocurrency, "Cryptocurrency")
duplicated_dataframe(cryptomarkets, "Crypto Markets")
duplicated_dataframe(bitcoin, "Bitcoin")
duplicated_dataframe(eth, "Ethereum")
duplicated_dataframe(binance, "Binance")
duplicated_dataframe(solana, "Solana")
duplicated_dataframe(ripple, "Ripple")
duplicated_dataframe(cardano, "Cardano")
duplicated_dataframe(tronix, "Tronix")
duplicated_dataframe(chainlink, "Chainlink")

Cryptocurrency has 0 duplicate values.
Crypto Markets has 0 duplicate values.
Bitcoin has 0 duplicate values.
Ethereum has 0 duplicate values.
Binance has 0 duplicate values.
Solana has 0 duplicate values.
Ripple has 0 duplicate values.
Cardano has 0 duplicate values.
Tronix has 0 duplicate values.
Chainlink has 0 duplicate values.


In [10]:
binance.duplicated().sum()

0