In [1]:
# =============================================================================
# Import Libraries
# =============================================================================
import datetime                     # For handling date and time information
import os                           # For reading secrets (API keys) from the environment

import requests                     # For making API calls to CoinMarketCap
import pandas as pd                 # For data manipulation and analysis
import numpy as np                  # For numerical operations
import matplotlib.pyplot as plt     # For plotting graphs
import nltk                         # For Natural Language Processing
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # VADER for sentiment analysis
from sklearn.linear_model import LinearRegression           # For predictive modeling
from sklearn.metrics import mean_absolute_error, r2_score      # For model evaluation


# Initialize the VADER sentiment analyzer.
# SentimentIntensityAnalyzer loads the 'vader_lexicon' resource, which does not
# ship with the nltk package itself. Fetch it only when it is missing so that a
# fresh environment can run this cell, while later runs stay offline.
try:
    nltk.data.find('sentiment/vader_lexicon.zip')
except LookupError:
    nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

In [2]:
# =============================================================================
# Function to Fetch Crypto Data From CoinMarketCap API
# =============================================================================
def get_crypto_data(api_key, start=1, limit=5, convert='USD', timeout=30):
    """
    Fetch the latest cryptocurrency listings from the CoinMarketCap (CMC) API.

    Parameters:
        api_key (str): CMC API key, sent in the 'X-CMC_PRO_API_KEY' header.
        start (int): 1-based offset of the first listing to return
            (1 starts at the first listing).
        limit (int): Number of cryptocurrencies to retrieve.
        convert (str): Currency code for the price data. It must match the key
            used inside each listing's 'quote' dictionary (e.g. 'USD').
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        DataFrame: One row per listing. The listing's own fields are kept and
        the fields of its `quote[convert]` dictionary are appended as columns.
        'last_updated' is the listing-level value, parsed to datetime.
        An empty DataFrame is returned when the API sends back no listings.

    Raises:
        RuntimeError: If the response carries no 'data' payload (for example an
            invalid API key); the API's error message is included when present.
        KeyError: If `convert` is not a key of a listing's 'quote' dictionary.
    """
    url = "https://pro-api.coinmarketcap.com/v1/cryptocurrency/listings/latest"

    parameters = {
        'start': str(start),
        'limit': str(limit),
        'convert': convert
    }

    headers = {
        # 'Accept' is the standard HTTP header name for content negotiation.
        'Accept': 'application/json',
        'X-CMC_PRO_API_KEY': api_key
    }

    # API request. The timeout keeps the notebook from hanging indefinitely
    # when the service does not answer.
    response = requests.get(url, headers=headers, params=parameters, timeout=timeout)
    data = response.json()

    # Failed calls have no 'data' key; surface the API's own explanation
    # instead of letting a bare KeyError('data') escape.
    if not isinstance(data, dict) or 'data' not in data:
        status = data.get('status') if isinstance(data, dict) else None
        detail = (status or {}).get('error_message') or data
        raise RuntimeError(
            f"CoinMarketCap request failed (HTTP {response.status_code}): {detail}"
        )

    crypto_list = data['data']
    if not crypto_list:
        # Nothing to flatten: an empty frame would have no 'quote' column.
        return pd.DataFrame()

    df = pd.DataFrame(crypto_list)

    # Each 'quote' cell is {currency: {...}}; expand the requested currency's
    # dictionary into one column per field.
    df_quote = pd.json_normalize([quote[convert] for quote in df['quote']])

    # The listing already has its own 'last_updated'. Drop the per-quote copy
    # so the concat below does not produce two columns with the same name.
    if 'last_updated' in df_quote.columns:
        df_quote = df_quote.drop(columns=['last_updated'])

    # Combine the listing fields with the flattened quote fields.
    df = pd.concat([df.drop(columns=['quote']), df_quote], axis=1)

    # Parse the listing-level 'last_updated' timestamp strings to datetimes.
    if 'last_updated' in df.columns:
        df['last_updated'] = pd.to_datetime(df['last_updated'])
    return df
     
# Read the CoinMarketCap key from the environment instead of embedding it in
# the notebook: anything committed here is visible to every reader of the file.
# NOTE(review): the key that was previously hard-coded in this cell should be
# treated as leaked and revoked in the CoinMarketCap dashboard.
api_key = os.environ.get('CMC_PRO_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the CMC_PRO_API_KEY environment variable to your CoinMarketCap API key "
        "before running this cell."
    )
cryp_df = get_crypto_data(api_key, start=1, limit=5)
print("Data Sample")
print(cryp_df.head())

Data Sample
     id         name symbol      slug  num_market_pairs  \
0     1      Bitcoin    BTC   bitcoin             11976   
1  1027     Ethereum    ETH  ethereum             10038   
2   825  Tether USDt   USDT    tether            119215   
3    52          XRP    XRP       xrp              1550   
4  1839          BNB    BNB       bnb              2386   

                 date_added  \
0  2010-07-13T00:00:00.000Z   
1  2015-08-07T00:00:00.000Z   
2  2015-02-25T00:00:00.000Z   
3  2013-08-04T00:00:00.000Z   
4  2017-07-25T00:00:00.000Z   

                                                tags    max_supply  \
0  [mineable, pow, sha-256, store-of-value, state...  2.100000e+07   
1  [pos, smart-contracts, ethereum-ecosystem, coi...           NaN   
2  [stablecoin, asset-backed-stablecoin, ethereum...           NaN   
3  [medium-of-exchange, enterprise-solutions, xrp...  1.000000e+11   
4  [marketplace, centralized-exchange, payments, ...           NaN   

   circulating_supply  to

In [None]:
# =============================================================================
# Function: get_reddit_posts
# =============================================================================
def get_reddit_posts(subreddit, size=100, before=None, after=None, timeout=30):
    """
    Fetch Reddit submissions from a subreddit through the Pushshift API.

    Pushshift lets the caller restrict results to a date range via `before` /
    `after`. NOTE(review): the output recorded in this notebook shows the
    endpoint answering {'detail': 'Not authenticated'}, so an unauthenticated
    call may no longer return posts; in that case this function prints the raw
    response and returns an empty DataFrame.

    Parameters:
        subreddit (str): Name of the subreddit (e.g., 'cryptocurrency').
        size (int): Number of posts to request (default 100).
        before (int): (Optional) Timestamp; only posts before this time.
        after (int): (Optional) Timestamp; only posts after this time.
        timeout (float): Seconds to wait for the API before giving up.

    Returns:
        DataFrame: Columns 'created_utc' (datetime) and 'text' (title and
        selftext joined by a single space). Empty, with the same two columns,
        when no posts come back.
    """
    base_url = "https://api.pushshift.io/reddit/search/submission/"
    # Build the parameters for the API call
    params = {
        'subreddit': subreddit,
        'size': size,
        'sort': 'desc',
        'sort_type': 'created_utc'
    }
    # Compare against None so that a timestamp of 0 is still forwarded.
    if before is not None:
        params['before'] = before
    if after is not None:
        params['after'] = after

    # Make the API request to Pushshift; the timeout prevents an indefinite hang.
    response = requests.get(base_url, params=params, timeout=timeout)
    data = response.json()

    # Error payloads (e.g. {'detail': 'Not authenticated'}) have no 'data' key.
    posts = data.get('data', []) if isinstance(data, dict) else []
    if not posts:
        print("No posts were returned. Here is the raw JSON response for debugging:")
        print(data)
        # Return an empty DataFrame with the expected columns so downstream
        # code can keep running.
        return pd.DataFrame(columns=['created_utc', 'text'])

    # For each post keep the timestamp and join title with selftext.
    # `or ''` covers both a missing key and an explicit null value, either of
    # which would otherwise break the string concatenation.
    post_data = [
        {
            'created_utc': post.get('created_utc'),
            'text': (post.get('title') or '') + " " + (post.get('selftext') or ''),
        }
        for post in posts
    ]
    df_posts = pd.DataFrame(post_data)

    # Convert UNIX timestamps (seconds since epoch) to datetime objects;
    # posts without a timestamp become NaT.
    df_posts['created_utc'] = pd.to_datetime(df_posts['created_utc'], unit='s')
    return df_posts

# Demo call: request up to 200 submissions from r/cryptocurrency and preview the result.
reddit_posts_df = get_reddit_posts('cryptocurrency', 200)
print("\nReddit Posts Sample:", reddit_posts_df.head(), sep="\n")

No posts were returned. Here is the raw JSON response for debugging:
{'detail': 'Not authenticated'}

Reddit Posts Sample:
Empty DataFrame
Columns: [created_utc, text]
Index: []


In [6]:
# =============================================================================
# Load Reddit Data from Kaggle
# =============================================================================
# Read the CSV file containing Reddit posts. The file should include columns like:
# 'created_utc', 'title', and 'selftext'.
# The path is relative to the notebook's working directory, so the CSV must be
# placed next to the notebook (or the path adjusted) before running this cell.
reddit_kaggle_df = pd.read_csv('reddit_cryptocurrency_posts.csv')

# If there is no 'text' column, combine 'title' and 'selftext' to form a complete post text.
if 'text' not in reddit_kaggle_df.columns:
    reddit_kaggle_df['text'] = reddit_kaggle_df['title'].fillna('') + " " + reddit_kaggle_df['selftext'].fillna('')

# Convert the 'created_utc' column to datetime.
# Numeric values are treated as UNIX epoch seconds; anything else is left to
# pandas' format inference. The dtype is tested with pandas' own helper because
# comparing a NumPy dtype against the Python builtins `int` / `float` depends on
# the platform's default integer width, and a missed match would make pandas
# read the epoch seconds as nanoseconds.
if pd.api.types.is_numeric_dtype(reddit_kaggle_df['created_utc']):
    reddit_kaggle_df['created_utc'] = pd.to_datetime(reddit_kaggle_df['created_utc'], unit='s')
else:
    reddit_kaggle_df['created_utc'] = pd.to_datetime(reddit_kaggle_df['created_utc'])

print("Loaded Reddit Data from Kaggle:")
print(reddit_kaggle_df.head())


FileNotFoundError: [Errno 2] No such file or directory: 'reddit_cryptocurrency_posts.csv'