<a href="https://colab.research.google.com/github/Aayush015/Research_crypto_price_analysis/blob/main/Crypto_Price_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Description of the Project

This project aims to predict cryptocurrency prices for the year 2024 based on news-media sentiment and Reddit post sentiment from 2017 - 2023. The idea is to collect the 10 most popular Reddit posts per hour and the 10 most popular news items per hour from 2017 - 2023 for four cryptocurrencies: Bitcoin, Ethereum, Shiba Inu, and Dogecoin. We will combine this data with the hourly crypto price data we collected and train a machine learning model to learn patterns. We will then use this model to forecast prices for 2024 and test the forecasts against the actual crypto price fluctuations in 2024.

# Data Collection
First, we will collect crypto price data, Reddit data, and traditional news data, each from its own API.

In [2]:
#!/usr/bin/env python
import requests
import pandas as pd
from datetime import datetime, timedelta
import time

def fetch_crypto_data_hourly(crypto_id, start_date, end_date, timeout=30):
    """
    Fetch hourly historical price data for a cryptocurrency from CoinCap API.

    The requested range is walked in windows of at most 30 days, issuing one
    HTTP request per window. If a request fails, the error is printed and
    whatever was collected so far is returned (the result may be partial).

    Note: the window bounds are converted with ``datetime.timestamp()``, which
    interprets naive datetime objects in the local timezone of the machine.

    :param crypto_id: Cryptocurrency ID (e.g., 'bitcoin', 'ethereum').
    :param start_date: Start date (datetime object).
    :param end_date: End date (datetime object).
    :param timeout: Seconds to wait for each HTTP response before giving up.
    :return: List of historical hourly data for the cryptocurrency.
    """
    url = f"https://api.coincap.io/v2/assets/{crypto_id}/history"
    all_data = []

    # Fetch data in intervals to avoid API limitations
    while start_date < end_date:
        interval_end = min(start_date + timedelta(days=30), end_date)  # Fetch 1 month at a time
        params = {
            "interval": "h1",  # Hourly data
            "start": int(start_date.timestamp() * 1000),  # Start timestamp in ms
            "end": int(interval_end.timestamp() * 1000),  # End timestamp in ms
        }

        try:
            # Without a timeout a stalled connection would block forever
            response = requests.get(url, params=params, timeout=timeout)
            response.raise_for_status()
            # Treat a missing or null "data" field as an empty window
            data = response.json().get("data") or []
        except (requests.exceptions.RequestException, ValueError) as ex:
            # ValueError covers an undecodable JSON body on older requests versions
            print(f"Error fetching data for {crypto_id}: {ex}")
            break

        # Add the data to the collection
        all_data.extend(data)

        start_date = interval_end + timedelta(hours=1)  # Move to the next interval
        time.sleep(1)  # Sleep to respect API rate limits

    return all_data

def save_to_csv(data, filename):
    """
    Write historical price records to a CSV file.

    Prints a notice and writes nothing when ``data`` is empty.

    :param data: List of historical price data.
    :param filename: Output CSV filename.
    """
    if data:
        frame = pd.DataFrame(data)
        # Millisecond epoch values become readable datetimes
        frame["time"] = pd.to_datetime(frame["time"], unit="ms")
        # Use a snake_case name for the price column
        frame = frame.rename(columns={"priceUsd": "price_usd"})
        frame.to_csv(filename, index=False)
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

def run():
    """
    Download hourly price history for each tracked cryptocurrency and write
    one CSV file per coin.
    """
    # Collection window (YYYY, MM, DD)
    start_date = datetime(2021, 10, 12)
    end_date = datetime(2023, 12, 19)

    # Display name -> asset ID; the two are identical for every coin here
    cryptos = {
        coin: coin
        for coin in (
            "bitcoin",
            "ethereum",
            "dogecoin",
            "shiba-inu",
            "uniswap",
            "aave",
            "compound",
            "flow",
            "decentraland",
            "the-sandbox",
        )
    }

    for crypto_name, crypto_id in cryptos.items():
        print(f"Fetching hourly data for {crypto_name}...")
        records = fetch_crypto_data_hourly(crypto_id, start_date, end_date)

        # Tag every record with the coin it belongs to
        for record in records:
            record["crypto"] = crypto_name

        # One output file per coin
        save_to_csv(records, f"{crypto_name}_hourly_2021-10-12_to_2023-12-19.csv")

    print("Hourly data collection completed.")


if __name__ == "__main__":
    run()


Fetching hourly data for bitcoin...
Data saved to bitcoin_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for ethereum...
Data saved to ethereum_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for dogecoin...
Data saved to dogecoin_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for shiba-inu...
Data saved to shiba-inu_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for uniswap...
Data saved to uniswap_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for aave...
Data saved to aave_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for compound...
Data saved to compound_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for flow...
Data saved to flow_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for decentraland...
Data saved to decentraland_hourly_2021-10-12_to_2023-12-19.csv
Fetching hourly data for the-sandbox...
Data saved to the-sandbox_hourly_2021-10-12_to_2023-12-19.csv
Hourly data collection completed.


# Merge Altcoins into one, NFTs into one, and DeFi into one.

In [3]:
import pandas as pd
from datetime import datetime

# Coins grouped by the category label used for their price data
altcoin_coins = [
    "shiba-inu",
    "dogecoin",
]
nft_coins = [
    "flow",
    "decentraland",
    "the-sandbox",
]
defi_coins = [
    "uniswap",
    "aave",
    "compound",
]
# Coins that form a category of their own
standalone_coins = [
    "bitcoin",
    "ethereum",
]

# Function to load and process price data for each category
def load_price_data(coin_list, category_label):
    """
    Load the hourly price CSV of each coin and stack them into one DataFrame.

    Each file is read from ``{coin}_hourly_2021-10-12_to_2023-12-19.csv`` in
    the current working directory. Its ``time`` column is parsed to datetimes,
    and ``category`` and ``coin`` columns are added.

    :param coin_list: Coin names whose CSV files should be loaded.
    :param category_label: Value written to the ``category`` column of every row.
    :return: Combined DataFrame with a fresh 0..n-1 index; an empty DataFrame
             when ``coin_list`` is empty.
    """
    frames = []
    for coin in coin_list:
        file_path = f"{coin}_hourly_2021-10-12_to_2023-12-19.csv"
        df = pd.read_csv(file_path)
        df["time"] = pd.to_datetime(df["time"])
        df["category"] = category_label
        df["coin"] = coin
        frames.append(df)

    # pd.concat raises on an empty list, so keep the empty-frame result explicit
    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per coin
    return pd.concat(frames, ignore_index=True)

# Load one labelled price table per multi-coin category
altcoin_data, nft_data, defi_data = (
    load_price_data(coins, label)
    for coins, label in (
        (altcoin_coins, "altcoin"),
        (nft_coins, "nft"),
        (defi_coins, "defi"),
    )
)
# Bitcoin and Ethereum each form their own single-coin category
bitcoin_data, ethereum_data = (
    load_price_data([coin], coin) for coin in ("bitcoin", "ethereum")
)

# Stack every category into one table with a fresh index
category_frames = [altcoin_data, nft_data, defi_data, bitcoin_data, ethereum_data]
all_price_data = pd.concat(category_frames, ignore_index=True)

# Persist the combined table for the later cells
all_price_data.to_csv("combined_price_data.csv", index=False)
print("Combined price data saved to 'combined_price_data.csv'")

Combined price data saved to 'combined_price_data.csv'


## Merge News and Price data

* Preprocess news data (sentiment components are extracted into separate columns using `ast.literal_eval()`).
* Round news data to its nearest hour.

In [10]:
import pandas as pd
import ast  # For safely evaluating the sentiment dictionary string

# Read the combined price table and the raw news articles
price_data = pd.read_csv("combined_price_data.csv")
news_data = pd.read_csv("cryptonews.csv")

# Parse timestamps; news dates come in mixed formats and unparseable ones become NaT
price_data['time'] = pd.to_datetime(price_data['time'])
parsed_dates = pd.to_datetime(news_data['date'], format='mixed', errors='coerce')
news_data['date'] = parsed_dates

# Snap each article to its nearest hour
news_data['rounded_time'] = parsed_dates.dt.round('h')

# Parse the sentiment dictionary into separate columns
def parse_sentiment(sentiment_str):
    """
    Split a stringified sentiment dictionary into separate fields.

    The string is parsed with ``ast.literal_eval`` (literals only, no code
    execution). Any value that cannot be parsed into a dictionary yields a
    row of ``None`` values instead of raising.

    :param sentiment_str: String form of a dict with 'class', 'polarity' and
                          'subjectivity' keys.
    :return: pandas Series with 'sentiment_class', 'sentiment_polarity' and
             'sentiment_subjectivity' entries.
    """
    empty_row = {
        "sentiment_class": None,
        "sentiment_polarity": None,
        "sentiment_subjectivity": None,
    }

    try:
        sentiment_dict = ast.literal_eval(sentiment_str)  # Safely evaluate the string
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only parse failures are swallowed; KeyboardInterrupt etc. still propagate
        return pd.Series(empty_row)

    # A valid literal that is not a dict (e.g. a list) carries no sentiment fields
    if not isinstance(sentiment_dict, dict):
        return pd.Series(empty_row)

    return pd.Series({
        "sentiment_class": sentiment_dict.get("class"),
        "sentiment_polarity": sentiment_dict.get("polarity"),
        "sentiment_subjectivity": sentiment_dict.get("subjectivity"),
    })

# Expand the sentiment strings into their own columns and attach them to the news table
sentiment_columns = news_data['sentiment'].apply(parse_sentiment)
news_data = pd.concat([news_data, sentiment_columns], axis=1)

# Left-join news onto prices so every price row is kept, matched on the hourly timestamp
merged_data = price_data.merge(
    news_data,
    how='left',
    left_on='time',
    right_on='rounded_time',
)

# Write the merged table to disk
merged_data.to_csv("merged_price_news_data.csv", index=False)
print("Merged data saved to 'merged_price_news_data.csv'")

Merged data saved to 'merged_price_news_data.csv'


# Handle Missing Values