In [None]:
import pandas as pd

# Load the dataset
file_path = "old_datasets\Bitcoin_01_01_2024-31_12_2024_historical_data_coinmarketcap.csv"
df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
df.head()


Unnamed: 0,timeOpen;timeClose;timeHigh;timeLow;name;open;high;low;close;volume;marketCap;timestamp
0,"2024-12-31T00:00:00.000Z;""2024-12-31T23:59:59...."
1,"2024-12-30T00:00:00.000Z;""2024-12-30T23:59:59...."
2,"2024-12-29T00:00:00.000Z;""2024-12-29T23:59:59...."
3,"2024-12-28T00:00:00.000Z;""2024-12-28T23:59:59...."
4,"2024-12-27T00:00:00.000Z;""2024-12-27T23:59:59...."


In [3]:
# Reload the dataset with the correct delimiter
df = pd.read_csv(file_path, delimiter=";")

# Convert timeOpen to datetime format for better analysis
df["timeOpen"] = pd.to_datetime(df["timeOpen"])

# Sort data by timeOpen in ascending order
df = df.sort_values(by="timeOpen").reset_index(drop=True)

# Check for missing values
missing_values = df.isnull().sum()

# Remove duplicates if any
df = df.drop_duplicates()

# Display dataset info after cleaning
df_info = df.info()

# # Display the first few rows after cleaning
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 365 entries, 0 to 364
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   timeOpen   365 non-null    datetime64[ns, UTC]
 1   timeClose  365 non-null    object             
 2   timeHigh   365 non-null    object             
 3   timeLow    365 non-null    object             
 4   name       365 non-null    int64              
 5   open       365 non-null    float64            
 6   high       365 non-null    float64            
 7   low        365 non-null    float64            
 8   close      365 non-null    float64            
 9   volume     365 non-null    float64            
 10  marketCap  365 non-null    float64            
 11  timestamp  365 non-null    object             
dtypes: datetime64[ns, UTC](1), float64(6), int64(1), object(4)
memory usage: 34.3+ KB


Unnamed: 0,timeOpen,timeClose,timeHigh,timeLow,name,open,high,low,close,volume,marketCap,timestamp
0,2024-01-02 00:00:00+00:00,2024-01-02T23:59:59.999Z,2024-01-02T09:07:00.000Z,2024-01-02T00:01:00.000Z,2781,44187.139039,45899.706355,44176.950443,44957.970054,39335270000.0,880631600000.0,2024-01-02T23:59:59.999Z
1,2024-01-03 00:00:00+00:00,2024-01-03T23:59:59.999Z,2024-01-03T09:40:00.000Z,2024-01-03T12:10:00.000Z,2781,44961.603187,45503.241795,40813.534868,42848.175641,46342320000.0,839345900000.0,2024-01-03T23:59:59.999Z
2,2024-01-04 00:00:00+00:00,2024-01-04T23:59:59.999Z,2024-01-04T22:22:00.000Z,2024-01-04T01:26:00.000Z,2781,42855.814761,44770.023414,42675.174007,44179.921701,30448090000.0,865482500000.0,2024-01-04T23:59:59.999Z
3,2024-01-05 00:00:00+00:00,2024-01-05T23:59:59.999Z,2024-01-05T23:07:00.000Z,2024-01-05T01:49:00.000Z,2781,44192.978699,44353.285859,42784.718266,44162.69163,32336030000.0,865182700000.0,2024-01-05T23:59:59.999Z
4,2024-01-06 00:00:00+00:00,2024-01-06T23:59:59.999Z,2024-01-06T00:10:00.000Z,2024-01-06T07:07:00.000Z,2781,44178.954607,44227.631307,43475.156431,43989.194093,16092500000.0,861822000000.0,2024-01-06T23:59:59.999Z


In [None]:
# Load the second dataset (2023 data)
file_path_2023 = "old_datasets\Bitcoin_01_01_2023-31_12_2023_historical_data_coinmarketcap.csv"
df_2023 = pd.read_csv(file_path_2023, delimiter=";")

# Convert timeOpen to datetime format for both datasets
df_2023["timeOpen"] = pd.to_datetime(df_2023["timeOpen"])
df["timeOpen"] = pd.to_datetime(df["timeOpen"])

# Combine the two datasets
df_combined = pd.concat([df_2023, df], ignore_index=True)

# Remove duplicates if any
df_combined = df_combined.drop_duplicates()

# Sort data by timeOpen in ascending order
df_combined = df_combined.sort_values(by="timeOpen").reset_index(drop=True)

# Display the cleaned and combined dataset
df_combined.head()

#convert the combined data to a csv file
df_combined.to_csv("old_datasets\Combined_Bitcoin_01_01_2023-31_12_2024_historical_data_coinmarketcap.csv", index=False)

print("The combined dataset has been saved as Combined_Bitcoin_01_01_2023-31_12_2024_historical_data_coinmarketcap.csv")

The combined dataset has been saved as Combined_Bitcoin_01_01_2023-31_12_2024_historical_data_coinmarketcap.csv


In [7]:
# Feature Engineering: Adding Moving Averages & Volatility

# Ensure numeric columns are correctly formatted
numeric_columns = ["open", "high", "low", "close", "volume", "marketCap"]
df_combined[numeric_columns] = df_combined[numeric_columns].apply(pd.to_numeric, errors='coerce')

# Create Moving Averages (7-day, 30-day)
df_combined["SMA_7"] = df_combined["close"].rolling(window=7).mean()
df_combined["SMA_30"] = df_combined["close"].rolling(window=30).mean()

# Create Volatility Indicator (Rolling Standard Deviation)
df_combined["Volatility"] = df_combined["close"].rolling(window=7).std()

# Fill missing values generated by rolling calculations
df_combined.fillna(method='bfill', inplace=True)

# Save the enhanced dataset
enhanced_file_path = "Enhanced_Bitcoin_2023_2024.csv"
df_combined.to_csv(enhanced_file_path, index=False)

# Display the enhanced dataset to the user
df_combined.head()


  df_combined.fillna(method='bfill', inplace=True)


Unnamed: 0,timeOpen,timeClose,timeHigh,timeLow,name,open,high,low,close,volume,marketCap,timestamp,SMA_7,SMA_30,Volatility
0,2023-01-02 00:00:00+00:00,2023-01-02T23:59:59.999Z,2023-01-02T08:19:00.000Z,2023-01-02T01:19:00.000Z,2781,16625.510394,16759.344023,16572.22885,16688.471357,12097780000.0,321262500000.0,2023-01-02T23:59:59.999Z,16866.642106,20371.872144,148.908396
1,2023-01-03 00:00:00+00:00,2023-01-03T23:59:59.999Z,2023-01-03T06:32:00.000Z,2023-01-03T17:48:00.000Z,2781,16688.847264,16760.447466,16622.370328,16679.85708,13903080000.0,321112000000.0,2023-01-03T23:59:59.999Z,16866.642106,20371.872144,148.908396
2,2023-01-04 00:00:00+00:00,2023-01-04T23:59:59.999Z,2023-01-04T18:55:00.000Z,2023-01-04T00:50:00.000Z,2781,16680.204971,16964.585185,16667.763725,16863.238258,18421740000.0,324658200000.0,2023-01-04T23:59:59.999Z,16866.642106,20371.872144,148.908396
3,2023-01-05 00:00:00+00:00,2023-01-05T23:59:59.999Z,2023-01-05T00:21:00.000Z,2023-01-05T14:04:00.000Z,2781,16863.472057,16884.022116,16790.283041,16836.736645,13692760000.0,324163900000.0,2023-01-05T23:59:59.999Z,16866.642106,20371.872144,148.908396
4,2023-01-06 00:00:00+00:00,2023-01-06T23:59:59.999Z,2023-01-06T20:19:00.000Z,2023-01-06T13:14:00.000Z,2781,16836.473439,16991.994121,16716.422271,16951.969517,14413660000.0,326401300000.0,2023-01-06T23:59:59.999Z,16866.642106,20371.872144,148.908396
