# Statistics of the dataset

In [13]:
import pandas as pd
import numpy as np

In [14]:
df = pd.read_csv('crypto_timeseries.csv')

In [15]:
df.describe()

Unnamed: 0,Date,Price,Open,High,Low,Vol.,Change %
count,156,156.0,156.0,156.0,156.0,156,156
unique,156,156.0,156.0,156.0,156.0,154,137
top,12/05/2025,89248.5,92082.8,92676.4,88987.5,43.05K,-0.47%
freq,1,1.0,1.0,1.0,1.0,2,3


In [16]:
print("Dataset Statistics:")
print(f"Number of records: {len(df)}")
print(f"Columns: {df.columns.tolist()}")
print(f"Missing values:\n{df.isnull().sum()}")

Dataset Statistics:
Number of records: 156
Columns: ['Date', 'Price', 'Open', 'High', 'Low', 'Vol.', 'Change %']
Missing values:
Date        0
Price       0
Open        0
High        0
Low         0
Vol.        0
Change %    0
dtype: int64


In [17]:
print(df.head())

         Date     Price      Open      High       Low     Vol. Change %
0  12/05/2025  89,248.5  92,082.8  92,676.4  88,987.5   55.81K   -3.08%
1  12/04/2025  92,082.8  93,437.2  94,043.9  90,933.2   60.28K   -1.45%
2  12/03/2025  93,437.2  91,286.8  94,077.0  91,020.8   79.02K    2.35%
3  12/02/2025  91,287.6  86,315.2  92,285.4  86,189.8   87.36K    5.77%
4  12/01/2025  86,309.1  90,372.2  90,372.2  83,858.1  105.96K   -4.50%


In [22]:
print(df.dtypes)

Date        datetime64[ns]
Price               object
Open                object
High                object
Low                 object
Vol.                object
Change %            object
dtype: object


##XGBOOST

In [28]:
#Features creation for XGBOOST model

import pandas as pd
import numpy as np

def create_features_for_xgboost(df):
    # --- Step 0: Data Cleaning ---
    # Convert 'Date' to datetime objects and sort chronologically
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
        df = df.set_index('Date')
    df.index = pd.to_datetime(df.index)
    df = df.sort_index()
    
    # Clean 'Vol.' (e.g., "10K" -> 10000) and 'Change %' if they are strings
    if df['Vol.'].dtype == object:
        df['Vol.'] = df['Vol.'].replace({'K': '*1e3', 'M': '*1e6', 'B': '*1e9'}, regex=True).map(pd.eval).astype(float)
    if df['Change %'].dtype == object:
        df['Change %'] = df['Change %'].str.replace('%', '').astype(float)

    # --- Step 1: Feature Engineering ---
    
    # 1. Log Returns
    # We use log returns because they are time-additive and more stable than raw prices
    df['log_ret'] = np.log(df['Price'] / df['Price'].shift(1))
    
    # 2. Lag Features (Memory)
    # What happened yesterday? (Lag 1)
    df['ret_lag1'] = df['log_ret'].shift(1)
    # What happened one week ago? (Lag 7 - captures weekly cycles)
    df['ret_lag7'] = df['log_ret'].shift(7)
    # Volume from yesterday (Volume often precedes price movement)
    df['vol_lag1'] = df['Vol.'].shift(1)
    
    # 3. Rolling Statistics (Trend & Volatility)
    # 30-day Moving Average (General monthly trend)
    df['ma_30'] = df['Price'].rolling(window=30).mean()
    
    # 30-day Standard Deviation (Is the market calm or chaotic?)
    df['std_30'] = df['log_ret'].rolling(window=30).std()
    
    # 4. Technical Indicators / Market Psychology
    # Distance from the Moving Average:
    # If positive/high -> Price is far above average (Potential bubble/overbought)
    # If negative/low -> Price is far below average (Potential oversold)
    df['dist_ma30'] = (df['Price'] / df['ma_30']) - 1
    
    # Daily Range: Normalized difference between High and Low
    # Shows intraday panic or euphoria
    df['daily_range'] = (df['High'] - df['Low']) / df['Price']

    # 5. Time Features
    # Crypto markets behave differently on weekends (Banks are closed)
    # 0 = Monday, 6 = Sunday
    df['day_of_week'] = df.index.dayofweek
    
    # --- Step 2: The Target ---
    # We want to predict the Price 7 days into the future.
    # On calcule la variation entre le prix dans 7 jours et le prix d'aujourd'hui
    df['target'] = np.log(df['Price'].shift(-7) / df['Price'])

    # Remove empty rows (NaNs) created by shifting and rolling
    df = df.dropna()
    
    return df

In [None]:
#Convert to float
df['Price'] = df['Price'].astype(str).str.replace(',', '').str.replace('$', '')
df['Price'] = df['Price'].astype(float)

df['High'] = df['High'].astype(str).str.replace(',', '').str.replace('$', '')
df['High'] = df['High'].astype(float)

df['Low'] = df['Low'].astype(str).str.replace(',', '').str.replace('$', '')
df['Low'] = df['Low'].astype(float)

df['Open'] = df['Open'].astype(str).str.replace(',', '').str.replace('$', '')
df['Open'] = df['Open'].astype(float)

df = create_features_for_xgboost(df)

In [30]:
df.head()

Unnamed: 0_level_0,Price,Open,High,Low,Vol.,Change %,log_ret,ret_lag1,ret_lag7,vol_lag1,ma_30,std_30,dist_ma30,daily_range,day_of_week,target_7d,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2025-09-01,109243.3,108247.3,109898.5,107274.7,53130.0,0.94,0.009348,-0.005449,-0.030099,33800.0,114767.146667,0.018812,-0.048131,0.024018,0,112068.4,0.025532
2025-09-02,111218.3,109241.6,111714.6,108425.3,60020.0,1.81,0.017917,0.009348,0.014875,53130.0,114667.32,0.018914,-0.030078,0.029575,1,111516.5,0.002678
2025-09-03,111716.5,111227.9,112568.1,110547.9,43050.0,0.45,0.004469,0.017917,-0.0046,60020.0,114556.026667,0.018878,-0.024787,0.018083,2,113963.5,0.019914
2025-09-04,110712.7,111716.5,112189.1,109344.8,43300.0,-0.9,-0.009026,0.004469,0.011735,43050.0,114441.813333,0.018892,-0.032585,0.025691,3,115490.2,0.042247
2025-09-05,110652.2,110727.9,113315.6,110234.6,63560.0,-0.05,-0.000547,-0.009026,-0.038147,43300.0,114296.816667,0.018823,-0.031887,0.027844,4,116042.8,0.047567


In [26]:
features_to_drop = ['target', 'Open', 'High', 'Low', 'Change %'] # On garde Price pour référence si besoin, sinon on l'enlève
X = df.drop(columns=features_to_drop)

# y = Ce qu'on veut prédire
y = df['target']

KeyError: "['target'] not found in axis"