In [1]:
import pandas as pd
import requests
import datetime
import time

In [2]:
class BinanceIngestionData:
    """Download Binance spot klines (candlesticks) and shape them into a DataFrame.

    Parameters
    ----------
    symbol : str
        Trading pair sent as the ``symbol`` query parameter, e.g. ``"BTCUSDT"``.
    interval : str
        Kline interval sent as the ``interval`` query parameter, e.g. ``"1d"``.
    start_date, end_date : str
        Range bounds formatted ``YYYY-MM-DD``. Each is converted to a
        millisecond timestamp at midnight UTC.
    max_retries : int
        Attempts per HTTP request before the last error is re-raised
        (values below 1 are treated as 1).
    retry_delay : int or float
        Seconds to sleep between attempts.
    """

    # Page size sent as ``limit``; fetch_data() pages through longer ranges.
    MAX_LIMIT = 1000
    # Seconds before a stalled HTTP request is abandoned instead of hanging.
    REQUEST_TIMEOUT = 10

    def __init__(self, symbol, interval, start_date, end_date, max_retries=5, retry_delay=2):
        self.symbol = symbol
        self.interval = interval
        self.start_date = start_date
        self.end_date = end_date
        self.max_retries = max_retries
        self.retry_delay = retry_delay
        self.base_url = "https://api.binance.com/api/v3/klines"

    @staticmethod
    def _to_millis(date_str):
        """Convert a ``YYYY-MM-DD`` string to epoch milliseconds at midnight UTC."""
        parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        # A naive datetime's .timestamp() uses the machine's local timezone;
        # pin it to UTC so the same dates give the same request everywhere.
        return int(parsed.replace(tzinfo=datetime.timezone.utc).timestamp() * 1000)

    def _request_page(self, params):
        """GET one page of klines, retrying failed requests.

        Returns the decoded JSON payload. Re-raises the last request/decoding
        error once all attempts are used up.
        """
        attempts = max(1, self.max_retries)
        for attempt in range(1, attempts + 1):
            try:
                response = requests.get(self.base_url, params=params, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()  # Raise an HTTPError for bad responses
                return response.json()
            except (requests.RequestException, ValueError) as e:
                # ValueError covers JSON decoding failures on older requests versions.
                print(f"Attempt {attempt} failed: {e}")
                if attempt == attempts:
                    print("Max retries reached. Exiting.")
                    raise
                time.sleep(self.retry_delay)  # Wait before retrying

    def fetch_data(self):
        """Fetch every kline between ``start_date`` and ``end_date``.

        Requests are issued page by page (``MAX_LIMIT`` rows each) so ranges
        longer than one page are not silently truncated.

        Returns
        -------
        list
            Raw kline rows exactly as returned by the API, in request order.

        Raises
        ------
        ValueError
            If the API returns no rows for the requested range.
        requests.RequestException
            If a request still fails after all retries.
        """
        end_ms = self._to_millis(self.end_date)
        cursor = self._to_millis(self.start_date)
        rows = []

        while cursor <= end_ms:
            page = self._request_page({
                "symbol": self.symbol,
                "interval": self.interval,
                "startTime": cursor,
                "endTime": end_ms,
                "limit": self.MAX_LIMIT,
            })
            if not page:
                break
            rows.extend(page)
            if len(page) < self.MAX_LIMIT:
                # A short page means the range is exhausted.
                break
            # Resume just after the last candle's open time (first field of a row).
            next_cursor = page[-1][0] + 1
            if next_cursor <= cursor:
                # Defensive: never loop forever if the API does not advance.
                break
            cursor = next_cursor

        if not rows:
            # An empty result is not transient, so it is reported without retrying.
            raise ValueError("No data returned from Binance API. Check your parameters.")

        return rows

    def process_data(self, data):
        """Build a typed DataFrame indexed by open time from raw kline rows.

        ``data`` is a list of 12-field rows. "Open Time" is parsed from epoch
        milliseconds and becomes the index; price/volume fields are cast to
        float and "Number of Trades" to int. "Close Time" and "Ignore" are
        left as received.
        """
        column_names = [
            "Open Time", "Open", "High", "Low", "Close", "Volume", "Close Time",
            "Quote Asset Volume", "Number of Trades", "Taker Buy Base Asset Volume",
            "Taker Buy Quote Asset Volume", "Ignore"
        ]
        df = pd.DataFrame(data, columns=column_names)

        # Convert to proper data types
        df["Open Time"] = pd.to_datetime(df["Open Time"], unit="ms")
        df = df.set_index("Open Time")
        df = df.astype({
            "Open": "float",
            "High": "float",
            "Low": "float",
            "Close": "float",
            "Volume": "float",
            "Quote Asset Volume": "float",
            "Number of Trades": "int",
            "Taker Buy Base Asset Volume": "float",
            "Taker Buy Quote Asset Volume": "float"
        })

        return df

    def save_to_csv(self, df, file_path):
        """Write ``df`` (including its index) to ``file_path`` as CSV and report it."""
        df.to_csv(file_path)
        print(f"Data saved to {file_path}")


if __name__ == "__main__":
    symbol = "BTCUSDT"
    interval = "1d"
    # Read the clock once so both bounds come from the same instant, even if
    # the cell happens to run across midnight.
    now = datetime.datetime.now()
    end_date = now.strftime("%Y-%m-%d")
    start_date = (now - datetime.timedelta(days=730)).strftime("%Y-%m-%d")

    binance_data = BinanceIngestionData(symbol, interval, start_date, end_date)

    try:
        raw_data = binance_data.fetch_data()
        processed_data = binance_data.process_data(raw_data)
        binance_data.save_to_csv(processed_data, "BTC_2Y_Binance.csv")
    except Exception as e:
        print(f"Data ingestion failed: {e}")
        # Re-raise so later cells do not quietly run against a stale or
        # missing CSV after a failed download.
        raise


Data saved to BTC_2Y_Binance.csv


In [17]:
# Read back the CSV written by the ingestion cell. The relative path is the same
# one passed to save_to_csv, so the notebook does not depend on one machine's
# absolute directory layout.
df = pd.read_csv("BTC_2Y_Binance.csv")
df.head()

Unnamed: 0,Open Time,Open,High,Low,Close,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,Ignore
0,2023-01-22,22783.35,23078.71,22292.37,22707.88,253577.75286,1674431999999,5775552000.0,6603827,125842.91663,2866410000.0,0
1,2023-01-23,22706.02,23180.0,22500.0,22916.45,293588.37938,1674518399999,6704191000.0,7258655,145854.70936,3330928000.0,0
2,2023-01-24,22917.81,23162.2,22462.93,22632.89,293158.78254,1674604799999,6721433000.0,7395584,144239.84356,3307407000.0,0
3,2023-01-25,22631.94,23816.73,22300.0,23060.94,346042.83223,1674691199999,7875590000.0,8060446,172647.90115,3929822000.0,0
4,2023-01-26,23060.42,23282.47,22850.01,23009.65,288924.43581,1674777599999,6664592000.0,7181086,143185.08341,3303064000.0,0


In [18]:
# Show the column labels of the frame loaded from the CSV.
df.columns

Index(['Open Time', 'Open', 'High', 'Low', 'Close', 'Volume', 'Close Time',
       'Quote Asset Volume', 'Number of Trades', 'Taker Buy Base Asset Volume',
       'Taker Buy Quote Asset Volume', 'Ignore'],
      dtype='object')

In [30]:
import pandas as pd
import numpy as np

class FeatureEngineering:
    """Derive simple price/volume columns from an OHLCV DataFrame."""

    def __init__(self, df):
        # Source frame; add_features() replaces this attribute with the result.
        self.df = df

    def add_features(self):
        """Rename the date/target columns and append four derived columns.

        "Open Time" becomes ``ds`` and "Close" becomes ``y``; an "Ignore"
        column is dropped when present. Appended, in order: ``High_Low_Diff``,
        ``Open_Close_Diff``, ``Average_Price`` (mean of high, low and close)
        and ``Volume_Weighted_Price`` (quote volume divided by base volume).

        Returns the new frame, which is also stored on ``self.df``.
        """
        base = (
            self.df
            .rename(columns={"Open Time": "ds", "Close": "y"})
            .drop(columns=["Ignore"], errors='ignore')
        )
        high, low, close = base['High'], base['Low'], base['y']

        self.df = base.assign(
            High_Low_Diff=high - low,
            Open_Close_Diff=base['Open'] - close,
            Average_Price=(high + low + close) / 3,
            Volume_Weighted_Price=base['Quote Asset Volume'] / base['Volume'],
        )
        return self.df



In [31]:
# Build the derived price/volume columns from the loaded frame and preview
# the first ten rows of the result.
feature_engineering = FeatureEngineering(df)
engineered_data = feature_engineering.add_features()
engineered_data.head(10)


Unnamed: 0,ds,Open,High,Low,y,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,High_Low_Diff,Open_Close_Diff,Average_Price,Volume_Weighted_Price
0,2023-01-22,22783.35,23078.71,22292.37,22707.88,253577.75286,1674431999999,5775552000.0,6603827,125842.91663,2866410000.0,786.34,75.47,22692.986667,22776.257997
1,2023-01-23,22706.02,23180.0,22500.0,22916.45,293588.37938,1674518399999,6704191000.0,7258655,145854.70936,3330928000.0,680.0,-210.43,22865.483333,22835.342837
2,2023-01-24,22917.81,23162.2,22462.93,22632.89,293158.78254,1674604799999,6721433000.0,7395584,144239.84356,3307407000.0,699.27,284.92,22752.673333,22927.619307
3,2023-01-25,22631.94,23816.73,22300.0,23060.94,346042.83223,1674691199999,7875590000.0,8060446,172647.90115,3929822000.0,1516.73,-429.0,23059.223333,22759.004451
4,2023-01-26,23060.42,23282.47,22850.01,23009.65,288924.43581,1674777599999,6664592000.0,7181086,143185.08341,3303064000.0,432.46,50.77,23047.376667,23066.904845
5,2023-01-27,23009.65,23500.0,22534.88,23074.16,280833.86315,1674863999999,6461708000.0,7079096,139699.27638,3214624000.0,965.12,-64.51,23036.346667,23009.004977
6,2023-01-28,23074.16,23189.0,22878.46,23022.6,148115.71085,1674950399999,3408985000.0,4181816,73861.83566,1700058000.0,310.54,51.56,23030.02,23015.690852
7,2023-01-29,23021.4,23960.54,22967.76,23742.3,295688.79204,1675036799999,6941923000.0,7030837,149507.5061,3510027000.0,992.78,-720.9,23556.866667,23477.125752
8,2023-01-30,23743.37,23800.51,22500.0,22826.15,302405.90121,1675123199999,7020727000.0,7790224,149096.53066,3461493000.0,1300.51,917.22,23042.22,23216.23755
9,2023-01-31,22827.38,23320.0,22714.77,23125.13,264649.34909,1675209599999,6083859000.0,6798411,133028.12717,3058369000.0,605.23,-297.75,23053.3,22988.376678


In [32]:
# Preview the last ten rows of the engineered frame.
engineered_data.tail(10)

Unnamed: 0,ds,Open,High,Low,y,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,High_Low_Diff,Open_Close_Diff,Average_Price,Volume_Weighted_Price
720,2025-01-11,94726.1,95050.94,93831.73,94599.99,7047.9043,1736639999999,665686000.0,1496191,3412.37671,322347000.0,1219.21,126.11,94494.22,94451.622298
721,2025-01-12,94599.99,95450.1,93711.19,94545.06,8606.86622,1736726399999,813278800.0,1658863,4028.33031,380691800.0,1738.91,54.93,94568.783333,94491.86369
722,2025-01-13,94545.07,95940.0,89256.69,94536.1,42619.56423,1736812799999,3930666000.0,5740963,19473.88042,1797888000.0,6683.31,8.97,93244.263333,92226.802535
723,2025-01-14,94536.11,97371.0,94346.22,96560.86,27846.61753,1736899199999,2676048000.0,4532923,14248.69616,1369464000.0,3024.78,-2024.75,96092.693333,96099.587947
724,2025-01-15,96560.85,100681.94,96500.0,100497.35,30509.99179,1736985599999,3006242000.0,3487945,15494.17873,1527728000.0,4181.94,-3936.5,99226.43,98533.044636
725,2025-01-16,100497.35,100866.66,97335.13,99987.3,27832.85317,1737071999999,2765162000.0,3762755,13513.53707,1342463000.0,3531.53,510.05,99396.363333,99348.853089
726,2025-01-17,99987.3,105865.22,99950.77,104077.48,39171.85292,1737158399999,4040994000.0,3886907,20673.213,2131223000.0,5914.45,-4090.18,103297.823333,103160.654507
727,2025-01-18,104077.47,104988.88,102277.55,104556.23,24307.82998,1737244799999,2521522000.0,3403162,11607.51274,1204171000.0,2711.33,-478.76,103940.886667,103732.917396
728,2025-01-19,104556.23,106422.43,99651.6,101331.57,43397.28298,1737331199999,4513424000.0,5271920,21079.51082,2193271000.0,6770.83,3224.66,102468.533333,104002.455027
729,2025-01-20,101331.57,109588.0,99550.0,102260.01,89529.231732,1737417599999,9398787000.0,11597937,44770.529492,4704453000.0,10038.0,-928.44,103799.336667,104980.092898


In [33]:
from prophet import Prophet

def Prophet_Features(df):
    """Fit Prophet on ``df`` and return its predictions for the same rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``ds`` date column and a ``y`` target column. The
        frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``Prophet.predict`` for the input dates.
    """
    # reset_index() (not inplace) returns a new frame, so the caller's
    # DataFrame does not gain a stray 'index' column on every call.
    frame = df.reset_index()
    # Keep the parsed dates; the conversion result was previously discarded.
    frame["ds"] = pd.to_datetime(frame["ds"])

    # NOTE(review): daily_seasonality=True models a within-day cycle. With one
    # row per day it can only contribute a constant offset - confirm it is wanted.
    prophet_model = Prophet(
        growth='linear',
        seasonality_mode='additive',
        interval_width=0.95,
        daily_seasonality=True,
        weekly_seasonality=True,
        yearly_seasonality=False
    )
    prophet_model.fit(frame)
    # Predict in-sample: same rows as the fit, with the target removed.
    prophet_results = prophet_model.predict(frame.drop('y', axis=1))

    return prophet_results

In [34]:
# Fit Prophet on the engineered frame and keep its predictions for the merge step.
prophet_data = Prophet_Features(engineered_data)

INFO:cmdstanpy:start chain 1


INFO:cmdstanpy:finish chain 1


In [35]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

def Triple_ETS(df, seasonal_periods=24 * 7):
    """Add Holt-Winters fitted values for ``y`` as two new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``y`` column. The frame is not modified.
    seasonal_periods : int, optional
        Number of observations per seasonal cycle, passed to both models.
        Defaults to ``24 * 7`` (the previously hard-coded value).
        NOTE(review): 168 looks like hours-per-week; for one row per day a
        weekly cycle would be 7 - confirm the intended period.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Triple_Multiplicative_ETS`` and
        ``Triple_Additive_ETS`` appended and any ``index`` column removed.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    result = df.copy()
    target = result['y']

    result['Triple_Multiplicative_ETS'] = ExponentialSmoothing(
        target, trend='mul', seasonal='mul', seasonal_periods=seasonal_periods
    ).fit().fittedvalues

    result['Triple_Additive_ETS'] = ExponentialSmoothing(
        target, trend='add', seasonal='add', seasonal_periods=seasonal_periods
    ).fit().fittedvalues

    # Drop a leftover 'index' column (e.g. from an earlier reset_index) if present.
    return result.drop(columns=['index'], errors='ignore')


In [36]:
# Add the multiplicative and additive Holt-Winters fitted-value columns.
data = Triple_ETS(engineered_data)


Optimization failed to converge. Check mle_retvals.


Optimization failed to converge. Check mle_retvals.



In [37]:
# Preview the first rows, including the two ETS columns.
data.head()

Unnamed: 0,ds,Open,High,Low,y,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,Taker Buy Quote Asset Volume,High_Low_Diff,Open_Close_Diff,Average_Price,Volume_Weighted_Price,Triple_Multiplicative_ETS,Triple_Additive_ETS
0,2023-01-22,22783.35,23078.71,22292.37,22707.88,253577.75286,1674431999999,5775552000.0,6603827,125842.91663,2866410000.0,786.34,75.47,22692.986667,22776.257997,28109.155802,29453.518025
1,2023-01-23,22706.02,23180.0,22500.0,22916.45,293588.37938,1674518399999,6704191000.0,7258655,145854.70936,3330928000.0,680.0,-210.43,22865.483333,22835.342837,23085.344679,23581.252439
2,2023-01-24,22917.81,23162.2,22462.93,22632.89,293158.78254,1674604799999,6721433000.0,7395584,144239.84356,3307407000.0,699.27,284.92,22752.673333,22927.619307,22559.067201,22350.628888
3,2023-01-25,22631.94,23816.73,22300.0,23060.94,346042.83223,1674691199999,7875590000.0,8060446,172647.90115,3929822000.0,1516.73,-429.0,23059.223333,22759.004451,22837.224817,23616.119619
4,2023-01-26,23060.42,23282.47,22850.01,23009.65,288924.43581,1674777599999,6664592000.0,7181086,143185.08341,3303064000.0,432.46,50.77,23047.376667,23066.904845,23026.628521,23012.917588


In [38]:
def Merging_DFs(df1, df2):
    """Left-join ``df2`` onto ``df1`` using the string form of their ``ds`` columns.

    Both ``ds`` columns are compared as strings so frames whose dates are
    stored with different dtypes can still be matched. The inputs are not
    modified; the returned frame's ``ds`` column holds the string values.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that each contain a ``ds`` column.

    Returns
    -------
    pandas.DataFrame
        All rows of ``df1`` with the matching columns of ``df2`` attached.
    """
    # assign() builds new frames, leaving the callers' 'ds' dtypes untouched.
    left = df1.assign(ds=df1['ds'].astype(str))
    right = df2.assign(ds=df2['ds'].astype(str))
    featured_df = pd.merge(left, right, how='left', on='ds')
    return featured_df

In [39]:
# Left-join the Prophet output onto the feature frame by 'ds' and display the result.
final_df = Merging_DFs(data, prophet_data)
final_df

Unnamed: 0,ds,Open,High,Low,y,Volume,Close Time,Quote Asset Volume,Number of Trades,Taker Buy Base Asset Volume,...,daily,daily_lower,daily_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
0,2023-01-22,22783.35,23078.71,22292.37,22707.88,253577.752860,1674431999999,5.775552e+09,6603827,125842.916630,...,456.999148,456.999148,456.999148,-76.081930,-76.081930,-76.081930,0.0,0.0,0.0,23093.325311
1,2023-01-23,22706.02,23180.00,22500.00,22916.45,293588.379380,1674518399999,6.704191e+09,7258655,145854.709360,...,456.999148,456.999148,456.999148,112.910588,112.910588,112.910588,0.0,0.0,0.0,23328.419327
2,2023-01-24,22917.81,23162.20,22462.93,22632.89,293158.782540,1674604799999,6.721433e+09,7395584,144239.843560,...,456.999148,456.999148,456.999148,-44.578598,-44.578598,-44.578598,0.0,0.0,0.0,23217.031638
3,2023-01-25,22631.94,23816.73,22300.00,23060.94,346042.832230,1674691199999,7.875590e+09,8060446,172647.901150,...,456.999148,456.999148,456.999148,144.425392,144.425392,144.425392,0.0,0.0,0.0,23452.137126
4,2023-01-26,23060.42,23282.47,22850.01,23009.65,288924.435810,1674777599999,6.664592e+09,7181086,143185.083410,...,456.999148,456.999148,456.999148,-78.566095,-78.566095,-78.566095,0.0,0.0,0.0,23275.247136
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
725,2025-01-16,100497.35,100866.66,97335.13,99987.30,27832.853170,1737071999999,2.765162e+09,3762755,13513.537070,...,456.999148,456.999148,456.999148,-78.566095,-78.566095,-78.566095,0.0,0.0,0.0,103113.173504
726,2025-01-17,99987.30,105865.22,99950.77,104077.48,39171.852920,1737158399999,4.040994e+09,3886907,20673.213000,...,456.999148,456.999148,456.999148,24.418437,24.418437,24.418437,0.0,0.0,0.0,103539.521337
727,2025-01-18,104077.47,104988.88,102277.55,104556.23,24307.829980,1737244799999,2.521522e+09,3403162,11607.512740,...,456.999148,456.999148,456.999148,-82.527795,-82.527795,-82.527795,0.0,0.0,0.0,103755.938406
728,2025-01-19,104556.23,106422.43,99651.60,101331.57,43397.282980,1737331199999,4.513424e+09,5271920,21079.510820,...,456.999148,456.999148,456.999148,-76.081930,-76.081930,-76.081930,0.0,0.0,0.0,104085.747572


In [47]:
import xgboost as xgb
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from datetime import timedelta

class XGBoostForecasting:
    """XGBoost regressor on calendar features with a level-shifted forecast.

    The model only sees ``day``, ``month`` and ``weekday`` derived from the
    date column. Typical call order: ``preprocess_data()`` ->
    ``train_model()`` -> ``forecast()`` / ``evaluate_model()`` /
    ``plot_forecast()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame holding the date and target columns.
    date_column : str
        Name of the date column; renamed to ``ds`` by ``preprocess_data``.
    target_column : str
        Name of the target column; renamed to ``y`` by ``preprocess_data``.
    """

    # Calendar features used for both training and forecasting, in this order.
    FEATURE_COLUMNS = ['day', 'month', 'weekday']

    def __init__(self, data, date_column, target_column):
        self.data = data
        self.date_column = date_column
        self.target_column = target_column
        # Populated by train_model(); defined here so the attribute always exists.
        self.model = None

    def preprocess_data(self):
        """Rename the date/target columns to ``ds``/``y`` and add calendar features.

        ``rename`` returns a new frame, so the frame passed to the
        constructor is not modified.
        """
        prepared = self.data.rename(columns={self.date_column: 'ds', self.target_column: 'y'})
        prepared['ds'] = pd.to_datetime(prepared['ds'])
        prepared['day'] = prepared['ds'].dt.day
        prepared['month'] = prepared['ds'].dt.month
        prepared['weekday'] = prepared['ds'].dt.weekday
        self.data = prepared

    def prepare_features(self, training_period=730):
        """Select the rows from the last ``training_period`` days.

        Returns
        -------
        tuple
            ``(features, target, training_data)``: the calendar feature
            columns, the ``y`` column, and the full selected rows.
        """
        last_date = self.data['ds'].max()
        start_date = last_date - timedelta(days=training_period)
        training_data = self.data[self.data['ds'] >= start_date]

        features = training_data[self.FEATURE_COLUMNS]
        target = training_data['y']
        return features, target, training_data

    def train_model(self, training_period=730):
        """Fit the XGBoost regressor on the selected window and store it on ``self.model``."""
        X_train, y_train, _ = self.prepare_features(training_period)

        self.model = xgb.XGBRegressor(
            objective='reg:squarederror',
            n_estimators=1000,
            learning_rate=0.01,
            max_depth=10,
            subsample=0.8,
            colsample_bytree=0.8
        )
        self.model.fit(X_train, y_train)

    def forecast(self, future_periods=180):
        """Predict ``future_periods`` daily values after the latest date.

        The raw predictions are shifted by a constant so the first forecast
        equals the target value observed on the latest date.

        Returns
        -------
        pandas.DataFrame
            Columns ``ds`` (forecast dates) and ``yhat`` (shifted predictions);
            empty when ``future_periods`` is less than 1.
        """
        if future_periods < 1:
            # Nothing to predict; avoid indexing into an empty prediction array.
            return pd.DataFrame({'ds': pd.to_datetime([]), 'yhat': np.array([], dtype=float)})

        # Locate the latest-dated row by position so the anchor date and the
        # anchor value come from the same row even if the data is not sorted.
        latest_pos = self.data['ds'].argmax()
        last_date = self.data['ds'].iloc[latest_pos]
        last_actual_value = self.data['y'].iloc[latest_pos]

        forecast_dates = [last_date + timedelta(days=i) for i in range(1, future_periods + 1)]

        future_features = pd.DataFrame({
            'day': [d.day for d in forecast_dates],
            'month': [d.month for d in forecast_dates],
            'weekday': [d.weekday() for d in forecast_dates]
        })
        # Cast to float64 so the level shift is not rounded to the model's output precision.
        forecast_values = np.asarray(self.model.predict(future_features), dtype=float)

        adjustment_factor = last_actual_value - forecast_values[0]
        forecast_values = forecast_values + adjustment_factor

        return pd.DataFrame({'ds': forecast_dates, 'yhat': forecast_values})

    def evaluate_model(self, training_period=730):
        """Compute MAE, MSE, RMSE and MAPE over the ``training_period`` window.

        The rows scored here are the same ones ``prepare_features`` returns
        for training, so with the same ``training_period`` these are
        in-sample errors rather than a held-out evaluation.
        """
        features, actual, _ = self.prepare_features(training_period)
        predicted = self.model.predict(features)

        mae = mean_absolute_error(actual, predicted)
        mse = mean_squared_error(actual, predicted)
        rmse = np.sqrt(mse)
        mape = mean_absolute_percentage_error(actual, predicted)

        return {'MAE': mae, 'MSE': mse, 'RMSE': rmse, 'MAPE': mape}

    def plot_forecast(self, forecast, training_period=730):
        """Plot actual values, in-window model predictions and the forecast with Plotly."""
        features, _, training_data = self.prepare_features(training_period)
        # assign() returns a new frame instead of writing into a slice of self.data.
        training_data = training_data.assign(y_pred=self.model.predict(features))

        fig = go.Figure()

        # Plot actual data
        fig.add_trace(go.Scatter(x=self.data['ds'], y=self.data['y'], mode='lines', name='Actual', line=dict(color='blue')))

        # Plot predicted values during the training period
        fig.add_trace(go.Scatter(x=training_data['ds'], y=training_data['y_pred'], mode='lines', name='Predicted (Train)', line=dict(color='orange', dash='dot')))

        # Plot forecast
        fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines', name='Forecast', line=dict(color='green')))

        # Layout and styling
        fig.update_layout(
            title='XGBoost Forecast with Training Predictions',
            xaxis_title='Date',
            yaxis_title='Value',
            showlegend=True,
            template='plotly_dark'
        )
        fig.show()


if __name__ == "__main__":
    # Uses `final_df` from the merge cell; its 'ds' and 'y' columns are the date and target.
    xgboost_forecasting = XGBoostForecasting(data=final_df, date_column='ds', target_column='y')
    # Order matters: the calendar features must exist before training, and the
    # model must be trained before forecasting, evaluating or plotting.
    xgboost_forecasting.preprocess_data()
    xgboost_forecasting.train_model(training_period=730)
    forecast = xgboost_forecasting.forecast(future_periods=180)
    # evaluate_model is given the same window as train_model, so the metrics
    # below are computed on the rows the model was fitted on.
    evaluation_metrics = xgboost_forecasting.evaluate_model(training_period=730)

    # Plot actuals, in-window predictions and the forecast
    xgboost_forecasting.plot_forecast(forecast)

    # Print evaluation metrics
    print("Evaluation Metrics:")
    print(f"MAE: {evaluation_metrics['MAE']}")
    print(f"MSE: {evaluation_metrics['MSE']}")
    print(f"RMSE: {evaluation_metrics['RMSE']}")
    print(f"MAPE: {evaluation_metrics['MAPE']}")


Evaluation Metrics:
MAE: 18202.110053403252
MSE: 365198714.5208942
RMSE: 19110.173063603957
MAPE: 0.43386657123939365
