In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

# Load your dataset
data = pd.read_csv('/content/combined_data.csv')

# Check for missing values and fill them if necessary
if data.isnull().sum().any():
    print("Warning: Dataset contains missing values. Filling missing values with default values.")
    data = data.fillna(0)

# Define target variable and features
target_column = 'price'  # Replace with your actual target column
features = data.drop(columns=[target_column])  # All columns except the target

# Identify object columns and handle them
object_columns = features.select_dtypes(include=['object']).columns
print(f"Object columns: {object_columns.tolist()}")

# Convert datetime columns
datetime_columns = ['tx_timestamp', 'created_date', 'event_timestamp']
for col in datetime_columns:
    if col in features.columns:
        features[col] = pd.to_datetime(features[col], errors='coerce').astype('int64') // 10**9  # Convert to Unix timestamp

# Convert other object columns to categorical
for col in object_columns:
    if col not in datetime_columns:
        features[col] = features[col].astype('category').cat.codes

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features,
    data[target_column],
    test_size=0.2,
    random_state=42
)

# Initialize and train the XGBoost model
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42
)

model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate performance metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display results
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")


Object columns: ['tx_timestamp', 'token', 'created_date', 'chain', 'token_type', 'asset_contract_type', 'asset_type', 'event_timestamp']
R² Score: 0.2306
MSE: 72229.5567
RMSE: 268.7556


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler

# Load your dataset
data = pd.read_csv('/content/combined_data.csv')

# Separate numeric and non-numeric columns
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
non_numeric_columns = data.select_dtypes(exclude=['float64', 'int64']).columns

# Fill missing values
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())
data[non_numeric_columns] = data[non_numeric_columns].fillna('unknown')

# Convert datetime columns to numerical
datetime_columns = ['tx_timestamp', 'created_date', 'event_timestamp']
for col in datetime_columns:
    if col in data.columns:
        data[col] = pd.to_datetime(data[col], errors='coerce').astype('int64') // 10**9

# Convert categorical object columns to numerical
object_columns = data.select_dtypes(include=['object', 'category']).columns
for col in object_columns:
    data[col] = data[col].astype('category').cat.codes

# Define target variable and features
target_column = 'price'  # Replace with your actual target column
features = data.drop(columns=[target_column])

# Standardize numeric features
scaler = StandardScaler()
numerical_columns = features.select_dtypes(include=['float64', 'int64']).columns
features[numerical_columns] = scaler.fit_transform(features[numerical_columns])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    features,
    data[target_column],
    test_size=0.2,
    random_state=42
)

# Initialize XGBoost model
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0]
}

# Perform GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Use the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

# Evaluate performance
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Display results
print(f"Best Parameters: {grid_search.best_params_}")
print(f"R² Score: {r2:.4f}")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best Parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
R² Score: 0.2526
MSE: 70159.9389
RMSE: 264.8772


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.api import VAR

# Load and preprocess data
file_path = '/content/combined_data.csv'  # Update with your actual file path
data = pd.read_csv(file_path)

# Parse timestamps
data['event_timestamp'] = pd.to_datetime(data['event_timestamp'], dayfirst=True, errors='coerce')

# Select relevant features for VAR
features = [
    'price', 'num_sales', 'eth_price', 'rarity_score',
    'DistanceToDistrict', 'DistanceToRoad', 'DistanceToPlaza',
    'sentiment_1_week_before', 'sentiment_2_weeks_before',
    'sentiment_3_weeks_before', 'sentiment_1_month_before',
    'Bitcoin_Price', 'Ether_Price', 'Gold_Price', 'Crude_Price'
]

# Drop rows with missing values in selected features
data_cleaned = data[features].dropna()

# Scale data
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data_cleaned)

# Convert to DataFrame for VAR compatibility
data_scaled_df = pd.DataFrame(data_scaled, columns=features)

# Split data into train and test sets
train_size = int(len(data_scaled_df) * 0.8)
train, test = data_scaled_df[:train_size], data_scaled_df[train_size:]

# VAR Model Implementation
def var_forecast(train, test, steps=10):
    model = VAR(train)
    model_fitted = model.fit(maxlags=15, ic='aic')  # Automatically determine optimal lag
    forecast = model_fitted.forecast(train.values[-model_fitted.k_ar:], steps=steps)
    return forecast, model_fitted

# Perform forecasting
forecast, model_fitted = var_forecast(train, test)

# Evaluate using RMSE
forecast_df = pd.DataFrame(forecast, columns=features)
target_test = test['price'].iloc[:len(forecast_df)]
target_forecast = forecast_df['price']

rmse = np.sqrt(mean_squared_error(target_test, target_forecast))
print(f'VAR RMSE: {rmse}')

# Optionally, view model summary
print(model_fitted.summary())


VAR RMSE: 0.0010754180709257836
  Summary of Regression Results   
Model:                         VAR
Method:                        OLS
Date:           Sat, 16, Nov, 2024
Time:                     11:53:17
--------------------------------------------------------------------
No. of Equations:         15.0000    BIC:                   -67.5632
Nobs:                     3999.00    HQIC:                  -67.8070
Log likelihood:           50972.8    FPE:                3.11628e-30
AIC:                     -67.9409    Det(Omega_mle):     2.93511e-30
--------------------------------------------------------------------
Results for equation price
                                 coefficient       std. error           t-stat            prob
----------------------------------------------------------------------------------------------
const                               0.000519         0.004159            0.125           0.901
L1.price                           -0.000960         0.015817      

In [10]:
from sklearn.metrics import mean_absolute_error

# Function to calculate MAE
def var_forecast_with_mae(train, test, steps=10):
    model = VAR(train)
    model_fitted = model.fit(maxlags=15, ic='aic')
    forecast = model_fitted.forecast(train.values[-model_fitted.k_ar:], steps=steps)

    return forecast

# Get forecasted values
var_forecast_result = var_forecast_with_mae(train, test)

# Calculate MAE
mae = mean_absolute_error(test.iloc[:len(var_forecast_result), 0], var_forecast_result[:, 0])
print(f'VAR MAE: {mae}')


VAR MAE: 0.0010273461208635429
