In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Load the raw data set from disk.
dataset = pd.read_csv("linear_interpolation_comp.csv")

# Re-express 'snapped_at' as fractional days elapsed since the earliest
# timestamp in the file, so the regression receives a plain numeric feature.
snapshot_times = pd.to_datetime(dataset['snapped_at'])
one_day = np.timedelta64(1, 'D')
dataset['snapped_at'] = (snapshot_times - snapshot_times.min()) / one_day

# Predictor columns and the regression target.
# NOTE(review): market_cap is presumably closely tied to price - confirm it is
# a legitimate predictor rather than a source of target leakage.
feature_columns = ['snapped_at', 'market_cap', 'total_volume']
X = dataset[feature_columns]
y = dataset['price']

# Stage 1: keep 70% of the rows for training and hold out the other 30%.
# The same seed is reused for both stages so the partition is reproducible.
split_seed = 100
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=split_seed
)

# Stage 2: cut the hold-out in half, giving a validation set and a test set
# (each roughly 15% of the original rows).
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=split_seed
)

# Train an ordinary least-squares model on the training split; fit() returns
# the estimator itself, so construction and fitting can be chained.
mlr = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the validation and test splits.
y_pred_val, y_pred_test = mlr.predict(X_val), mlr.predict(X_test)

# Report regression quality on the validation split.
# Every metric is derived once from the predictions already held in
# y_pred_val, instead of re-running the model (mlr.score) or computing the
# mean squared error twice.
val_r2 = metrics.r2_score(y_val, y_pred_val)
val_mae = metrics.mean_absolute_error(y_val, y_pred_val)
val_mse = metrics.mean_squared_error(y_val, y_pred_val)

print("Metrics for Validation Set:")
# R^2 is multiplied by 100, so the printed figure is a percentage.
print('R squared: {:.2f}'.format(val_r2 * 100))
print('Mean Absolute Error:', val_mae)
print('Mean Square Error:', val_mse)
# RMSE is the square root of the MSE computed above.
print('Root Mean Square Error:', np.sqrt(val_mse))


Metrics for Validation Set:
R squared: 99.93
Mean Absolute Error: 278.1223797873731
Mean Square Error: 171251.2735506616
Root Mean Square Error: 413.82517268849364
