In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# --- Load ------------------------------------------------------------------
# Read the price history from the CSV into a DataFrame.
dataset = pd.read_csv("linear_interpolation_removed.csv")

# --- Time feature ----------------------------------------------------------
# Parse the timestamps, then re-express each one as the number of days elapsed
# since the earliest row, so the regression receives a plain numeric column.
snapped_at_ts = pd.to_datetime(dataset['snapped_at'])
one_day = np.timedelta64(1, 'D')
dataset['snapped_at'] = (snapped_at_ts - snapped_at_ts.min()) / one_day

# --- Features / target -----------------------------------------------------
feature_columns = ['snapped_at', 'market_cap', 'total_volume']
X = dataset[feature_columns]
y = dataset['price']

# --- Train / validation / test split ---------------------------------------
# First set 30% of the rows aside as a hold-out (70% stays for training) ...
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=100
)
# ... then cut that hold-out in half: one half for validation, one for testing.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=100
)

# --- Model -----------------------------------------------------------------
# Multiple linear regression fitted on the training rows only.
mlr = LinearRegression().fit(X_train, y_train)

# --- Predictions -----------------------------------------------------------
y_pred_val = mlr.predict(X_val)
y_pred_test = mlr.predict(X_test)  # computed here, but not reported in this cell

# --- Validation metrics ----------------------------------------------------
val_r2_pct = mlr.score(X_val, y_val) * 100  # R^2 scaled by 100 before printing
val_mae = metrics.mean_absolute_error(y_val, y_pred_val)
val_mse = metrics.mean_squared_error(y_val, y_pred_val)
val_rmse = np.sqrt(val_mse)

print("Metrics for Validation Set:")
print('R squared: {:.2f}'.format(val_r2_pct))
print('Mean Absolute Error:', val_mae)
print('Mean Square Error:', val_mse)
print('Root Mean Square Error:', val_rmse)


Metrics for Validation Set:
R squared: 99.93
Mean Absolute Error: 273.15358749312907
Mean Square Error: 166681.55864999047
Root Mean Square Error: 408.26652893666227


In [2]:
# Re-read the CSV into `dataset` (replacing the frame whose `snapped_at`
# column was already converted in the first cell) and preview the first 5 rows.
dataset = pd.read_csv(filepath_or_buffer="linear_interpolation_removed.csv")
dataset.head(n=5)

Unnamed: 0,snapped_at,price,market_cap,total_volume
0,2017-01-01 00:00:00,1019.1988,16387930000.0,4444828000.0
1,2017-01-02 00:00:00,1019.1988,16387930000.0,4444828000.0
2,2017-01-03 00:00:00,1035.5313,16652720000.0,4399421000.0
3,2017-01-04 00:00:00,1130.849516,18187660000.0,8705359000.0
4,2017-01-05 00:00:00,990.6675,15934890000.0,12102750000.0


In [3]:
# Turn `snapped_at` into a numeric feature: parse the timestamps, then measure
# each row as (fractional) days elapsed since the earliest timestamp.
# NOTE: the column is overwritten in place, so running this cell a second time
# without re-reading the CSV would re-parse the already-converted numbers.
snapped_at_ts = pd.to_datetime(dataset['snapped_at'])
elapsed = snapped_at_ts - snapped_at_ts.min()
dataset['snapped_at'] = elapsed / np.timedelta64(1, 'D')
dataset.head()

Unnamed: 0,snapped_at,price,market_cap,total_volume
0,0.0,1019.1988,16387930000.0,4444828000.0
1,1.0,1019.1988,16387930000.0,4444828000.0
2,2.0,1035.5313,16652720000.0,4399421000.0
3,3.0,1130.849516,18187660000.0,8705359000.0
4,4.0,990.6675,15934890000.0,12102750000.0


In [4]:
# Predictors: elapsed days, market cap and traded volume. Target: price.
feature_columns = ['snapped_at', 'market_cap', 'total_volume']
target_column = 'price'

X = dataset[feature_columns]
y = dataset[target_column]

In [5]:
from sklearn.model_selection import train_test_split
# 70% of the rows go to training and 30% to testing; the fixed random_state
# makes the shuffle (and therefore the split) repeatable.
split_options = dict(train_size=0.7, test_size=0.3, random_state=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [6]:
from sklearn.linear_model import LinearRegression
# Fit the multiple linear regression model on the training split.
# `fit` is left as the cell's last expression so the fitted estimator is shown.
mlr = LinearRegression()
mlr.fit(X=X_train, y=y_train)

In [7]:
# Show the fitted intercept, then pair every feature name with its coefficient.
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
[(feature_name, weight) for feature_name, weight in zip(X.columns, mlr.coef_)]

Intercept:  948.8481710653359
Coefficients:


[('snapped_at', -0.8114347188066674),
 ('market_cap', 5.286848215406634e-08),
 ('total_volume', 1.1404627092772172e-08)]

In [8]:
# Run the fitted model over the test-split features ...
y_pred_mlr = mlr.predict(X_test)

# ... and print the full array of predicted values.
prediction_report = "Prediction for test set: {}".format(y_pred_mlr)
print(prediction_report)

Prediction for test set: [60872.97262248 24869.57077836 22857.43608565 17453.36600433
 20557.85357969  7121.90669847  9061.87005652  4286.93645381
 30375.88043525 57932.98980883  6815.89800078 21555.37764351
  8319.16194332 21272.06396967 26862.62932329  9851.90164945
  8761.29839701  1764.94689495 47350.97864791  9822.42588587
 21132.74920618  6579.02985029 23971.29636197 46479.81267385
  4186.3655762  56958.72351407 27205.08426097 57105.361648
 34962.14043307 27169.15328431  7986.12558712 55331.53575938
 21923.2306926  55669.53969116  7199.08292731  8765.33867884
  8998.32761845  9876.67763976 21338.30001329  1958.46772047
  8994.26572234  1749.73507422  2632.397267   51158.18715061
  7176.77772697 10337.79642629 26563.62922865  8447.62648847
 47058.2570425  47570.30767835 26638.37925231  9097.72091266
 12477.34239102  7213.8208091   9237.81321059 10733.84460303
 13798.13665741  8540.23017017  9761.05125058 58373.0203595
 55320.04218548  6944.787183    9053.08207143  9844.11102478
 1

In [9]:
# Put the true test-set prices next to the model's predictions for those rows.
# `y_test` keeps its original row labels, so the frame is indexed by them.
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})

# Only the last expression of a cell is rendered, so a bare `mlr_diff.head()`
# placed before this line was evaluated and silently discarded. It has been
# removed; call `mlr_diff.head()` in its own cell to inspect the first rows.
mlr_diff.tail()

Unnamed: 0,Actual value,Predicted value
2104,22377.516769,22302.87788
2203,20038.3778,19881.107054
1371,10572.674843,10423.068081
2227,21630.122113,21470.772101
2383,24862.561486,24797.708825


In [10]:
from sklearn import metrics
# Evaluate the fitted model on the test split.
#
# R^2 is scored on (X_test, y_test) so that it describes exactly the same rows
# as the error metrics below. Scoring on the full (X, y) would include the rows
# the model was trained on and would not be comparable with MAE / MSE / RMSE.
r2_test = mlr.score(X_test, y_test)
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(meanSqErr)  # reuse the MSE instead of recomputing it

print('R squared: {:.2f}'.format(r2_test * 100))  # R^2 scaled by 100 for display
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 99.94
Mean Absolute Error: 286.9809253573287
Mean Square Error: 187167.39868000525
Root Mean Square Error: 432.62847650149575
