In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Read the dataset
dataset = pd.read_csv("LINEAR_INTERPOLATION_MID_CSV.csv")

# Convert 'snapped_at' column to datetime
dataset['snapped_at'] = pd.to_datetime(dataset['snapped_at'])

# Convert 'snapped_at' to days since the minimum date
dataset['snapped_at'] = (dataset['snapped_at'] - dataset['snapped_at'].min()) / np.timedelta64(1, 'D')

# Split features and target variable
X = dataset[['snapped_at', 'market_cap', 'total_volume']]
y = dataset['price']

# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=100)

# Split the remaining data (30%) into validation and testing sets (50% each)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=100)

# Fit the Multiple Linear Regression model
mlr = LinearRegression()
mlr.fit(X_train, y_train)

# Prediction on validation set
y_pred_val = mlr.predict(X_val)

# Prediction on test set
y_pred_test = mlr.predict(X_test)

# Print metrics for validation set
print("Metrics for Validation Set:")
print('R squared: {:.2f}'.format(mlr.score(X_val, y_val) * 100))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_val, y_pred_val))
print('Mean Square Error:', metrics.mean_squared_error(y_val, y_pred_val))
print('Root Mean Square Error:', np.sqrt(metrics.mean_squared_error(y_val, y_pred_val)))


Metrics for Validation Set:
R squared: 99.92
Mean Absolute Error: 288.10768077123225
Mean Square Error: 194785.0048913152
Root Mean Square Error: 441.3445421564826


In [2]:
dataset = pd.read_csv("LINEAR_INTERPOLATION_MID_CSV.csv")
dataset.head()

Unnamed: 0,snapped_at,price,market_cap,total_volume
0,2017-01-01 00:00:00,998.05,16045990000.0,3470830000.0
1,2017-01-02 00:00:00,976.67275,15703800000.0,3649608000.0
2,2017-01-03 00:00:00,955.2955,15361610000.0,3828385000.0
3,2017-01-04 00:00:00,933.91825,15019420000.0,4007162000.0
4,2017-01-05 00:00:00,912.541,14677230000.0,4185940000.0


In [3]:
dataset['snapped_at'] = pd.to_datetime(dataset['snapped_at'])    
dataset['snapped_at'] = (dataset['snapped_at'] - dataset['snapped_at'].min())  / np.timedelta64(1,'D')
dataset.head()

Unnamed: 0,snapped_at,price,market_cap,total_volume
0,0.0,998.05,16045990000.0,3470830000.0
1,1.0,976.67275,15703800000.0,3649608000.0
2,2.0,955.2955,15361610000.0,3828385000.0
3,3.0,933.91825,15019420000.0,4007162000.0
4,4.0,912.541,14677230000.0,4185940000.0


In [4]:
X = dataset[['snapped_at', 'market_cap', 'total_volume']]
y = dataset['price']

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, 
                                                    test_size = 0.3, random_state = 100)

In [6]:
from sklearn.linear_model import LinearRegression
#Fitting the Multiple Linear Regression model
mlr = LinearRegression()  
mlr.fit(X_train, y_train)

In [7]:
print("Intercept: ", mlr.intercept_)
print("Coefficients:")
list(zip(X, mlr.coef_))

Intercept:  1095.4733924811371
Coefficients:


[('snapped_at', -0.9118952135858756),
 ('market_cap', 5.3071712488063026e-08),
 ('total_volume', 8.888853881113282e-09)]

In [8]:
#Prediction of test set
y_pred_mlr= mlr.predict(X_test)
#Predicted values
print("Prediction for test set: {}".format(y_pred_mlr))

Prediction for test set: [44465.73123726 31521.77810164 26709.8744442  26244.70502325
 24685.06961506  7303.72732352  9030.34561839  4397.84036549
 20291.55540258 61444.68508946  6611.76336601 18701.56848005
  6661.10965645 23536.53624056 23027.20383744 10685.68863566
  8323.35199349  1857.94752953 35211.36111302 10851.43722677
 28465.37520044  7440.18524467 26159.82578716 42482.51698451
  4291.35490351 43379.68533695 26331.350184   52965.22089189
 34952.19303713 26486.52241971  6802.64443234 57167.5080282
 27729.16598992 52749.96474178  7043.12155731  9256.51756875
  9018.55870369 11199.74422651 23438.32727305  1856.01236446
  6977.62315236  1803.079132    2445.52573469 51148.37317651
  7188.33373381 11692.79772681 23024.2316543   9193.43048725
 46590.60365092 45733.86658099 23648.00691545  6941.20343492
 12090.59411934  7073.6425682   8668.30653495  8341.71903513
 12406.73742637  7326.18940558  9404.85412781 58466.78546538
 45693.43529849  7667.61511324  8149.33470978  9880.79363791


In [9]:
#Actual value and the predicted value
mlr_diff = pd.DataFrame({'Actual value': y_test, 'Predicted value': y_pred_mlr})
mlr_diff.head()
mlr_diff.tail()

Unnamed: 0,Actual value,Predicted value
2104,20161.830615,19943.138119
2203,18866.810331,18798.715009
1371,11619.53308,11432.942239
2227,22946.286579,22809.090349
2383,30620.814247,30591.259654


In [10]:
from sklearn import metrics
meanAbErr = metrics.mean_absolute_error(y_test, y_pred_mlr)
meanSqErr = metrics.mean_squared_error(y_test, y_pred_mlr)
rootMeanSqErr = np.sqrt(metrics.mean_squared_error(y_test, y_pred_mlr))
print('R squared: {:.2f}'.format(mlr.score(X,y)*100))
print('Mean Absolute Error:', meanAbErr)
print('Mean Square Error:', meanSqErr)
print('Root Mean Square Error:', rootMeanSqErr)

R squared: 99.93
Mean Absolute Error: 302.25499503427136
Mean Square Error: 216148.04081534926
Root Mean Square Error: 464.91724082394416
