In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('datasets/diamonds_processed.csv', index_col=0)
data.head()

Unnamed: 0_level_0,table,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,55.0,326,0,0,1,0,0,0,1,0,0,0,0,0,-1.198168,-0.174092,-1.587837,-1.536196,-1.571129
2,61.0,326,0,0,0,1,0,0,1,0,0,0,0,0,-1.240361,-1.360738,-1.641325,-1.658774,-1.741175
4,65.0,327,0,1,0,0,0,0,1,0,0,0,0,0,-1.198168,-3.385019,-1.498691,-1.457395,-1.741175
3,58.0,334,0,0,0,1,0,0,0,0,0,0,1,0,-1.071587,0.454133,-1.364971,-1.317305,-1.28772
1,58.0,335,0,1,0,0,0,0,0,0,0,0,0,1,-1.029394,1.082358,-1.240167,-1.212238,-1.117674


In [3]:
from sklearn.linear_model import LinearRegression

def linear_model(x_train, y_train):
    print("Linear Regression")
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    
    return linear_regression

In [10]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    print("Lasso Regression")
    lasso_regression = Lasso(alpha=0.8, max_iter=10000)
    lasso_regression.fit(x_train, y_train)
    
    return lasso_regression

In [11]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    print("Ridge Regression")
    ridge_regression = Ridge(alpha=0.9)
    ridge_regression.fit(x_train, y_train)
    
    return ridge_regression

In [7]:
def build_and_train_model(data, target_name, reg_fn):
    X = data.drop(target_name, axis=1)
    Y = data[target_name]
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
    
    model = reg_fn(x_train, y_train)
    
    score = model.score(x_train, y_train)
    print("Training score: ", score)
    
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    print("Testing score: ", r_score)
    
    return {
        "model": model,
        "x_train": x_train, "x_test": x_test,
        "y_train": y_train, "y_test": y_test,
        "y_pred": y_pred
    }

In [8]:
linear_reg = build_and_train_model(data, "price", linear_model)

Linear Regression
Training score:  0.8776870525541944
Testing score:  0.8801752378301545


In [12]:
lasso_reg = build_and_train_model(data, "price", lasso_model)

Lasso Regression
Training score:  0.8776814115187282
Testing score:  0.8800849828465309


In [13]:
ridge_reg = build_and_train_model(data, "price", ridge_model)

Ridge Regression
Training score:  0.8776928171798004
Testing score:  0.8801498750599586


In [16]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    print("SGD Regression")
    sgd_regression = SGDRegressor(max_iter=2000)
    sgd_regression.fit(x_train, y_train)
    
    return sgd_regression

In [17]:
sgd_reg = build_and_train_model(data, "price", sgd_model)

SGD Regression
Training score:  -14787.255845237187
Testing score:  -15177.017452493312
