In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('datasets/diamonds_processed.csv', index_col= 0)
data.head()

Unnamed: 0_level_0,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
5,1071,0,0,1,0,0,0,1,0,0,0,0,0,-0.969107,0.38053,-0.203029,-1.12099,-1.063302,-1.095087
2,6694,0,0,0,1,0,0,0,0,0,0,1,0,1.238266,0.24024,-0.203029,1.320195,1.089177,1.285672
2,5280,0,0,0,0,1,0,0,0,0,0,1,0,0.838836,0.590965,0.346533,0.865814,0.811964,0.955808
1,1208,0,0,0,1,0,0,0,0,1,0,0,0,-0.50661,-1.232802,1.170875,-0.230046,-0.264276,-0.621804
4,7114,0,0,0,1,0,0,0,1,0,0,0,0,0.418384,0.590965,0.254939,0.562893,0.534751,0.654627


In [3]:
from sklearn.linear_model import LinearRegression
def linear_model(x_train, y_train):
    print("Linear Regression")
    linear_regression = LinearRegression()
    linear_regression.fit(x_train, y_train)
    return linear_regression


In [4]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    print("Lasso Regression")
    lasso_regression = Lasso(alpha=0.8, max_iter=10000) # alpha determines the strength of regularization
    lasso_regression.fit(x_train, y_train)
    return lasso_regression

In [5]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    print("Ridge Regression")
    ridge_regression = Ridge(alpha= 0.9)
    ridge_regression.fit(x_train, y_train)
    return ridge_regression

In [10]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    print("SGD Regression")
    sgd_regression = SGDRegressor(max_iter=2000)
    sgd_regression.fit(x_train, y_train)
    return sgd_regression

In [6]:
def build_and_train_model(data, target_name, reg_fn):
    X= data.drop(target_name, axis= 1)
    Y= data[target_name]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size= 0.2, random_state= 0)

    model= reg_fn(x_train, y_train)

    score= model.score(x_train, y_train)
    print("Training Score:", score)

    y_pred= model.predict(x_test)
    r_score= r2_score(y_test, y_pred)
    print("Testing Score:", r_score)

    return{
        'model': model,
        'x_train': x_train,
        'x_test': x_test,
        'y_train': y_train,
        'y_test': y_test,
        'y_pred': y_pred
        }

In [7]:
linear_reg = build_and_train_model(data, "price", linear_model)

Linear Regression
Training Score: 0.8847335111564048
Testing Score: 0.8633595029664004


In [8]:
lasso_reg = build_and_train_model(data, "price", lasso_model)

Lasso Regression
Training Score: 0.8847081608094955
Testing Score: 0.8633648261188671


In [9]:
ridge_reg = build_and_train_model(data, "price", ridge_model)

Ridge Regression
Training Score: 0.8847327876834433
Testing Score: 0.8633537873107049


In [12]:
sgd_reg = build_and_train_model(data, "price", sgd_model)

SGD Regression
Training Score: 0.8843951639226753
Testing Score: 0.8632245778376872
