## Step 1. General Imports

In [60]:
from sklearn.model_selection import train_test_split
from sklearn import linear_model
import pandas as pd

from sklearn.metrics import max_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import d2_absolute_error_score

## Step 2. Data Reading

In [41]:
data = pd.read_csv('./data/diamond.csv')

## Step 3. Reducing & Filtering data

### 3.1 Reducing data by dropping missing elements in the "Price" column

In [45]:
data['Price'].dropna()

0        5169
1        3470
2        3183
3        4370
4        3171
        ...  
5995     6250
5996     5328
5997     6157
5998    11206
5999    30507
Name: Price, Length: 6000, dtype: int64

### 3.2 Filtering data by replacing NaN values to "missing"

In [46]:
data.fillna(value="missing")

Unnamed: 0,Carat Weight,Cut,Color,Clarity,Polish,Symmetry,Report,Price
0,1.10,Ideal,H,SI1,VG,EX,GIA,5169
1,0.83,Ideal,H,VS1,ID,ID,AGSL,3470
2,0.85,Ideal,H,SI1,EX,EX,GIA,3183
3,0.91,Ideal,E,SI1,VG,VG,GIA,4370
4,0.83,Ideal,G,SI1,EX,EX,GIA,3171
...,...,...,...,...,...,...,...,...
5995,1.03,Ideal,D,SI1,EX,EX,GIA,6250
5996,1.00,Very Good,D,SI1,VG,VG,GIA,5328
5997,1.02,Ideal,D,SI1,EX,EX,GIA,6157
5998,1.27,Signature-Ideal,G,VS1,EX,EX,GIA,11206


## Step 4. Initialising X & Y variables

In [47]:
X = data.drop('Price', axis=1)
y = data['Price']

## Step 5. Splitting our data on train & test

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5)

In [52]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3000, 7), (3000, 7), (3000,), (3000,))

## Step 6. Bring the variables X and Y to the correct format 

In [53]:
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

## Step 7. Initialising & fitting model

### Step 7.1. Initialising the regression model.

In [56]:
model = linear_model.LassoLars(alpha=0.1)

### Step 7.2. Fitting the regression model.

In [57]:
model.fit(X_train, y_train)

# ———————————————————

## Scoring

In [61]:
y_preds = model.predict(X_test)

### R2 score

In [63]:
r2_score_result = r2_score(y_test, y_preds)
r2_score_result

0.8659302391430095

### Max error score

In [65]:
max_error_result = max_error(y_test, y_preds)
max_error_result

45068.58627998915

### Explained variance score

In [66]:
explained_variance_score_result = explained_variance_score(y_test, y_preds)
explained_variance_score_result

0.8659348634749475