In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline
# Switch on seaborn's default theme for all subsequent plots.
sns.set()

In [2]:
# Load the dataset; the relative path means the CSV must sit in the
# notebook's working directory.
data = pd.read_csv('Abalone.csv')

In [3]:
# Preview the first rows to check the columns and value ranges.
data.head()

Unnamed: 0,Length,Height,Whole_height,Age
0,0.455,0.095,0.514,16.5
1,0.35,0.09,0.2255,8.5
2,0.53,0.135,0.677,10.5
3,0.44,0.125,0.516,11.5
4,0.33,0.08,0.205,8.5


In [4]:
from sklearn.model_selection import train_test_split

In [13]:
# Features: every column except the 'Age' target.
x = data.drop(columns='Age')
# Target as a 2-D column vector of shape (n_samples, 1).
y = data['Age'].to_numpy().reshape(-1, 1)

In [14]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [15]:
from sklearn.preprocessing import StandardScaler

In [16]:
# Scaler used to standardize the feature columns before modelling.
scl = StandardScaler()

In [17]:
# Fit the scaler on the training features only and transform them in one step,
# so no statistics from the test split leak into the scaling.
train_x_scl = scl.fit_transform(train_x)

In [18]:
# Inspect the scaled training features (a NumPy array, one column per feature).
train_x_scl

array([[ 0.50137339,  0.27277904,  0.21395674],
       [ 0.54336237,  0.4032712 ,  0.08346344],
       [-1.80802061, -1.68460335, -1.38254722],
       ...,
       [-0.00249439, -0.24918959, -0.43035392],
       [ 0.4593844 ,  0.01179472, -0.26927625],
       [ 0.58535135,  0.53376336,  0.6268457 ]])

In [19]:
# Apply the scaler already fitted on the training data; it is not re-fitted here.
test_x_scl = scl.transform(test_x)

In [20]:
# Inspect the scaled test features.
test_x_scl

array([[-0.08647236, -0.11869744, -0.41506174],
       [-0.17045032,  0.14228688, -0.37122414],
       [-1.01022996, -0.90165039, -0.95946347],
       ...,
       [-1.76603163, -1.55411119, -1.33157327],
       [-0.92625199, -0.51017391, -1.01349585],
       [-0.59034014, -1.16263471, -0.65056136]])

In [12]:
from sklearn.linear_model import LinearRegression

In [13]:
# Ordinary least-squares baseline with default settings.
lin_reg = LinearRegression()

In [15]:
# Train on the standardized features; train_y is the (n_samples, 1) target column.
lin_reg.fit(train_x_scl, train_y)

LinearRegression()

In [28]:
from sklearn.metrics import mean_squared_error

In [29]:
def evaluation(model, model_name):
    """Print the train and test RMSE of a fitted regressor.

    The function reads the notebook-level arrays ``train_x_scl``,
    ``test_x_scl``, ``train_y`` and ``test_y``, so those cells must have
    been executed before it is called.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X)``.
    model_name : str
        Label printed in the report header.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    train_pred = model.predict(train_x_scl)
    test_pred = model.predict(test_x_scl)
    # Take the square root explicitly: the ``squared=False`` flag of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) works on every version.
    train_rmse = np.sqrt(mean_squared_error(train_y, train_pred))
    test_rmse = np.sqrt(mean_squared_error(test_y, test_pred))

    print(f'Model: {model_name}')
    print(f'Train RMSE    : {train_rmse}')
    print(f'Test RMSE     : {test_rmse}')

In [54]:
# NOTE(review): evaluation() always predicts on the scaled arrays, so a genuine
# "without normalization" baseline would need a model fitted and scored on the
# raw features — confirm which model produced the stored output.
evaluation(lin_reg, 'Linear Regression')  # without normalization

Model: Linear Regression
Train RMSE    : 2.5678029408911955
Test RMSE     : 2.520637095978111


In [21]:
# Score the linear model that was fitted on the standardized features.
evaluation(lin_reg, 'Linear Reg with Normalization')

Model: Linear Reg with Normalization
Train RMSE    : 2.5678029408911955
Test RMSE     : 2.520637095978111


### SVM Model

In [21]:
from sklearn.svm import SVR

In [22]:
# Support vector regressor with the library's default settings.
svr = SVR()

In [27]:
# ravel() flattens the (n_samples, 1) target column into a 1-D array for fitting.
svr.fit(train_x_scl, train_y.ravel())

SVR()

In [30]:
# Report train/test RMSE for the SVR model on the scaled data.
evaluation(svr, 'SVR')

Model: SVR
Train RMSE    : 2.5787514464647225
Test RMSE     : 2.5262214685874986


In [31]:
import lightgbm as lgb

In [32]:
# Training parameters for LightGBM; anything not listed keeps its library default.
params = {
    # Numeric value rather than the string '0.01', so the type matches what
    # the parameter represents and can be used in arithmetic or tuning code.
    'learning_rate': 0.01,
}

In [35]:
# Wrap the scaled training features and the flattened 1-D target in a LightGBM Dataset.
train_data = lgb.Dataset(train_x_scl, label=train_y.ravel())

In [40]:
# Train a booster for 100 boosting rounds using the parameters defined above.
lgb_reg = lgb.train(
    params,
    train_data,
    num_boost_round=100,
)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 424
[LightGBM] [Info] Number of data points in the train set: 3131, number of used features: 3
[LightGBM] [Info] Start training from score 11.479559


In [41]:
# Predict on the scaled test features with the trained booster.
lgb_pred = lgb_reg.predict(test_x_scl)

In [43]:
# Test RMSE for the LightGBM model. The square root is taken explicitly because
# mean_squared_error's ``squared=False`` flag was deprecated in scikit-learn 1.4
# and removed in 1.6.
print(np.sqrt(mean_squared_error(test_y, lgb_pred)))

2.57945663942159
