In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.metrics import mean_absolute_error
SEED = 24

In [7]:
train_raw = pd.read_csv("train.csv")
test_raw = pd.read_csv("test.csv")

In [8]:
df_train = train_raw.copy()

# Processing data

## Change name

In [10]:
df_train.columns = ["id", "sex", "length", "diameter", "height", "weight", "shucked_weight", "viscera_weight", "shell_weight", "age"]

In [12]:
df_train.head()

Unnamed: 0,id,sex,length,diameter,height,weight,shucked_weight,viscera_weight,shell_weight,age
0,0,I,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928,9
1,1,I,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194,8
2,2,M,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133,9
3,3,F,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885,11
4,4,I,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395,8


## Feature engineering
### Ratio Features 
- relative size differences between different components

In [15]:
df_train['shell_ratio'] = df_train['shell_weight'] / df_train['weight']
df_train['shell_to_body_ratio'] = df_train['shell_weight'] / (df_train['weight'] + df_train['shell_weight'])
df_train['meat_yield'] = df_train['shucked_weight'] / (df_train['weight'] + df_train['shell_weight'])
df_train['length_to_dinameter_ratio'] = df_train['length'] / df_train['diameter']
df_train['weight_to_viscera_weight'] = df_train['weight'] / df_train['viscera_weight']
df_train['weight_to_shellWeight_ratio'] = df_train['weight'] / df_train['shell_weight']
df_train['weight_to_shuckedWeight_ratio'] = df_train['weight'] / df_train['shucked_weight']

### Geometric Features
- capture physical properties of the crab

In [18]:
df_train["surface_area"] = 2*(df_train["length"] * df_train["diameter"] + df_train["length"] * df_train["height"] + df_train["diameter"] * df_train["height"])
df_train["volume"] = df_train["length"]*df_train["diameter"]*df_train["height"]
df_train["density"] = df_train["weight"] / df_train["volume"]
df_train["pseudo_BMI"] = df_train["weight"] / (df_train["height"]**2)

### Polynomial features
capture any non-linear relationship between predictors

In [20]:
df_train['length_poly'] = df_train['length']**2
df_train['diameter_poly'] = df_train['diameter']**2

### Logarithmic Transformations
- reduce skewness and manage extreme values

In [None]:
df_train['log_weight'] = l

In [21]:

math.log(1)

0.0

## One hot data

In [4]:
df_train = pd.get_dummies(train_raw, columns = ["Sex"])
y = df_train["Age"]
X = df_train.drop(["id","Age"], axis=1)

## Split train & test

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)

## Test with base linear regression model

In [6]:
def Evaluating_model(y_test, y_pred):
    r2_test = float(format(r2_score(y_test,y_pred),'.3f'))
    RMSE = float(format(mean_squared_error(y_test,y_pred,squared=False),'0.3f'))
    MAE = float(format(mean_absolute_error(y_test, y_pred),'0.3f'))
    return (r2_test, RMSE, MAE)

In [9]:
base_lr = LinearRegression()
base_lr = base_lr.fit(X_train,y_train)
y_pred = base_lr.predict(X_test)
print("MAE: ", mean_absolute_error(y_test, y_pred))

MAE:  1.4824897525481464


In [10]:
Evaluating_model(y_test, y_pred)

(0.555, 2.137, 1.482)

In [13]:
def test_model_linear(X, y, name_model, model, Details):
    
    Scalers = [['None',False],['MinMaxScaler',MinMaxScaler()],['StandarScaler',StandardScaler()]]
    
    df_res = pd.DataFrame(index = None, columns =['Model','Details','Scaler','Feature_selection','R_score (train)','R_score (test)',
                                                  'RMSE','MAE','5-Fold Cross Validation'])
    for scaler in Scalers:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)
        
        if scaler[0] != 'None':
            sc = scaler[1].fit(X_train)
            X_train = sc.transform(X_train)
            X_test = sc.transform(X_test)
            
        sum_cols = X.columns.size
        
        for num_cols in range(3,sum_cols+1):
            print('\r', end='')
            print(scaler[0]+': '+str(num_cols)+'/'+str(sum_cols), end='')
            
            kBest = SelectKBest(f_regression,k = num_cols)
            kBest.fit(X_train, y_train)
            
            X_train_KBest = kBest.transform(X_train)
            X_test_KBest = kBest.transform(X_test)
           
            mod = model
            mod = model.fit(X_train_KBest, y_train)
            
            y_pred = mod.predict(X_test_KBest)
        
            r2_train = r2_score(y_train, mod.predict(X_train_KBest))
            r2_test, RMSE, MAE = Evaluating_model(y_test, y_pred)
            fcv = float(format(cross_val_score(mod, X_test, y_test).mean(),'.3f'))
            df_res = df_res.append(pd.Series({'Model':name_model,
                                              'Details':Details,
                                              'Scaler':scaler[0],
                                              'Feature_selection':num_cols,
                                              'R_score (train)':r2_train,
                                              'R_score (test)':r2_test,
                                              'RMSE':RMSE,'MAE':MAE,
                                              '5-Fold Cross Validation':fcv
                                             }),ignore_index = True)
    return df_res

In [14]:
df_lr =  test_model_linear(X, y, "LinearRegression",LinearRegression(), "")

StandarScaler: 10/10

In [21]:
df_lr.sort_values(by= ["MAE"])

Unnamed: 0,Model,Details,Scaler,Feature_selection,R_score (train),R_score (test),RMSE,MAE,5-Fold Cross Validation
23,LinearRegression,,StandarScaler,10,0.549686,0.555,2.137,1.482,0.555
6,LinearRegression,,,9,0.549688,0.555,2.137,1.482,0.555
7,LinearRegression,,,10,0.549688,0.555,2.137,1.482,0.555
15,LinearRegression,,MinMaxScaler,10,0.549685,0.555,2.137,1.482,0.555
14,LinearRegression,,MinMaxScaler,9,0.549688,0.555,2.137,1.482,0.555
22,LinearRegression,,StandarScaler,9,0.549688,0.555,2.137,1.482,0.555
21,LinearRegression,,StandarScaler,8,0.549444,0.555,2.137,1.483,0.555
5,LinearRegression,,,8,0.549444,0.555,2.137,1.483,0.555
13,LinearRegression,,MinMaxScaler,8,0.549444,0.555,2.137,1.483,0.555
20,LinearRegression,,StandarScaler,7,0.498061,0.499,2.268,1.571,0.555


# Feature engineering

In [51]:
df_train

Unnamed: 0,id,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight,Age,Sex_F,Sex_I,Sex_M
0,0,1.5250,1.1750,0.3750,28.973189,12.728926,6.647958,8.348928,9,0,1,0
1,1,1.1000,0.8250,0.2750,10.418441,4.521745,2.324659,3.401940,8,0,1,0
2,2,1.3875,1.1125,0.3750,24.777463,11.339800,5.556502,6.662133,9,0,0,1
3,3,1.7000,1.4125,0.5000,50.660556,20.354941,10.991839,14.996885,11,1,0,0
4,4,1.2500,1.0125,0.3375,23.289114,11.977664,4.507570,5.953395,8,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
74046,74046,1.6625,1.2625,0.4375,50.660556,20.680960,10.361742,12.332033,10,1,0,0
74047,74047,1.0750,0.8625,0.2750,10.446791,4.323299,2.296310,3.543687,6,0,1,0
74048,74048,1.4875,1.2000,0.4125,29.483480,12.303683,7.540967,8.079607,10,1,0,0
74049,74049,1.2125,0.9625,0.3125,16.768729,8.972617,2.919999,4.280774,8,0,1,0


# Build piple model

In [34]:
def test_model(X, y, name_model, model, Details):
    
    Scalers = [['None',False],['MinMaxScaler',MinMaxScaler()],['StandarScaler',StandardScaler()]]
    
    df_res = pd.DataFrame(index = None, columns =['Model','Details','Scaler','Feature_selection','R_score (train)','R_score (test)',
                                              'RMSE','MAE','5-Fold Cross Validation'])
    
    summ = 8 # print %%%%
    i = 0
    for scaler in Scalers:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = SEED)
        
        if scaler[0] != 'None':
            sc = scaler[1].fit(X_train)
            X_train = sc.transform(X_train)
            X_test = sc.transform(X_test)
            
            
        sum_cols = X.columns.size
        
        lis_features = [8,9, 10]
        for num_cols in lis_features:
            print('\r', end='')
            print('{:.3f}'.format((i/summ)*100),'% ', end='')
            i+=1
            
            
            kBest = SelectKBest(f_regression,k = num_cols)
            kBest.fit(X_train, y_train)
            
            X_train_KBest = kBest.transform(X_train)
            X_test_KBest = kBest.transform(X_test)
           
            mod = model
            mod = model.fit(X_train_KBest, y_train)
            
            y_pred = mod.predict(X_test_KBest)
        
            r2_train = r2_score(y_train, mod.predict(X_train_KBest))
            r2_test, RMSE, MAE = Evaluating_model(y_test, y_pred)
            fcv = float(format(cross_val_score(mod, X_test, y_test).mean(),'.3f'))
            df_res = df_res.append(pd.Series({'Model':name_model,
                                              'Details':Details,
                                              'Scaler':scaler[0],
                                              'Feature_selection':num_cols,
                                              'R_score (train)':r2_train,
                                              'R_score (test)':r2_test,
                                              'RMSE':RMSE,'MAE':MAE,
                                              '5-Fold Cross Validation':fcv
                                             }),ignore_index = True)
    return df_res

## KNeighbors

In [27]:
df_KNR_0 = test_model(X, y,'KNeighbors Regressor',KNeighborsRegressor(),'None')

88.889 % 

In [None]:
df_SVR_0.sort_values(by = ["MAE"])

## Decistion Tree Regressor

In [49]:
df_DTR_0 = test_model(X.copy(), y.copy(),'Decision Tree Regressor', DecisionTreeRegressor(max_depth  = 10, min_samples_split=5, random_state = SEED),'None')

100.000 % 

In [50]:
df_DTR_0.sort_values(by = ["MAE"]).head()

Unnamed: 0,Model,Details,Scaler,Feature_selection,R_score (train),R_score (test),RMSE,MAE,5-Fold Cross Validation
0,Decision Tree Regressor,,,8,0.621293,0.543,2.167,1.467,0.432
3,Decision Tree Regressor,,MinMaxScaler,8,0.621293,0.542,2.168,1.467,0.433
2,Decision Tree Regressor,,,10,0.621774,0.541,2.17,1.468,0.432
5,Decision Tree Regressor,,MinMaxScaler,10,0.621774,0.542,2.169,1.468,0.433
6,Decision Tree Regressor,,StandarScaler,8,0.621293,0.542,2.169,1.468,0.432


## Random Forest Regression

In [33]:
df_RFR_0 = test_model(X.copy(), y.copy(),'Random Forest Regressor',
                    RandomForestRegressor(random_state = SEED),'None')

88.889 % 

In [36]:
df_RFR_0.sort_values(by = ["MAE"]).head()

Unnamed: 0,Model,Details,Scaler,Feature_selection,R_score (train),R_score (test),RMSE,MAE,5-Fold Cross Validation
1,Random Forest Regressor,,,9,0.938392,0.558,2.13,1.474,0.548
2,Random Forest Regressor,,,10,0.938415,0.558,2.13,1.475,0.548
4,Random Forest Regressor,,MinMaxScaler,9,0.938394,0.558,2.131,1.475,0.548
5,Random Forest Regressor,,MinMaxScaler,10,0.938406,0.558,2.131,1.475,0.548
7,Random Forest Regressor,,StandarScaler,9,0.938415,0.558,2.131,1.475,0.548


## linear SVR

In [39]:
df_SVR_0 = test_model(X.copy(), y.copy(),'LinearSVR',
                    LinearSVR(random_state = SEED),"None")


0.000 % 



12.500 % 



25.000 % 



100.000 % 

In [40]:
df_SVR_0.sort_values(by = ["MAE"]).head()

Unnamed: 0,Model,Details,Scaler,Feature_selection,R_score (train),R_score (test),RMSE,MAE,5-Fold Cross Validation
6,LinearSVR,,StandarScaler,8,0.527697,0.53,2.196,1.449,0.534
7,LinearSVR,,StandarScaler,9,0.527515,0.53,2.197,1.449,0.534
8,LinearSVR,,StandarScaler,10,0.52712,0.529,2.198,1.449,0.534
1,LinearSVR,,,9,0.527317,0.53,2.198,1.451,0.474
3,LinearSVR,,MinMaxScaler,8,0.524151,0.526,2.207,1.451,0.521


## GradientBoostingRegressor

In [31]:
df_GBR_0 = test_model(X.copy(), y.copy(),'GradientBoostingRegressor',
                    GradientBoostingRegressor(random_state = SEED),'None')

88.889 % 

In [38]:
df_GBR_0.sort_values(by = ["MAE"]).head()

Unnamed: 0,Model,Details,Scaler,Feature_selection,R_score (train),R_score (test),RMSE,MAE,5-Fold Cross Validation
1,GradientBoostingRegressor,,,9,0.585979,0.577,2.085,1.429,0.569
2,GradientBoostingRegressor,,,10,0.585978,0.577,2.085,1.429,0.569
4,GradientBoostingRegressor,,MinMaxScaler,9,0.585979,0.577,2.085,1.429,0.569
5,GradientBoostingRegressor,,MinMaxScaler,10,0.585978,0.577,2.085,1.429,0.569
7,GradientBoostingRegressor,,StandarScaler,9,0.585979,0.577,2.085,1.429,0.569


## Gen submission

In [46]:
def gen_submission(model):
    
    df_sub = test_raw["id"].to_frame()
    
    ## processing
    df_test = pd.get_dummies(test_raw, columns = ["Sex"])
    df_test.drop(["id"], axis=1, inplace=True)
    
    ## gen file 
    pred = base_lr.predict(df_test)
    df_sub["Age"] = pred
    df_sub.to_csv("submission_crab.csv", index =False)

In [47]:
gen_submission(base_lr)