In [35]:
import pandas as pd
df=pd.read_csv('abalone.csv')
df.shape

(4177, 9)

In [2]:
df.dtypes

Sex                object
Length            float64
Diameter          float64
Height            float64
Whole weight      float64
Shucked weight    float64
Viscera weight    float64
Shell weight      float64
Rings               int64
dtype: object

In [3]:
a=df.corr()
a['Rings'].abs().sort_values(ascending=False)

Rings             1.000000
Shell weight      0.627574
Diameter          0.574660
Height            0.557467
Length            0.556720
Whole weight      0.540390
Viscera weight    0.503819
Shucked weight    0.420884
Name: Rings, dtype: float64

In [4]:
df['Sex'].value_counts()

M    1528
I    1342
F    1307
Name: Sex, dtype: int64

In [5]:
d={'M':1,'F':2,'I':3}
df['Sex']=df['Sex'].map(d)
df.head(5)

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [6]:
x=df.iloc[:,:-1]
y=df['Rings']

In [7]:
from sklearn.model_selection import train_test_split
xtr,xte,ytr,yte=train_test_split(x,y,test_size=0.2,random_state=42)

In [8]:
a=xtr['Sex'].values
b=xte['Sex'].values
xtr.drop(columns=['Sex'],axis=1,inplace=True)
xte.drop(columns=['Sex'],axis=1,inplace=True)

In [9]:
from sklearn.preprocessing import StandardScaler
sd=StandardScaler()
sd.fit(xtr)
xtr=sd.transform(xtr)
xtr=pd.DataFrame(xtr,columns=['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight'])
xtr['Sex']=a
xtr.head(5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex
0,0.210175,0.369725,-0.34795,-0.323164,-0.325407,-0.40513,-0.212132,3
1,-0.419444,-0.542386,-0.936082,-0.665985,-0.480126,-0.820932,-0.712419,3
2,-1.846581,-1.859881,-1.64184,-1.38825,-1.354628,-1.346395,-1.391379,2
3,-2.098428,-2.113245,-1.759467,-1.453355,-1.363597,-1.533734,-1.462848,3
4,-0.251546,-0.289022,-0.112697,-0.409632,-0.437522,-0.350299,-0.176398,1


In [10]:
xte=sd.transform(xte)
xte=pd.DataFrame(xte,columns=['Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight'])
xte['Sex']=b
xte.head(5)

Unnamed: 0,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Sex
0,0.671895,0.471071,0.475434,0.554743,0.271048,1.102722,0.609767,1
1,0.545971,0.319052,0.240182,0.084763,0.118571,0.312242,0.038011,1
2,0.294124,0.369725,1.298819,0.305511,-0.249168,0.399058,0.681236,2
3,0.923743,0.825781,0.710687,0.876201,0.797991,0.782875,1.002849,2
4,-0.419444,-0.238349,0.122555,-0.434047,-0.563092,-0.665577,-0.176398,1


In [11]:
xtr1=xtr.copy()
xte1=xte.copy()

In [13]:
xtr1.shape

(3341, 8)

# Assumptions of multicollinearity

In [14]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
def calc_vif(X):

    # Calculating VIF
    vif = pd.DataFrame()
    vif["variables"] = X.columns
    vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

    return(vif)

print(calc_vif(xtr))

        variables         VIF
0          Length   42.485494
1        Diameter   43.291185
2          Height    3.261560
3    Whole weight  125.250920
4  Shucked weight   32.177344
5  Viscera weight   17.897838
6    Shell weight   23.306422
7             Sex    1.034061


In [15]:
xtr1.drop(['Whole weight','Diameter', 'Viscera weight'],axis=1,inplace=True)
xte1.drop(['Whole weight','Diameter', 'Viscera weight'],axis=1,inplace=True)

In [17]:
print(calc_vif(xtr1))

        variables       VIF
0          Length  7.680005
1          Height  3.202338
2  Shucked weight  6.281809
3    Shell weight  6.459267
4             Sex  1.032646


# Linear Regression

In [18]:
from sklearn.linear_model import LinearRegression
l=LinearRegression()
l.fit(xtr1,ytr)
ytepred=l.predict(xte1)
ytrpred=l.predict(xtr1)

In [19]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,ytrpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,ytepred))

Mean Squared Error of Train Data is: 4.992172351283302
Mean Squared Error of Test Data is: 5.2684000709757015


In [20]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,ytrpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,ytepred))

Mean Absolute Error of Train Data is: 1.6034938133050303
Mean Absolute Error of Test Data is: 1.6586642121905544


# Ridge

In [21]:
# Ridge Regression

from numpy import arange
from pandas import read_csv
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Ridge
# load the dataset
# define model
model = Ridge()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.001,0.01,0.1,1,10]
# define search
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xtr1, ytr)
# summarize
print('R Squared Error: %.3f' % results.best_score_)
print('Best Param: %s' % results.best_params_)

R Squared Error: 0.495
Best Param: {'alpha': 1}


In [23]:
rd=results.best_estimator_
rd.fit(xtr1,ytr)
ytepred=rd.predict(xte1)
ytrpred=rd.predict(xtr1)
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,ytrpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,ytepred))

Mean Squared Error of Train Data is: 4.992185089149074
Mean Squared Error of Test Data is: 5.267749957810223


In [36]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,ytrpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,ytepred))

Mean Absolute Error of Train Data is: 1.603705217223507
Mean Absolute Error of Test Data is: 1.6547683033862501


# Lasso

In [26]:
# define model
from sklearn.linear_model import Lasso
model = Lasso()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.001,0.01,0.1,1,10]
# define search
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xtr1, ytr)
# summarize
print('Best Params : %s' % results.best_params_)

Best Params : {'alpha': 0.001}


In [31]:
ls=results.best_estimator_
ls.fit(xtr1,ytr)
ytepred=ls.predict(xte1)
ytrpred=ls.predict(xtr1)
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,ytrpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,ytepred))

Mean Squared Error of Train Data is: 4.995372797352421
Mean Squared Error of Test Data is: 5.263093735205334


In [32]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,ytrpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,ytepred))

Mean Absolute Error of Train Data is: 1.603705217223507
Mean Absolute Error of Test Data is: 1.6547683033862501


# Elastic Net

In [39]:
# define model
from sklearn.linear_model import ElasticNet
model = ElasticNet()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define grid
grid = dict()
grid['alpha'] = [0.01,0.1,1,10]
grid['max_iter']=[100,250,500]
grid['l1_ratio']=arange(0, 1, 0.1)
search = GridSearchCV(model, grid, scoring='r2', cv=cv, n_jobs=-1)
# perform the search
results = search.fit(xtr1, ytr)
# summarize
print('Best Params : %s' % results.best_params_)

Best Params : {'alpha': 0.01, 'l1_ratio': 0.9, 'max_iter': 100}


In [41]:
el=results.best_estimator_
el.fit(xtr1,ytr)
ytrpred=el.predict(xtr1)
ytepred=el.predict(xte1)
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,ytrpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,ytepred))

Mean Squared Error of Train Data is: 4.995372797352421
Mean Squared Error of Test Data is: 5.263093735205334


In [42]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,ytrpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,ytepred))

Mean Absolute Error of Train Data is: 1.603705217223507
Mean Absolute Error of Test Data is: 1.6547683033862501


# AdaBoost Regressor

In [51]:
from sklearn.model_selection import RepeatedKFold,KFold,cross_val_score
from sklearn.ensemble import AdaBoostRegressor
ada=AdaBoostRegressor()
# evaluate the model
grid = dict()
grid['n_estimators'] = [100, 250, 500]
grid['learning_rate'] = [0.0001, 0.001]
# define the grid search procedure
grid_search = GridSearchCV(estimator=ada, param_grid=grid, n_jobs=-1, cv=10, scoring='r2',verbose=2)
# execute the grid search
grid_result = grid_search.fit(xtr, ytr)
# summarize the best score and configuration
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best: 0.438781 using {'learning_rate': 0.001, 'n_estimators': 500}


In [52]:
ada=grid_result.best_estimator_
ada.fit(xtr,ytr)
trpred=ada.predict(xtr)
tepred=ada.predict(xte)
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,trpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,tepred))

Mean Squared Error of Train Data is: 5.5135437953461945
Mean Squared Error of Test Data is: 6.093752206186159


In [48]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,trpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,tepred))

Mean Absolute Error of Train Data is: 1.603705217223507
Mean Absolute Error of Test Data is: 1.6547683033862501


# Random Forest Regressor

In [62]:
from sklearn.ensemble import RandomForestRegressor
param_grid = {
    'max_depth': [3,5],
    'n_estimators': [100,250,500]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf,scoring="r2", param_grid = param_grid, 
                          cv = 10 , n_jobs = -1, verbose = 2)

# Fit the grid search to the data
grid_search.fit(xtr,ytr)
print('Best Params : %s' % grid_search.best_params_)


Fitting 10 folds for each of 6 candidates, totalling 60 fits
Best Params : {'max_depth': 5, 'n_estimators': 250}


In [63]:
rf=grid_result.best_estimator_
rf.fit(xtr,ytr)
trpred=rf.predict(xtr)
tepred=rf.predict(xte)
print("Mean Squared Error of Train Data is:",mean_squared_error(ytr,trpred))
print("Mean Squared Error of Test Data is:",mean_squared_error(yte,tepred))

Mean Squared Error of Train Data is: 5.52203411072555
Mean Squared Error of Test Data is: 6.113652874671505


In [64]:
print("Mean Absolute Error of Train Data is:",mean_absolute_error(ytr,trpred))
print("Mean Absolute Error of Test Data is:",mean_absolute_error(yte,tepred))

Mean Absolute Error of Train Data is: 1.6981196056501926
Mean Absolute Error of Test Data is: 1.7637788308114235


In [60]:
!pip install prettytable

Collecting prettytable
  Downloading prettytable-3.5.0-py3-none-any.whl (26 kB)
Installing collected packages: prettytable
Successfully installed prettytable-3.5.0


# Observations

## Mean Squared Error

In [61]:
from prettytable import PrettyTable
tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters","Mean Squared Error of all Data"])
# Add rows
tab.add_row(["1.", "Linear Regression", "-","1.Train Data :  4.9922 \n 2.Test Data : 5.2684\n"])
tab.add_row(["2.", "Ridge Regression", "{'alpha' : 1}","1.Train Data :  4.9922 \n 2.Test Data : 5.26775\n"])
tab.add_row(["3.", "Lasso Regression", "{'alpha' : 0.001}","1.Train Data :  4.9954 \n 2.Test Data : 5.2631"])
tab.add_row(["4.", "ElasticNet Regression", "{'alpha' : 0.01,'l1_ratio': 0.9, 'max_iter': 100}","1.Train Data :  4.9954 \n 2.Test Data : 5.2631\n"])
tab.add_row(["5.","Adaboost Regressor","{'learning_rate': 0.001, 'n_estimators': 500}","1.Train Data :  5.5135 \n 2.Test Data : 6.0938\n"])
tab.add_row(["6.","Random Forest Regressor","{'max_depth': 5, 'n_estimators': 100}","1.Train Data :  5.5293 \n 2.Test Data : 6.1128\n"])
print(tab)

+-------+-------------------------+---------------------------------------------------+--------------------------------+
| Sr No |          Model          |                Best Hyperparameters               | Mean Squared Error of all Data |
+-------+-------------------------+---------------------------------------------------+--------------------------------+
|   1.  |    Linear Regression    |                         -                         |    1.Train Data :  4.9922      |
|       |                         |                                                   |      2.Test Data : 5.2684      |
|       |                         |                                                   |                                |
|   2.  |     Ridge Regression    |                   {'alpha' : 1}                   |    1.Train Data :  4.9922      |
|       |                         |                                                   |      2.Test Data : 5.26775     |
|       |                       

## Mean Absolute Error

In [65]:
from prettytable import PrettyTable
tab = PrettyTable(["Sr No", "Model", "Best Hyperparameters","Mean Squared Error of all Data"])
# Add rows
tab.add_row(["1.", "Linear Regression", "-","1.Train Data :  1.6035 \n 2.Test Data : 1.6587\n"])
tab.add_row(["2.", "Ridge Regression", "{'alpha' : 1}","1.Train Data :  1.6037 \n 2.Test Data : 1.6548\n"])
tab.add_row(["3.", "Lasso Regression", "{'alpha' : 0.001}","1.Train Data :  1.6037 \n 2.Test Data : 1.6547\n"])
tab.add_row(["4.", "ElasticNet Regression", "{'alpha' : 0.01,'l1_ratio': 0.9, 'max_iter': 100}","1.Train Data :  1.6037 \n 2.Test Data : 1.6548\n"])
tab.add_row(["5.","Adaboost Regressor","{'learning_rate': 0.001, 'n_estimators': 500}","1.Train Data :  1.6037 \n 2.Test Data : 1.6547\n"])
tab.add_row(["6.","Random Forest Regressor","{'max_depth': 5, 'n_estimators': 100}","1.Train Data :  1.6981 \n 2.Test Data : 1.7638\n"])
print(tab)

+-------+-------------------------+---------------------------------------------------+--------------------------------+
| Sr No |          Model          |                Best Hyperparameters               | Mean Squared Error of all Data |
+-------+-------------------------+---------------------------------------------------+--------------------------------+
|   1.  |    Linear Regression    |                         -                         |    1.Train Data :  1.6035      |
|       |                         |                                                   |      2.Test Data : 1.6587      |
|       |                         |                                                   |                                |
|   2.  |     Ridge Regression    |                   {'alpha' : 1}                   |    1.Train Data :  1.6037      |
|       |                         |                                                   |      2.Test Data : 1.6548      |
|       |                       

In [None]:
Mean Absolute Error of Train Data is: 1.603705217223507
Mean Absolute Error of Test Data is: 1.6547683033862501