# Clean and prepare the dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Load the fish-market table; the path is relative to the kernel's working directory.
df = pd.read_csv(filepath_or_buffer="FishMarket.csv")

In [5]:
# Preview the first five rows (5 is also the pandas default).
df.head(n=5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [6]:
# Preview the last five rows (5 is also the pandas default).
df.tail(n=5)

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.43,1.269
156,Smelt,12.2,12.1,13.0,13.8,2.277,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672
158,Smelt,19.9,13.8,15.0,16.2,2.9322,1.8792


In [7]:
# Display the row index of the loaded frame.
df.index

RangeIndex(start=0, stop=159, step=1)

In [8]:
# Display the column labels of the loaded frame.
df.columns

Index(['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height',
       'Width'],
      dtype='object')

In [9]:
# Print per-column non-null counts and dtypes to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [10]:
# Summary statistics for the numeric columns.
# NOTE(review): the recorded output shows a minimum Weight of 0.0, which looks
# like a bad measurement — confirm whether that row should be cleaned or dropped.
df.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [12]:
# Show the distinct values of the Species column before it gets encoded.
species_labels = df['Species'].unique()
print(species_labels)

['Bream' 'Roach' 'Whitefish' 'Parkki' 'Perch' 'Pike' 'Smelt']


In [13]:
from sklearn.preprocessing import LabelEncoder
# Encoder used in the next cell to turn the Species strings into integer codes.
le = LabelEncoder()

In [14]:
# Overwrite the Species strings with the integer codes produced by the encoder.
# The column is replaced in place, so previews displayed by earlier cells still
# show the original species names.
# NOTE(review): integer codes impose an arbitrary order on a nominal category;
# for the linear models below, one-hot encoding may be more appropriate — confirm.
df['Species'] = le.fit(df['Species']).transform(df['Species'])

# Split the dataset in the ratio 75:25 for train-test split

In [15]:
# Predictor columns: encoded species, weight, the three length measurements and height.
feature_columns = ['Species', 'Weight', 'Length1', 'Length2', 'Length3', 'Height']
X = df[feature_columns]

# Regression target: the Width column.
y = df['Width']


In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [18]:
# Check train and test data shapes, in the order X_train, X_test, y_train, y_test.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(119, 6)
(40, 6)
(119,)
(40,)


# Build linear regression model, polynomial model, lasso model and ridge model

# linear regression model

In [19]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline; the intercept setting is spelled out at its default.
lr = LinearRegression(fit_intercept=True)

In [20]:
# Fit the baseline on the training split; fit() returns the estimator, which the cell displays.
lr.fit(X=X_train, y=y_train)

LinearRegression()

In [21]:
# Predict Width for the held-out rows with the linear baseline.
lr_pred = lr.predict(X=X_test)

# polynomial model

In [22]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the predictors into polynomial and interaction terms up to degree 2.
poly_reg = PolynomialFeatures(degree=2)

# Fit the transformer on the training split only, then expand the training rows.
X_poly_train = poly_reg.fit_transform(X_train)

# Reuse the already-fitted transformer for the test split. Calling
# fit_transform here would re-fit on test data, which is the wrong pattern
# for any preprocessing step even when the output happens to be the same.
X_poly_test = poly_reg.transform(X_test)

In [23]:
# Linear regression on the polynomial-expanded training features.
poly = LinearRegression().fit(X_poly_train, y_train)
# Display the fitted estimator as the cell output.
poly

LinearRegression()

In [24]:
# Predict Width for the held-out rows from their polynomial-expanded features.
poly_pred = poly.predict(X=X_poly_test)

# lasso model

In [25]:
from sklearn.linear_model import Lasso

# L1-regularised regression; alpha is spelled out at its default so the
# regularisation strength is visible to the reader.
l = Lasso(alpha=1.0)

In [26]:
# Fit the lasso model on the training split; the returned estimator is displayed.
l.fit(X=X_train, y=y_train)

Lasso()

In [27]:
# Predict Width for the held-out rows with the lasso model.
l_pred = l.predict(X=X_test)

# ridge model

In [28]:
from sklearn.linear_model import Ridge

# L2-regularised regression; alpha is spelled out at its default so the
# regularisation strength is visible to the reader.
r = Ridge(alpha=1.0)

In [29]:
# Fit the ridge model on the training split; the returned estimator is displayed.
r.fit(X=X_train, y=y_train)

Ridge()

In [30]:
# Predict Width for the held-out rows with the ridge model.
r_pred = r.predict(X=X_test)

# Print the MSE, RMSE and R-squared for each model

In [31]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# linear regression model

In [32]:
# Score the linear baseline on the held-out split.
lr_mse = mean_squared_error(y_test, lr_pred)  # computed once, reused for the RMSE
print('Mean Squared Error:', lr_mse)
print('Root Mean Squared Error:', np.sqrt(lr_mse))
print('R2:', r2_score(y_test, lr_pred))

Mean Squared Error: 0.3008567818380089
Root Mean Squared Error: 0.5485041311038678
R2: 0.8788997791259767


# polynomial model

In [37]:
# Score the degree-2 polynomial model on the held-out split.
poly_mse = mean_squared_error(y_test, poly_pred)  # computed once, reused for the RMSE
print('Mean Squared Error:', poly_mse)
print('Root Mean Squared Error:', np.sqrt(poly_mse))
print('R2:', r2_score(y_test, poly_pred))

Mean Squared Error: 0.09617830704236158
Root Mean Squared Error: 0.3101262759624885
R2: 0.9612865159463454


# lasso model

In [36]:
# Score the lasso model on the held-out split.
# NOTE(review): the recorded R2 is well below the other models; the features
# are unscaled and alpha is left at its default — consider scaling/tuning.
lasso_mse = mean_squared_error(y_test, l_pred)  # computed once, reused for the RMSE
print('Mean Squared Error:', lasso_mse)
print('Root Mean Squared Error:', np.sqrt(lasso_mse))
print('R2:', r2_score(y_test, l_pred))

Mean Squared Error: 0.9728879210041039
Root Mean Squared Error: 0.9863508103124892
R2: 0.6083952590349028


# ridge model

In [35]:
# Score the ridge model on the held-out split.
ridge_mse = mean_squared_error(y_test, r_pred)  # computed once, reused for the RMSE
print('Mean Squared Error:', ridge_mse)
print('Root Mean Squared Error:', np.sqrt(ridge_mse))
print('R2:', r2_score(y_test, r_pred))

Mean Squared Error: 0.29424560161308033
Root Mean Squared Error: 0.5424440999891881
R2: 0.881560897085777
