# Feature-predicting study on fishes

In [11]:
#Importing librairies
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
import matplotlib.pyplot as plt

In [12]:
#Importing the dataset
df = pd.read_csv('Fish.csv')

In [13]:
#Checking the value types of the columns in the the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


In [14]:
#Exploring the statistical information of each column
df.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,159.0,159.0,159.0,159.0,159.0,159.0
mean,398.326415,26.24717,28.415723,31.227044,8.970994,4.417486
std,357.978317,9.996441,10.716328,11.610246,4.286208,1.685804
min,0.0,7.5,8.4,8.8,1.7284,1.0476
25%,120.0,19.05,21.0,23.15,5.9448,3.38565
50%,273.0,25.2,27.3,29.4,7.786,4.2485
75%,650.0,32.7,35.5,39.65,12.3659,5.5845
max,1650.0,59.0,63.4,68.0,18.957,8.142


In [9]:
#Checking how the dataset is presented
df.head()

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.52,4.02
1,Bream,290.0,24.0,26.3,31.2,12.48,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.73,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.444,5.134


In [16]:
#Checking the different species of fish
df['Species'].unique()

array(['Bream', 'Roach', 'Whitefish', 'Parkki', 'Perch', 'Pike', 'Smelt'],
      dtype=object)

In [17]:
#Finding the Pearson correlation coefficient between the height values of Perch and and their weight
perch = df['Species']=='Perch'
dfperch = df[perch]

dfperch.corr(method='pearson')

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
Weight,1.0,0.958361,0.958656,0.959506,0.968441,0.963943
Length1,0.958361,1.0,0.999713,0.999427,0.98542,0.974447
Length2,0.958656,0.999713,1.0,0.999779,0.985584,0.974617
Length3,0.959506,0.999427,0.999779,1.0,0.985909,0.975131
Height,0.968441,0.98542,0.985584,0.985909,1.0,0.982943
Width,0.963943,0.974447,0.974617,0.975131,0.982943,1.0


In [20]:
#Computing linear regression between a bream's length, height and width and its weight.
bream = df['Species']=='Bream'
dfbream = df[bream]

X_Length=np.array(dfbream["Length1"]).reshape(-1,1)
X_Height=np.array(dfbream["Height"]).reshape(-1,1)
X_Width=np.array(dfbream["Width"]).reshape(-1,1)

new_X=np.concatenate((X_Length, X_Height, X_Width), axis = 1)

new_y=np.array(dfbream["Weight"]).reshape(-1,1)

In [21]:
#Finding the exact coefficients and intercept values
reg_fish_new = LinearRegression().fit(new_X, new_y)
print(f'f(Length1,Height,Width)= {round(reg_fish_new.coef_[0,0],2)} * Length1 + {round(reg_fish_new.coef_[0,1],2)} * Height + {round(reg_fish_new.coef_[0,2],2)} * Width + {reg_fish_new.intercept_[0]}')

f(Length1,Height,Width)= 12.81 * Length1 + 63.13 * Height + 51.63 * Width + -1009.0246262704118


In [19]:
dfbream.describe()

Unnamed: 0,Weight,Length1,Length2,Length3,Height,Width
count,35.0,35.0,35.0,35.0,35.0,35.0
mean,617.828571,30.305714,33.108571,38.354286,15.183211,5.427614
std,209.205709,3.593699,3.911925,4.157866,1.964707,0.721509
min,242.0,23.2,25.4,30.0,11.52,4.02
25%,462.5,28.0,30.35,35.65,13.9589,4.9434
50%,610.0,30.4,33.0,38.5,14.9544,5.2801
75%,717.0,31.95,35.0,40.75,16.3609,6.0716
max,1000.0,38.0,41.0,46.5,18.957,6.7497


In [22]:
#Find the weight of a 25-length1, 4.5-wide and 12.5-high fish
fish=reg_fish_new.predict(np.array([25,4.5,12.5]).reshape(1,-1))
round(fish[0][0],2)

240.66

In [47]:
#Computing a linear regression between a perch's length anb height with its weight.

perch = df['Species']=='Perch'
dfperch3 = df[perch]

X_L=np.array(dfperch3["Length3"]).reshape(-1,1)
X_H=np.array(dfperch3["Height"]).reshape(-1,1)

X=np.concatenate((X_L, X_H), axis = 1)

y=np.array(dfperch3["Weight"]).reshape(-1,1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=15)

regperch = LinearRegression().fit(X_train, y_train)

In [45]:
#Predicting values for the testing set and computing the coefficient of determination
reg_perch_pred = regperch.predict(X_test)

print('Coefficient of determination: %.2f'
      % round(r2_score(y_test, reg_perch_pred),2))

Coefficient of determination: 0.93
