### Using pandas to process the .csv file into a readable DataFrame

In [1]:
import pandas as pd
# Load the fish measurements from the CSV file into a DataFrame; the bare
# expression on the last line renders the frame as the cell's output.
df = pd.read_csv(filepath_or_buffer="Fish.csv")
df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,Bream,242.0,23.2,25.4,30.0,11.5200,4.0200
1,Bream,290.0,24.0,26.3,31.2,12.4800,4.3056
2,Bream,340.0,23.9,26.5,31.1,12.3778,4.6961
3,Bream,363.0,26.3,29.0,33.5,12.7300,4.4555
4,Bream,430.0,26.5,29.0,34.0,12.4440,5.1340
...,...,...,...,...,...,...,...
154,Smelt,12.2,11.5,12.2,13.4,2.0904,1.3936
155,Smelt,13.4,11.7,12.4,13.5,2.4300,1.2690
156,Smelt,12.2,12.1,13.0,13.8,2.2770,1.2558
157,Smelt,19.7,13.2,14.3,15.2,2.8728,2.0672


### Use label encoding to convert the text data in the "Species" column into numerical data
#### Label encoding maps each unique value to a corresponding integer code, starting from 0
##### -> e.g. 0 for Bream, 5 for Smelt

In [2]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the "Species" column: each distinct species name is replaced
# by an integer code so the column can be used as a numeric feature.
encoder = LabelEncoder()
species = df["Species"]
# The assignment below overwrites the column in place, so no separate drop
# step is needed (a bare df.drop(...) returns a new frame, which would be
# discarded without affecting df).
df["Species"] = encoder.fit_transform(species)
print(df)

     Species  Weight  Length1  Length2  Length3   Height   Width
0          0   242.0     23.2     25.4     30.0  11.5200  4.0200
1          0   290.0     24.0     26.3     31.2  12.4800  4.3056
2          0   340.0     23.9     26.5     31.1  12.3778  4.6961
3          0   363.0     26.3     29.0     33.5  12.7300  4.4555
4          0   430.0     26.5     29.0     34.0  12.4440  5.1340
..       ...     ...      ...      ...      ...      ...     ...
154        5    12.2     11.5     12.2     13.4   2.0904  1.3936
155        5    13.4     11.7     12.4     13.5   2.4300  1.2690
156        5    12.2     12.1     13.0     13.8   2.2770  1.2558
157        5    19.7     13.2     14.3     15.2   2.8728  2.0672
158        5    19.9     13.8     15.0     16.2   2.9322  1.8792

[159 rows x 7 columns]


### MinMaxScaler helps to transform data (normalize) into a given range

In [3]:
#MinMaxScaler helps process the data such that all values n are 0 <= n <= 1 for processing
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler rescales every column independently to the [0, 1] range, so a
# single fit_transform over the whole frame gives the same values as fitting
# a fresh scaler on each column in a loop.
# NOTE(review): the scaler is fit on all rows, before the train/test split
# performed further down, and the "Weight" target is scaled along with the
# features -- confirm this is intended.
df = pd.DataFrame(
    MinMaxScaler().fit_transform(df),
    columns=df.columns,
    index=df.index,
)

df

Unnamed: 0,Species,Weight,Length1,Length2,Length3,Height,Width
0,0.000000,0.146667,0.304854,0.309091,0.358108,0.568334,0.418978
1,0.000000,0.175758,0.320388,0.325455,0.378378,0.624055,0.459235
2,0.000000,0.206061,0.318447,0.329091,0.376689,0.618123,0.514279
3,0.000000,0.220000,0.365049,0.374545,0.417230,0.638566,0.480365
4,0.000000,0.260606,0.368932,0.374545,0.425676,0.621966,0.576004
...,...,...,...,...,...,...,...
154,0.833333,0.007394,0.077670,0.069091,0.077703,0.021012,0.048771
155,0.833333,0.008121,0.081553,0.072727,0.079392,0.040723,0.031208
156,0.833333,0.007394,0.089320,0.083636,0.084459,0.031842,0.029347
157,0.833333,0.011939,0.110680,0.107273,0.108108,0.066424,0.143719


### Train_test_split is used to split the data into a training set and testing set to check for accuracy post-training

In [4]:
from sklearn.model_selection import train_test_split

# The "Weight" column is the regression target; every remaining column is a feature.
x = df.drop(columns="Weight")
y = df["Weight"]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Linear Regression model from scikit-learn

In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
# Create the linear regression estimator and fit it on the training split.
lr_model = LinearRegression()
lr_model.fit(X=x_train, y=y_train)

### Use the testing data to measure how well the Linear Regression model fits. The closer the r2_score is to 1, the better the predictions match the actual values

In [10]:
from sklearn.metrics import r2_score

# Predict on the held-out rows, then score the predictions against the true
# target values; the score is the cell's displayed output.
y_results = lr_model.predict(x_test)
r2_score(y_true=y_test, y_pred=y_results)

0.9034878699241475