# 1. Import Libraries

In [100]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns

# 2. Import Dataset

In [101]:
fish_dataset = pd.read_csv('Fish Market.csv')
x = fish_dataset.iloc[:, 1:].values
y = fish_dataset.iloc[:, 0].values

In [102]:
print(fish_dataset)

    Species  Weight  Length1  Length2  Length3   Height   Width
0     Bream   242.0     23.2     25.4     30.0  11.5200  4.0200
1     Bream   290.0     24.0     26.3     31.2  12.4800  4.3056
2     Bream   340.0     23.9     26.5     31.1  12.3778  4.6961
3     Bream   363.0     26.3     29.0     33.5  12.7300  4.4555
4     Bream   430.0     26.5     29.0     34.0  12.4440  5.1340
..      ...     ...      ...      ...      ...      ...     ...
154   Smelt    12.2     11.5     12.2     13.4   2.0904  1.3936
155   Smelt    13.4     11.7     12.4     13.5   2.4300  1.2690
156   Smelt    12.2     12.1     13.0     13.8   2.2770  1.2558
157   Smelt    19.7     13.2     14.3     15.2   2.8728  2.0672
158   Smelt    19.9     13.8     15.0     16.2   2.9322  1.8792

[159 rows x 7 columns]


In [103]:
print(x)

[[2.42000e+02 2.32000e+01 2.54000e+01 3.00000e+01 1.15200e+01 4.02000e+00]
 [2.90000e+02 2.40000e+01 2.63000e+01 3.12000e+01 1.24800e+01 4.30560e+00]
 [3.40000e+02 2.39000e+01 2.65000e+01 3.11000e+01 1.23778e+01 4.69610e+00]
 [3.63000e+02 2.63000e+01 2.90000e+01 3.35000e+01 1.27300e+01 4.45550e+00]
 [4.30000e+02 2.65000e+01 2.90000e+01 3.40000e+01 1.24440e+01 5.13400e+00]
 [4.50000e+02 2.68000e+01 2.97000e+01 3.47000e+01 1.36024e+01 4.92740e+00]
 [5.00000e+02 2.68000e+01 2.97000e+01 3.45000e+01 1.41795e+01 5.27850e+00]
 [3.90000e+02 2.76000e+01 3.00000e+01 3.50000e+01 1.26700e+01 4.69000e+00]
 [4.50000e+02 2.76000e+01 3.00000e+01 3.51000e+01 1.40049e+01 4.84380e+00]
 [5.00000e+02 2.85000e+01 3.07000e+01 3.62000e+01 1.42266e+01 4.95940e+00]
 [4.75000e+02 2.84000e+01 3.10000e+01 3.62000e+01 1.42628e+01 5.10420e+00]
 [5.00000e+02 2.87000e+01 3.10000e+01 3.62000e+01 1.43714e+01 4.81460e+00]
 [5.00000e+02 2.91000e+01 3.15000e+01 3.64000e+01 1.37592e+01 4.36800e+00]
 [3.40000e+02 2.95000e+01

# 2. Data Preprocessing 

In [104]:
#check for missing values
fish_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  159 non-null    object 
 1   Weight   159 non-null    float64
 2   Length1  159 non-null    float64
 3   Length2  159 non-null    float64
 4   Length3  159 non-null    float64
 5   Height   159 non-null    float64
 6   Width    159 non-null    float64
dtypes: float64(6), object(1)
memory usage: 8.8+ KB


# No Encodeing is Required as this stage, so we can skip

# Split Dataset into Train and Test set

In [105]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

In [106]:
print(x_train)

[[8.40000e+02 3.25000e+01 3.50000e+01 3.73000e+01 1.14884e+01 7.79570e+00]
 [3.00000e+02 3.48000e+01 3.73000e+01 3.98000e+01 6.28840e+00 4.01980e+00]
 [5.40000e+02 2.85000e+01 3.10000e+01 3.40000e+01 1.07440e+01 6.56200e+00]
 [3.00000e+02 2.52000e+01 2.73000e+01 2.87000e+01 8.32300e+00 5.13730e+00]
 [1.60000e+02 2.11000e+01 2.25000e+01 2.50000e+01 6.40000e+00 3.80000e+00]
 [2.00000e+02 2.12000e+01 2.30000e+01 2.58000e+01 1.03458e+01 3.66360e+00]
 [9.75000e+02 3.74000e+01 4.10000e+01 4.59000e+01 1.86354e+01 6.74730e+00]
 [2.70000e+02 2.41000e+01 2.65000e+01 2.93000e+01 8.14540e+00 4.24850e+00]
 [4.30000e+02 3.55000e+01 3.80000e+01 4.05000e+01 7.29000e+00 4.57650e+00]
 [5.56000e+02 3.20000e+01 3.45000e+01 3.65000e+01 1.02565e+01 6.38750e+00]
 [2.50000e+02 2.59000e+01 2.80000e+01 2.94000e+01 7.82040e+00 4.20420e+00]
 [1.22000e+01 1.15000e+01 1.22000e+01 1.34000e+01 2.09040e+00 1.39360e+00]
 [5.00000e+02 2.91000e+01 3.15000e+01 3.64000e+01 1.37592e+01 4.36800e+00]
 [7.00000e+02 3.45000e+01

# Feature Scaling
###We perform feature scaling after splitting the dataset to avoid any data leakage. 

In [107]:
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.fit_transform(x_test)


In [108]:
print(x_train)

[[ 1.21863767  0.5756523   0.56626705  0.47812066  0.56968932  1.93775927]
 [-0.31016001  0.80275661  0.77819095  0.69168749 -0.66001887 -0.26110353]
 [ 0.36930562  0.18068827  0.19770374  0.19621243  0.39365186  1.21932472]
 [-0.31016001 -0.14515706 -0.14321732 -0.25654927 -0.17887189  0.38966296]
 [-0.70651497 -0.54999518 -0.58549328 -0.57262819 -0.63362744 -0.38910217]
 [-0.59327069 -0.54012108 -0.53942287 -0.5042868   0.29948459 -0.46853353]
 [ 1.60083709  1.05948323  1.119112    1.21279058  2.25982862  1.32723257]
 [-0.39509322 -0.25377216 -0.21692998 -0.20529323 -0.22087116 -0.12792206]
 [ 0.05788387  0.87187532  0.84268952  0.75148621 -0.42315815  0.06308591]
 [ 0.41460333  0.52628179  0.52019663  0.40977927  0.27836672  1.11770615]
 [-0.45171535 -0.07603835 -0.07871874 -0.19675055 -0.29772792 -0.15371978]
 [-1.12495256 -1.49790885 -1.53454379 -1.56357831 -1.65277174 -1.79044845]
 [ 0.25606135  0.23993287  0.24377415  0.40123659  1.10669342 -0.05833226]
 [ 0.82228272  0.77313431

In [109]:
print(x_test)

[[ 1.75822796e+00  9.81447016e-01  1.05510028e+00  1.22475602e+00
   2.35813729e+00  1.58117331e+00]
 [-6.36012680e-01 -5.67989281e-01 -5.79921948e-01 -5.20607885e-01
  -5.49274114e-01 -6.10853476e-01]
 [ 6.69936758e-01  5.30901710e-01  5.44155836e-01  7.26080621e-01
   1.45198229e+00  6.32335166e-01]
 [-6.08805400e-01 -4.47111272e-01 -4.26638614e-01 -4.92903696e-01
  -6.07818980e-01 -3.92425927e-01]
 [-7.31238160e-01 -6.99856200e-01 -6.82110837e-01 -7.69945587e-01
  -7.83046073e-01 -9.56879284e-01]
 [ 8.05973157e-01  7.06724269e-01  6.97439171e-01  8.64601566e-01
   1.34317849e+00  1.00121736e+00]
 [-5.67994480e-01 -4.47111272e-01 -4.77733058e-01 -4.65199507e-01
  -4.31708959e-01 -4.42908727e-01]
 [ 3.97863958e-01  4.53979341e-01  4.41966947e-01  6.33733324e-01
   1.31999599e+00  3.96351263e-01]
 [-9.62500040e-01 -6.11944921e-01 -6.31016393e-01 -6.03720453e-01
  -4.67637305e-01 -5.72892000e-01]
 [-5.67994480e-01 -2.82277623e-01 -2.73355279e-01 -3.54382751e-01
  -4.90321743e-01 -2.5926

# Train Logistic Regression Model

In [110]:
classifier = LogisticRegression(random_state= 0)
classifier.fit(x_train, y_train)

# Predicting the Test set results

In [111]:
classifier.predict(x_test)

array(['Bream', 'Perch', 'Bream', 'Perch', 'Perch', 'Bream', 'Perch',
       'Bream', 'Roach', 'Perch', 'Perch', 'Perch', 'Smelt', 'Perch',
       'Perch', 'Perch', 'Parkki', 'Perch', 'Perch', 'Perch', 'Smelt',
       'Perch', 'Perch', 'Bream', 'Perch', 'Bream', 'Bream', 'Smelt',
       'Pike', 'Perch', 'Perch', 'Perch'], dtype=object)

In [112]:
y_pred = classifier.predict(x_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[['Bream' 'Bream']
 ['Perch' 'Roach']
 ['Bream' 'Bream']
 ['Perch' 'Perch']
 ['Perch' 'Perch']
 ['Bream' 'Bream']
 ['Perch' 'Roach']
 ['Bream' 'Bream']
 ['Roach' 'Roach']
 ['Perch' 'Perch']
 ['Perch' 'Perch']
 ['Perch' 'Perch']
 ['Smelt' 'Perch']
 ['Perch' 'Perch']
 ['Perch' 'Whitefish']
 ['Perch' 'Perch']
 ['Parkki' 'Parkki']
 ['Perch' 'Roach']
 ['Perch' 'Perch']
 ['Perch' 'Perch']
 ['Smelt' 'Smelt']
 ['Perch' 'Roach']
 ['Perch' 'Roach']
 ['Bream' 'Bream']
 ['Perch' 'Perch']
 ['Bream' 'Bream']
 ['Bream' 'Bream']
 ['Smelt' 'Smelt']
 ['Pike' 'Pike']
 ['Perch' 'Perch']
 ['Perch' 'Perch']
 ['Perch' 'Perch']]


In [113]:
#how many is predicted right
numofPrediction_isCorrect = (y_pred == y_test).sum()
print(numofPrediction_isCorrect)

#how many is predicted wrong
numofPrediction_isIncorrect = (y_pred != y_test).sum()
print(numofPrediction_isIncorrect)

#how many predictions all together
count = 0
for c in range(len(y_pred)):
   count = count + 1

print(count)

25
7
32


# Confusion Matrix and Accuracy of Model

In [114]:
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[ 7  0  0  0  0  0  0]
 [ 0  1  0  0  0  0  0]
 [ 0  0 13  0  0  1  0]
 [ 0  0  0  1  0  0  0]
 [ 0  0  5  0  1  0  0]
 [ 0  0  0  0  0  2  0]
 [ 0  0  1  0  0  0  0]]


In [115]:
p = accuracy_score(y_test, y_pred)
print(p * 100)

78.125
