# Import all required libraries

In [1]:
import pandas
import numpy

# Read Data

In [2]:
data = pandas.read_csv('Penguins.csv')
data.head()

Unnamed: 0,studyName,SampleNumber,Species,Region,Island,Stage,Individual ID,ClutchCompletion,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Sex,Delta15N,Delta13C,Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


# Drop Unnessesary columns

In [3]:
data = data.drop(columns = ['studyName', 'SampleNumber', 'Individual ID', 'Comments'])
data.head()

Unnamed: 0,Species,Region,Island,Stage,ClutchCompletion,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Sex,Delta15N,Delta13C
0,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.1,18.7,181.0,3750.0,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302
3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,,,,,,,
4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",Yes,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426


# Transform Non Numerical values into Numerical Values using LabelEncoder

In [4]:
from sklearn.preprocessing import LabelEncoder

le_species = LabelEncoder()
le_Region= LabelEncoder()
le_Island = LabelEncoder()
le_ClutchCompletion = LabelEncoder()
le_Stage = LabelEncoder()
le_Sex = LabelEncoder()

In [5]:
data['Species'] = le_species.fit_transform(data['Species'])
data['Region'] = le_Region.fit_transform(data['Region'])
data['Island'] = le_Island.fit_transform(data['Island'])
data['Stage'] = le_Stage.fit_transform(data['Stage'])
data['ClutchCompletion'] = le_ClutchCompletion.fit_transform(data['ClutchCompletion'])
data['Sex'] = le_Sex.fit_transform(data['Sex'])

data.head()

Unnamed: 0,Species,Region,Island,Stage,ClutchCompletion,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Sex,Delta15N,Delta13C
0,0,0,2,0,1,39.1,18.7,181.0,3750.0,2,,
1,0,0,2,0,1,39.5,17.4,186.0,3800.0,1,8.94956,-24.69454
2,0,0,2,0,1,40.3,18.0,195.0,3250.0,1,8.36821,-25.33302
3,0,0,2,0,1,,,,,3,,
4,0,0,2,0,1,36.7,19.3,193.0,3450.0,1,8.76651,-25.32426


# Filling Null values in the columns with the mean of the respective columns 

In [6]:
data.CulmenLength = data.CulmenLength.fillna(data.CulmenLength.mean())

data.CulmenDepth = data.CulmenDepth.fillna(data.CulmenDepth.mean())

data.FlipperLength = data.FlipperLength.fillna(data.FlipperLength.mean())

data.BodyMass = data.BodyMass.fillna(data.BodyMass.mean())

data.Delta15N = data.Delta15N.fillna(data.Delta15N.mean())

data.Delta13C = data.Delta13C.fillna(data.Delta13C.mean())

data.head()

Unnamed: 0,Species,Region,Island,Stage,ClutchCompletion,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Sex,Delta15N,Delta13C
0,0,0,2,0,1,39.1,18.7,181.0,3750.0,2,8.733382,-25.686292
1,0,0,2,0,1,39.5,17.4,186.0,3800.0,1,8.94956,-24.69454
2,0,0,2,0,1,40.3,18.0,195.0,3250.0,1,8.36821,-25.33302
3,0,0,2,0,1,43.92193,17.15117,200.915205,4201.754386,3,8.733382,-25.686292
4,0,0,2,0,1,36.7,19.3,193.0,3450.0,1,8.76651,-25.32426


# Dividing the data into dependent and independent variables

In [7]:
Y = data.Species
X = data.drop(columns = ['Species'])

# Splitting data into training data and testing data

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2)

In [9]:
from sklearn.preprocessing import StandardScaler
SS = StandardScaler()

X_train = SS.fit_transform(X_train)
X_test = SS.fit_transform(X_test)

In [10]:
from sklearn.svm import SVC

model = SVC()

In [11]:
model.fit(X_train, y_train)

SVC()

In [12]:
y_pred = model.predict(X_test)
y_pred

array([2, 2, 0, 1, 0, 0, 1, 0, 2, 0, 1, 0, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1,
       0, 1, 2, 0, 0, 2, 0, 2, 1, 1, 2, 2, 2, 0, 1, 0, 2, 1, 0, 1, 1, 0,
       2, 2, 0, 0, 1, 2, 0, 0, 0, 0, 2, 0, 0, 0, 0, 2, 1, 0, 1, 0, 1, 0,
       1, 2, 0])

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix

acc = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

acc, cm

(0.9855072463768116,
 array([[30,  0,  0],
        [ 1, 16,  0],
        [ 0,  0, 22]]))