# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint
## Not for grading

## Learning Objective

The objective of this experiment is to understand how a linear classifier works.

## Dataset

### Description

The dataset consists of the following 7 columns:

- **species:** penguin species (Chinstrap, Adélie, or Gentoo)
- **culmen_length_mm & culmen_depth_mm:** length and depth of the culmen, the upper ridge of a bird's beak
- **flipper_length_mm:** flipper length
- **body_mass_g:** body mass
- **island:** island name (Dream, Torgersen, or Biscoe)
- **sex:** penguin sex

In [None]:
!wget -qq https://cdn.iiith.talentsprint.com/aiml/Experiment_related_data/Penguin.csv

### Importing Required Packages

In [None]:
from sklearn.linear_model import SGDClassifier
import pandas as pd

### Loading the data

In [None]:
# Name of the CSV file fetched by the download cell, relative to the working directory.
dataset = "Penguin.csv"

# Parse the CSV into a DataFrame and report its (rows, columns) dimensions.
data = pd.read_csv(filepath_or_buffer=dataset)
print(data.shape)

(344, 7)


In [None]:
# Preview the first five records of the loaded data.
data.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [None]:
# Tally the missing (NaN) entries found in every column.
data.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [None]:
# List the distinct values that occur in the sex column.
data.sex.unique()

array(['MALE', 'FEMALE', nan], dtype=object)

In [None]:
# Keep only the records whose sex is known. `data` is rebound to the filtered
# frame instead of being mutated with inplace=True, which keeps the step
# chainable and follows the pandas recommendation to avoid in-place edits.
data = data.dropna(subset=['sex'])

# Confirm that NaN no longer appears among the distinct sex values.
print("Unique values after dropping NA values : ",data.sex.unique())

Unique values after dropping NA values :  ['MALE' 'FEMALE']


In [None]:
# Integer code assigned to each category of the three text columns.
category_codes = {
    'species': {'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2},
    'island': {'Torgersen': 2, 'Biscoe': 1, 'Dream': 0},
    'sex': {'MALE': 1, 'FEMALE': 0},
}

for column, codes in category_codes.items():
    # Series.map builds a fresh, properly typed integer column, so this does
    # not depend on the object-dtype downcasting that Series.replace
    # deprecated in pandas 2.2. Values without a code pass through unchanged,
    # which matches replace() and keeps the cell safe to re-run on data that
    # is already encoded.
    data[column] = data[column].map(lambda value, codes=codes: codes.get(value, value))

In [None]:
# Preview the first five records now that the text columns hold integer codes.
data.head(n=5)

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,39.1,18.7,181.0,3750.0,1
1,0,2,39.5,17.4,186.0,3800.0,0
2,0,2,40.3,18.0,195.0,3250.0,0
4,0,2,36.7,19.3,193.0,3450.0,0
5,0,2,39.3,20.6,190.0,3650.0,1


### Storing the data and labels

In [None]:
# Features: every column other than the target.
X = data.drop(columns=['species'])
# Labels: the species column on its own.
y = data.loc[:, 'species']

### Splitting the data into train and test sets 

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Shapes of the four splits, in the order X_train, X_test, y_train, y_test.
tuple(split.shape for split in (X_train, X_test, y_train, y_test))

((233, 6), (100, 6), (233,), (100,))

### Training a Linear Classifier

In [None]:
# SGD shuffles the training data on every epoch, so an unseeded model gives a
# different result on each run. Fixing random_state (the same seed the
# train/test split uses) makes the accuracy below reproducible.
linear_classifier = SGDClassifier(random_state=42)

# Training or fitting the model with the (unscaled) train data
linear_classifier.fit(X_train, y_train)

# Testing the trained model
y_pred = linear_classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# scikit-learn metrics take the ground truth first: (y_true, y_pred).
# Accuracy is symmetric, so the value is unchanged; the order is corrected so
# it matches the documented signature and the other metric calls.
print(accuracy_score(y_test, y_pred))

0.41


In [None]:
from sklearn.metrics import confusion_matrix

# Ground truth goes first so the matrix follows the scikit-learn convention:
# row i = samples whose true class is i, column j = samples predicted as j.
# Passing the predictions first would print the transpose.
print(confusion_matrix(y_test, y_pred))

[[ 0  0  0]
 [15  7  0]
 [33 11 34]]


### Scaling the data

In [None]:
# Scaling the data
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply those same statistics
# to the test split so no information from the test set leaks into training.
scaler = StandardScaler()
X_train_scale1 = scaler.fit_transform(X_train)
X_test_scale1 = scaler.transform(X_test)

# Seeded so the comparison with the unscaled model is reproducible run to run.
linear_classifier = SGDClassifier(random_state=42)

# Training or fitting the model with the scaled train data
linear_classifier.fit(X_train_scale1, y_train)

# Testing the trained model
y_pred_scale = linear_classifier.predict(X_test_scale1)

# Metric arguments follow the documented (y_true, y_pred) order.
print(accuracy_score(y_test, y_pred_scale))

1.0


In [None]:
# Ground truth first: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, y_pred_scale))

[[48  0  0]
 [ 0 18  0]
 [ 0  0 34]]


In [None]:
# Accuracy recovered from the confusion matrix: correctly classified samples
# (the diagonal) divided by all test samples. It is computed from the matrix
# itself rather than from hand-typed counts, which go stale whenever the model
# is retrained.
scaled_confusion = confusion_matrix(y_test, y_pred_scale)
print(scaled_confusion.trace() / scaled_confusion.sum())

0.99


In [None]:
# Display the standardised training features returned by scaler.fit_transform;
# the printed array is abbreviated with "..." between its first and last rows.
print(X_train_scale1)

[[ 0.38815646 -0.68859414  0.41178581 -1.01319331 -1.22511852 -0.99571733]
 [ 0.38815646  0.75514402 -0.77912631  1.44365316  1.10435628  1.0043011 ]
 [ 0.38815646  0.60514525 -1.29691419  0.79331144  0.64459151 -0.99571733]
 ...
 [ 0.38815646 -0.83859291  0.30822823 -0.57963217 -1.22511852 -0.99571733]
 [ 0.38815646  0.24889817 -1.08979904  1.37139297  0.98175234  1.0043011 ]
 [ 0.38815646 -1.13859045 -0.05422328 -1.44675445 -1.25576951 -0.99571733]]


In [None]:
# Classify the first scaled training sample. Slicing with [:1] keeps the 2-D
# (1, n_features) shape that predict() expects and uses the exact values,
# instead of rounded numbers copied by hand from the printed array.
print(linear_classifier.predict(X_train_scale1[:1]))

[0]
