<a href="https://colab.research.google.com/github/WENKAITAN/2021-fall-data-science/blob/main/Penguins_WenkaiTan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras
# Layers for our neural networks
from tensorflow.keras.layers import Dense


# Our normal python data science stack you've come to know and love
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split


print(tf.__version__)

2.8.0


In [2]:
# Download the dataset
!wget -q https://storage.googleapis.com/download.tensorflow.org/data/palmer_penguins/penguins.csv -O /tmp/penguins.csv

# Load a dataset into a Pandas Dataframe.
dataset_df = pd.read_csv("/tmp/penguins.csv")

# Display the first 3 examples.
dataset_df.head(5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [3]:
#check if dataset has null values
dataset_df.isna().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64

In [4]:
# delete all the rows that have null val
dataset_df = dataset_df.dropna()

In [5]:
# sanity check
dataset_df.isna().sum()

species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64

In [6]:
# one-hot encoding
dataset_df = pd.get_dummies(dataset_df, columns=['island', 'sex'], drop_first=True)
dataset_df.head()

Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,island_Dream,island_Torgersen,sex_male
0,Adelie,39.1,18.7,181.0,3750.0,2007,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,2007,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,2007,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,2007,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,2007,0,1,1


In [7]:
# Name of the label column, and convert the categorical label into an integer.
label = "species"
classes = dataset_df[label].unique().tolist()
print(f"Label classes: {classes}")

dataset_df[label] = dataset_df[label].map(classes.index)
dataset_df.head()

Label classes: ['Adelie', 'Gentoo', 'Chinstrap']


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,island_Dream,island_Torgersen,sex_male
0,0,39.1,18.7,181.0,3750.0,2007,0,1,1
1,0,39.5,17.4,186.0,3800.0,2007,0,1,0
2,0,40.3,18.0,195.0,3250.0,2007,0,1,0
4,0,36.7,19.3,193.0,3450.0,2007,0,1,0
5,0,39.3,20.6,190.0,3650.0,2007,0,1,1


In [8]:
# dataset with all the features
X = dataset_df.drop(columns='species')
#X.head()

# dataset with labels
y = dataset_df['species']
#y.head()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=45)

print('Lenght of our Training data:', X_train.shape, '\nLength of our Testing data:', y_test.shape)
dataset_df.head()

Lenght of our Training data: (266, 8) 
Length of our Testing data: (67,)


Unnamed: 0,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,year,island_Dream,island_Torgersen,sex_male
0,0,39.1,18.7,181.0,3750.0,2007,0,1,1
1,0,39.5,17.4,186.0,3800.0,2007,0,1,0
2,0,40.3,18.0,195.0,3250.0,2007,0,1,0
4,0,36.7,19.3,193.0,3450.0,2007,0,1,0
5,0,39.3,20.6,190.0,3650.0,2007,0,1,1


In [9]:
# to see how many kinds of species the dataset has
dataset_df['species'].value_counts()

0    146
1    119
2     68
Name: species, dtype: int64

In [10]:
# building the neurons network with sequential function
# this neuron networks is made of 3 layers
model = tf.keras.models.Sequential(
  [                
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(3, activation='softmax')
]
)

In [12]:
# Let's introduce a function that measures the prediction error.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

In [13]:
# "compile" the model before training it. 
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=loss_fn, metrics=['acc'])

In [17]:
epochs = 15
model.fit(X_train, y_train, epochs=epochs, validation_split=0.1)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x7f1ab2d6dc50>

In [15]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1152      
                                                                 
 dense_1 (Dense)             (None, 128)               16512     
                                                                 
 dense_2 (Dense)             (None, 3)                 387       
                                                                 
Total params: 18,051
Trainable params: 18,051
Non-trainable params: 0
_________________________________________________________________


In [16]:
# now we evaluate our model
model.evaluate(X_test, y_test)



[2.782670497894287, 0.5970149040222168]

## The result does not look good compare with tf decision tree from tutorial. 
This is because neural networks are very data hungry. With less than 1,000 examples, our titanic data set is probably not big enough. There is no strict number for the amount of data you need, but at least 10,000 examples is a good bet, 100,000 is much better, and the best models use training data with examples in the millions.

We usually don't use Neural Networks for traditional data sets like the titanic data set. They are most useful on image recognition or NLP problems, so let's move on to image recognition.