In [174]:
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import optimizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
import os, shutil
import numpy as np


In [175]:
df = pd.read_csv("C:\\Users\\david\\OneDrive\\Inholland\\DeepLearning\\Assignment 1\\palmerpenguins_original.csv")


# 1 Preprocessing 
Firstly the data must be analysed and cleaned.
Thereafter, to decide what variables to use as predictors we must employ descriptive statistics to analyse what variables are relevant.
# 1.1 Data analysis


In [176]:
# Check for null values in each column and sum them up
null_counts = df.isnull().sum()

# Print the count of null values for each column
print("Null value counts for each column:")
print(null_counts)

# Find rows with null values
rows_with_null = df[df.isnull().any(axis=1)]

# Display rows with null values
print("Rows with null values:")
print(rows_with_null)

# Print the shape of the DataFrame
print("Shape of the DataFrame:")
print(df.shape)

Null value counts for each column:
species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
year                  0
dtype: int64
Rows with null values:
    species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
3    Adelie  Torgersen             NaN            NaN                NaN   
8    Adelie  Torgersen            34.1           18.1              193.0   
9    Adelie  Torgersen            42.0           20.2              190.0   
10   Adelie  Torgersen            37.8           17.1              186.0   
11   Adelie  Torgersen            37.8           17.3              180.0   
47   Adelie      Dream            37.5           18.9              179.0   
178  Gentoo     Biscoe            44.5           14.3              216.0   
218  Gentoo     Biscoe            46.2           14.4              214.0   
256  Gentoo     Biscoe            47.3           13.8

# 1.2 Data Cleaning and encoding
Given the results of the null values the decision I decided was to drop the null rows which contain no body_mass as that is the target variable. Furthermore, I decided to add sex to the data based on the mode to compensate for the null values.
To do certain statistics on the categorical data the data must be encoded. With Pandas get_dummies uses "One-Hot encoding" to encode the categorical data.

In [177]:
df.dropna(subset=["body_mass_g"], inplace=True)
df["sex"].fillna(df["sex"].mode()[0], inplace=True)
print(df.isnull().sum())

encoded_values = pd.get_dummies(df, columns=["species", "island","sex","year"], drop_first=True)

print(encoded_values)


species              0
island               0
bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
year                 0
dtype: int64
     bill_length_mm  bill_depth_mm  flipper_length_mm  body_mass_g  \
0              39.1           18.7              181.0       3750.0   
1              39.5           17.4              186.0       3800.0   
2              40.3           18.0              195.0       3250.0   
4              36.7           19.3              193.0       3450.0   
5              39.3           20.6              190.0       3650.0   
..              ...            ...                ...          ...   
339            55.8           19.8              207.0       4000.0   
340            43.5           18.1              202.0       3400.0   
341            49.6           18.2              193.0       3775.0   
342            50.8           19.0              210.0       4100.0   
343            50.2           18.

# 1.3 Data Statistics 

In [178]:
stats_bill_length = df['bill_length_mm'].describe()
stats_bill_depth = df['bill_depth_mm'].describe()
stats_flipper_length = df['flipper_length_mm'].describe()

#target variable 
stats_body_mass = df['body_mass_g'].describe()

print(stats_bill_length,2)
print(stats_bill_depth,2)
print(stats_flipper_length,2)

print(stats_body_mass,2)

species_count = df['species'].value_counts()
island_count = df['island'].value_counts()
sex_count = df['sex'].value_counts()
year_count = df['year'].value_counts()

print(species_count)
print(island_count)
print(sex_count)
print(year_count)

count    342.000000
mean      43.921930
std        5.459584
min       32.100000
25%       39.225000
50%       44.450000
75%       48.500000
max       59.600000
Name: bill_length_mm, dtype: float64 2
count    342.000000
mean      17.151170
std        1.974793
min       13.100000
25%       15.600000
50%       17.300000
75%       18.700000
max       21.500000
Name: bill_depth_mm, dtype: float64 2
count    342.000000
mean     200.915205
std       14.061714
min      172.000000
25%      190.000000
50%      197.000000
75%      213.000000
max      231.000000
Name: flipper_length_mm, dtype: float64 2
count     342.000000
mean     4201.754386
std       801.954536
min      2700.000000
25%      3550.000000
50%      4050.000000
75%      4750.000000
max      6300.000000
Name: body_mass_g, dtype: float64 2
Adelie       151
Gentoo       123
Chinstrap     68
Name: species, dtype: int64
Biscoe       167
Dream        124
Torgersen     51
Name: island, dtype: int64
male      177
female    165
Name: sex, d

In [179]:
# Calculate correlation coefficients between target variable and predictor variables
correlation_matrix = encoded_values.corr()

# Extract correlation coefficients for the target variable
target_correlation = correlation_matrix['body_mass_g']

# Print correlation coefficients
print("Correlation coefficients between target variable and predictor variables:")
print(target_correlation)

Correlation coefficients between target variable and predictor variables:
bill_length_mm       0.595110
bill_depth_mm       -0.471916
flipper_length_mm    0.871202
body_mass_g          1.000000
species_Chinstrap   -0.291561
species_Gentoo       0.818198
island_Dream        -0.460411
island_Torgersen    -0.258979
sex_male             0.409315
year_2008            0.057319
year_2009            0.007790
Name: body_mass_g, dtype: float64


Given these results it is concluded that flipper length,bill_length and the gentoo species, has a high positive correlation. Meanwhile, the bill_depth and dream island pinguins give a negative correlation. it is also important to note that the sex has a slight correlation to the weight. Because of this the predictor variable that will be selected is flipper length, bill length and species. 

# 1.4 Splitting Dataset

In [180]:
# Selecting predictor variables ('species', 'bill_length_mm', 'flipper_length_mm') and target variable ('body_mass_g')
X = df[['species', 'bill_length_mm', 'flipper_length_mm']]
y = df['body_mass_g']

# One-hot encode categorical variable 'species'
X_encoded = pd.get_dummies(X, columns=['species'], drop_first=True)

print("Predictor variables (X):")
print(X_encoded.head())
print(f"Gentoo type: {X_encoded['species_Gentoo'].dtype}\n"
      f"Chinstrap type: {X_encoded['species_Chinstrap'].dtype}\n"
      f"bill_length type: {X_encoded['bill_length_mm'].dtype}\n"
      f"flipper_legth: {X_encoded['flipper_length_mm'].dtype}")

print("\nTarget variable (y):")
print(y.head())


# Split the dataset into 60% training 20% validation and 20% testing
X_train, X_split, y_train, y_split = train_test_split(X_encoded, y, test_size=0.4, random_state=42)
X_validation,X_test,y_validation,y_test = train_test_split(X_split,y_split,test_size=.5)

# Display the shapes of the training and testing sets
print("Training set - Predictor variables:", X_train.shape)
print("Training set - Target variable:", y_train.shape)
print("Testing set - Predictor variables:", X_test.shape)
print("Testing set - Target variable:", y_test.shape)
print("Validation set - Predictor variables:", X_validation.shape)
print("Validation set - Target variable:", y_validation.shape)

Predictor variables (X):
   bill_length_mm  flipper_length_mm  species_Chinstrap  species_Gentoo
0            39.1              181.0                  0               0
1            39.5              186.0                  0               0
2            40.3              195.0                  0               0
4            36.7              193.0                  0               0
5            39.3              190.0                  0               0
Gentoo type: uint8
Chinstrap type: uint8
bill_length type: float64
flipper_legth: float64

Target variable (y):
0    3750.0
1    3800.0
2    3250.0
4    3450.0
5    3650.0
Name: body_mass_g, dtype: float64
Training set - Predictor variables: (205, 4)
Training set - Target variable: (205,)
Testing set - Predictor variables: (69, 4)
Testing set - Target variable: (69,)
Validation set - Predictor variables: (68, 4)
Validation set - Target variable: (68,)


# 2 A custom neural network


# Activation Function

In [181]:
def relu(x):
    return np.maximum(0,x)

def dydxrelu(x):
    dx = np.where(x > 0, 1, 0)
    return dx
    

# Loss
y= true value                                                 
y_hat = predicted value

In [182]:
def square_loss(y, y_hat):
        return np.mean((y - y_hat) ** 2)

def square_loss_derivative(y, y_hat):
    return 2 * (y_hat - y)

# Network code

In [202]:
# Neural Network Initialization
def initialize_weights(layers):
    network = []
    for i in range(len(layers) - 1):
        weights = np.random.uniform(-1, 1, size=(layers[i], layers[i+1]))
        biases = np.random.uniform(-1, 1, size=layers[i+1])
        layer = {'weights': weights, 'biases': biases}
        network.append(layer)
    return network

# Forward Pass
def forward_pass(network, inputs):
    activations = [inputs]
    weighted_sums = []
    for layer in network:
        weighted_sum = np.dot(inputs, layer['weights']) + layer['biases']
        activation = relu(weighted_sum)
        weighted_sums.append(weighted_sum)
        activations.append(activation)
        inputs = activation
    return activations, weighted_sums

# Backpropagation
def backpropagation(network, activations, weighted_sums, targets, learning_rate):
    output_gradient = square_loss_derivative(targets, activations[-1])
    for i in range(len(network) - 1, -1, -1):
        if i == len(network) - 1:
            network[i]['weights'] += learning_rate * np.outer(activations[i], output_gradient)
            network[i]['biases'] += learning_rate * output_gradient.reshape(-1,)  # Reshape to match bias shape
        else:
            hidden_error = np.dot(network[i + 1]['weights'], output_gradient)
            hidden_gradient = hidden_error * dydxrelu(weighted_sums[i])
            network[i]['weights'] += learning_rate * np.outer(activations[i], hidden_gradient)
            network[i]['biases'] += learning_rate * hidden_gradient
            output_gradient = hidden_gradient

# Training the Model
def train_model(network, X_train, y_train, learning_rate, epochs):
    for epoch in range(epochs):
        total_loss = 0
        for inputs, target in zip(X_train.values, y_train.values):
            inputs = inputs.reshape(1, -1)
            target = target.reshape(1, -1)
            activations, weighted_sums = forward_pass(network, inputs)
            predicted_output = activations[-1]
            loss = square_loss(target, predicted_output)
            total_loss += loss
            backpropagation(network, activations, weighted_sums, target, learning_rate)
        avg_loss = total_loss / len(X_train)
        print(f"Epoch {epoch+1}/{epochs}, Average Loss: {avg_loss}")
    return network

# Evaluation
def evaluate(network, X, y):
    predictions = []
    for inputs, target in zip(X.values, y.values):
        inputs = inputs.reshape(1, -1)
        activations, _ = forward_pass(network, inputs)
        predictions.append(activations[-1])
    return predictions

# Train

In [206]:
# Initialize the Neural Network
network = initialize_weights([X_encoded.shape[1], 1])  # Output layer with 1 neuron

# Train the Model
learning_rate = 0.01
epochs = 100
network = train_model(network, X_train, y_train, learning_rate, epochs)

# Validation Set Evaluation
predictions_validation = evaluate(network, X_validation, y_validation)
loss_validation = square_loss(y_validation.values.reshape(-1, 1), np.array(predictions_validation).reshape(-1, 1))
print("Validation Set Loss:", loss_validation)

# Test Set Evaluation
predictions_test = evaluate(network, X_test, y_test)
loss_test = square_loss(y_test.values.reshape(-1, 1), np.array(predictions_test).reshape(-1, 1))
print("Test Set Loss:", loss_test)

Epoch 1/100, Average Loss: 18402981.707317073
Epoch 2/100, Average Loss: 18402981.707317073
Epoch 3/100, Average Loss: 18402981.707317073
Epoch 4/100, Average Loss: 18402981.707317073
Epoch 5/100, Average Loss: 18402981.707317073
Epoch 6/100, Average Loss: 18402981.707317073
Epoch 7/100, Average Loss: 18402981.707317073
Epoch 8/100, Average Loss: 18402981.707317073
Epoch 9/100, Average Loss: 18402981.707317073
Epoch 10/100, Average Loss: 18402981.707317073
Epoch 11/100, Average Loss: 18402981.707317073
Epoch 12/100, Average Loss: 18402981.707317073
Epoch 13/100, Average Loss: 18402981.707317073
Epoch 14/100, Average Loss: 18402981.707317073
Epoch 15/100, Average Loss: 18402981.707317073
Epoch 16/100, Average Loss: 18402981.707317073
Epoch 17/100, Average Loss: 18402981.707317073
Epoch 18/100, Average Loss: 18402981.707317073
Epoch 19/100, Average Loss: 18402981.707317073
Epoch 20/100, Average Loss: 18402981.707317073
Epoch 21/100, Average Loss: 18402981.707317073
Epoch 22/100, Average 