<a href="https://colab.research.google.com/github/GGlivePh/QG/blob/main/Lab03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/GGlivePh/QG.git

Cloning into 'QG'...
remote: Enumerating objects: 64, done.[K
remote: Counting objects: 100% (64/64), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 64 (delta 27), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (64/64), 150.10 KiB | 1.10 MiB/s, done.
Resolving deltas: 100% (27/27), done.


In [2]:
import os
os.chdir('QG')

# <center> Advanced Digital Agriculture (DS/AS 875) <center> Module 01 - Data Analysis (Lab 03)
***
# Table of Contents
* [READING THE DATASET INTO PYTHON](#READING-THE-DATASET-INTO-PYTHON)
* [DATA EDITING](#DATA-EDITING)
    * [Creating dummy variables (One-Hot Encode)](#Creating-dummy-variables-(One-Hot-Encode))
    * [Creating training and testing datasets](#Creating-training-and-testing-datasets)
* [NEURAL NETWORK](#NEURAL-NETWORK)
    * [Data pre-processing](#Data-pre-processing)
    * [Training (GridSearch)](#Training-(GridSearch))
    * [Testing](#Testing)
***

## READING THE DATASET INTO PYTHON

In [3]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
cowdata = pd.read_csv("CullDairyCow_Data.csv")
cowdata

Unnamed: 0,ID,season,lact,calvingEase,lameness,mastitis,reason,lactStage,305ME,price,BW
0,129,1,5,1,0,2,7-Mastitis and Udder,Mid Lactation,14841.5,53.5,1785
1,209,1,4,1,0,1,3-Low production,Late Lactation,14909.6,66.0,1835
2,267,1,5,1,0,2,3-Low production,Late Lactation,12764.1,54.0,1965
3,299,1,7,1,2,0,3-Low production,Mid Lactation,10645.8,43.5,1785
4,341,1,6,2,0,1,3-Low production,Mid Lactation,10704.8,47.0,1640
...,...,...,...,...,...,...,...,...,...,...,...
395,11625,1,1,2,2,0,3-Low production,Late Lactation,10146.9,48.5,1255
396,11697,1,1,1,3,0,3-Low production,Late Lactation,8568.4,52.5,1380
397,11765,1,1,1,3,0,3-Low production,Late Lactation,4948.7,54.5,1475
398,11824,1,1,1,0,0,3-Low production,Mid Lactation,7452.5,42.0,1195


## DATA EDITING

In [4]:
# Frequency for number of lactations
cowdata["lact"].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
lact,Unnamed: 1_level_1
2,134
3,80
1,79
4,55
5,28
6,15
7,7
8,2


In [5]:
# Lactation number 6 or higher lumped together into a single class (6)
cowdata.loc[cowdata.lact > 6, "lact"] = 6

In [6]:
# Frequency for number of lamness cases
cowdata["lameness"].value_counts(sort=True, ascending=False)

Unnamed: 0_level_0,count
lameness,Unnamed: 1_level_1
0,255
1,85
2,27
4,19
3,10
5,2
6,2


In [7]:
# Number of lamness cases larger than 4 lumped together into a single class (4)
cowdata.loc[cowdata.lameness > 4, "lameness"] = 4

In [None]:
# Frequency for number of lamness cases
cowdata["mastitis"].value_counts(sort=True, ascending=False)

In [None]:
# a Number of mastitis cases larger than 4 lumped together into a single class (4)
cowdata.loc[cowdata.mastitis > 4, "mastitis"] = 4

## Creating dummy variables (One-Hot Encode)

In [None]:
cowdata['reason'].value_counts(sort=True, ascending=False)

In [None]:
# Defining dummy variables
cowdata_oh = pd.get_dummies(cowdata, columns=["season", "lact", "calvingEase", "lameness", "mastitis", "reason", "lactStage"], drop_first=False)
cowdata_oh

## Creating training and testing datasets

In [None]:
# Adding continuous variables (305ME and BW) to the 30 dummy columns
x = pd.concat([cowdata_oh.loc[:,'305ME'], cowdata_oh.loc[:,'BW':]], axis=1)
x

In [None]:
print(x.columns.tolist())

In [None]:
# Center and Standardize all features (force mean=0 and standard deviation=1)
x_std = pd.concat([cowdata_oh.loc[:,'305ME'], cowdata_oh.loc[:,'BW':]], axis=1)
x_std = (x_std - x_std.mean()) / x_std.std() # Subtract mean and divide by standard deviation
x_std

In [None]:
# Response variable vector
y = cowdata_oh[["price"]]
y

In [None]:
# Split the data set into training (70%) and testing (30%)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_std, y, test_size=0.30, shuffle=True, random_state=40)
print("N. samples training %s , N. samples testing %s" % (x_train.shape[0], x_test.shape[0]))

# NEURAL NETWORKS

## Data pre-processing

In [None]:
# Response variable vector
y = cowdata_oh[["price"]]
y

##  Training (GridSearch)

In [None]:
# GridSearch using k-fold cross-validation
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
nfolds = 3
metrics = ('r2', 'neg_mean_absolute_error', 'neg_root_mean_squared_error')
parameters = {'activation':('relu', 'tanh'),
              'hidden_layer_sizes': [(100,80),(120,100),(180,120)],
              'random_state':[40]}

nn = MLPRegressor()
nn = GridSearchCV(nn, parameters, scoring=metrics, cv=nfolds, refit = metrics[2], return_train_score = True)
nn.fit(x_train, y_train)

In [None]:
# Matrics - training
pd.DataFrame({'Activation': nn.cv_results_["param_activation"],
              'Layer Sizes': nn.cv_results_['param_hidden_layer_sizes'],
              'R2': nn.cv_results_["mean_test_r2"],
              'MAE': abs(nn.cv_results_['mean_test_neg_mean_absolute_error']),
              'RMSE': abs(nn.cv_results_['mean_test_neg_root_mean_squared_error'])})

## Testing

In [None]:
# Print best activation function, layer size, and RMSE, and testing the best model using the test set
print("Best Activation: %s, Layer Size: %s, RMSE: %.6f" % (nn.best_params_['activation'], nn.best_params_['hidden_layer_sizes'], abs(nn.best_score_)))
ypred = nn.best_estimator_.predict(x_test)

In [None]:
# Scatter plot - predicted and observed
import matplotlib.pyplot as plt
import numpy as np
ypred = np.ravel(ypred)
plt.plot(ypred, y_test, 'o')
m, b = np.polyfit(ypred, y_test, 1)
plt.plot(ypred, m*ypred + b)

In [None]:
# Metrics - testing
from sklearn.metrics import *

pd.DataFrame({'Activation': [nn.best_params_['activation']],
              'Layer Size': [nn.best_params_['hidden_layer_sizes']],
              'R2': r2_score(y_test, ypred),
              'MAE': [abs(mean_absolute_error(y_test, ypred))],
              'RMSE': [abs(mean_squared_error(y_test, ypred, squared=False))]})