# UPA - project 2 - Dataset for data mining 2
authors: xkryst02, xkrusi01, xseipe00

year: 2022/23

In [None]:
import config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Names of the numeric dtypes; used later to select the numeric columns
NUMERICS = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Load the raw dataset from the path given in the project configuration
rawDf = pd.read_csv(config.RAW_DATA_PATH)

# Work on a copy so the raw data frame stays untouched
df = rawDf.copy()

## Irrelevant attributes

In [None]:
# Drop the attributes considered irrelevant for the analysis
df = df.drop(
    columns=[
        'Comments',
        'Clutch Completion',
        'studyName',
        'Sample Number',
        'Stage',
        'Individual ID',
        'Region',
    ]
)

## Missing values - value fill

In [None]:
# Print the number of missing values in each column
print(df.isna().sum())

In [None]:
# Replace the invalid placeholder value '.' in the Sex column with NaN.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.Sex: the in-place call on a selected column
# is chained assignment, which is deprecated and does not modify df when
# pandas Copy-on-Write is enabled.
df["Sex"] = df["Sex"].replace('.', np.nan)

# Missing-value counts after the replacement
print(df.isnull().sum())

In [None]:
# Fill missing values in every float column with that column's mean.
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on df[column]: the in-place call on a selected column is chained assignment,
# which is deprecated and does not modify df when pandas Copy-on-Write is enabled.
for column in df.columns[df.dtypes == float]:
    df[column] = df[column].fillna(df[column].mean())

# Missing-value counts after the fill
print(df.isnull().sum())

In [None]:
# Drop every record whose categorical attribute Sex is missing
df = df.dropna(axis="index", how="any", subset=["Sex"])

# Missing-value counts after the removal
print(df.isna().sum())

## Handle outliers

In [None]:
# Remove every record that contains an outlier in any numeric column.
# A value counts as an outlier when abs(z-score) > 3; z-scores are computed per
# column on the data as it is before any record is removed.
# (Our dataset doesn't have any outliers, so nothing is deleted here.)

oldDf = df

numericOnly = df.select_dtypes(include=NUMERICS)

# Compute on a plain float array so the result is an ndarray of the same shape
zScores = np.abs(stats.zscore(numericOnly.to_numpy(dtype=float), axis=0))

# NaN z-scores (e.g. from a constant column) compare as False, so such records are kept
isOutlier = (zScores > 3).any(axis=1)

# Filter with a boolean mask rather than repeated outer merges: the merge joined
# on float columns, reset the index and reordered the rows. The mask keeps the
# original index and row order. copy() makes df an independent frame so later
# column assignments do not act on a slice.
df = df.loc[~isOutlier].copy()

# Number of deleted records
oldDf.shape[0] - df.shape[0]

## Convert categoric attributes to numeric

In [None]:
# Show the data type of every column (before the categoric -> numeric conversion)
print(df.dtypes)

In [None]:
# Show the data before the conversion
display(df)

In [None]:
# Convert categoric attributes to numeric codes with 'pd.factorize'.
# species_labels keeps the original species names (position == code) so the
# names can be restored when plotting.
df["Species"], species_labels = pd.factorize(df["Species"])
df["Island"] = pd.factorize(df["Island"])[0]

# Convert Sex to numeric manually: FEMALE -> 0, MALE -> 1.
# The mapped column is assigned back instead of calling
# replace(..., inplace=True) on df["Sex"]: the in-place call on a selected
# column is chained assignment (deprecated, no effect on df under pandas
# Copy-on-Write) and relied on the deprecated silent object -> int downcast.
# NOTE: any value other than FEMALE/MALE becomes NaN with map.
df["Sex"] = df["Sex"].map({"FEMALE": 0, "MALE": 1})
# Display names for the Sex codes (index == code)
sex_labels = ["Female", "Male"]

# Convert date to datetime format
df["Date Egg"] = pd.to_datetime(df["Date Egg"], format="%m/%d/%y")

# Show data types after conversion
print(df.dtypes)

In [None]:
# Show the data after the conversion
display(df)

In [None]:
# Violin plot of body mass per species, split by sex. The numeric codes are
# shown with the category names they had before the conversion.
ax = sns.violinplot(
    data=df,
    x="Species",
    y="Body Mass (g)",
    hue="Sex",
    split=True,
)

# Species names on the x axis instead of the factorized codes
ax.set_xticklabels(species_labels, rotation=-10)

# Keep the legend handles, but label them with the sex names
handles = ax.get_legend_handles_labels()[0]
ax.legend(handles=handles, labels=sex_labels)

### Normalize where appropriate

In [None]:
# Columns that are rescaled to the range [0, 1] (min-max normalization)
columns = [
    'Culmen Length (mm)',
    'Flipper Length (mm)',
    'Culmen Depth (mm)',
    'Body Mass (g)',
    'Delta 15 N (o/oo)',
    'Delta 13 C (o/oo)',
]

# Normalize each column: (value - min) / (max - min)
for name in columns:
    low, high = df[name].min(), df[name].max()
    df[name] = (df[name] - low) / (high - low)

# View the per-column minima and maxima, then the normalized data
display('MIN:', df.min())
display('MAX:', df.max())
df