# UPA - project 2 - Exploratory analysis
authors: xkryst02, xkrusi01, xseipe00

year: 2022/23

In [None]:
import config
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Names of the dtypes that are treated as numeric attributes
NUMERICS = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]

# Raw data exactly as loaded from the configured CSV file
rawDf = pd.read_csv(filepath_or_buffer=config.RAW_DATA_PATH)

# Independent working copy used for the exploratory analysis
df = rawDf.copy(deep=True)


### Attribute analysis  

In [None]:
# Data types of all attributes (columns) in the data set
df.dtypes

In [None]:
# Minimum, maximum, mean, median and count of numeric attributes
# (percentiles=[0.5] limits the reported percentiles to the 50% one, i.e. the median)
df.describe(percentiles=[0.5])

In [None]:
# Number of records per study (five most frequent values)
df['studyName'].value_counts().head(n=5)

In [None]:
# Number of penguins per species (five most frequent values)
df['Species'].value_counts().head(n=5)

In [None]:
# Number of penguins per region (five most frequent values)
df['Region'].value_counts().head(n=5)
# NOTE: All penguins are from Anvers

In [None]:
# Number of penguins per island (five most frequent values)
df['Island'].value_counts().head(n=5)

In [None]:
# Number of penguins per life stage (five most frequent values)
df['Stage'].value_counts().head(n=5)
# NOTE: all penguins are adults

In [None]:
# Number of penguins per sex (five most frequent values)
df['Sex'].value_counts().head(n=5)

In [None]:
# Replace the invalid placeholder '.' in the Sex attribute with a missing value.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.Sex: the in-place call operates on an
# intermediate Series (chained assignment), which raises a FutureWarning in
# pandas 2.x and no longer updates df once Copy-on-Write is enabled.
df['Sex'] = df['Sex'].replace('.', np.nan)

In [None]:
# Distribution of culmen length for each sex
sns.boxplot(data=df, x='Sex', y='Culmen Length (mm)')
plt.show()

In [None]:
# Distribution of flipper length for each sex
sns.violinplot(data=df, x='Sex', y='Flipper Length (mm)')
plt.show()

In [None]:
# Culmen depth against body mass, coloured by species.
# The hue uses only the first word of the Species value (text before the first space).
sns.jointplot(
    data=df,
    x='Body Mass (g)',
    y='Culmen Depth (mm)',
    hue=df['Species'].map(lambda fullName: fullName.partition(' ')[0]),
)
plt.show()

In [None]:
# Flipper length against body mass, with a fitted regression line
sns.regplot(
    data=df,
    x='Body Mass (g)',
    y='Flipper Length (mm)',
)
plt.show()

In [None]:
# Density of body mass for each species.
# The hue uses only the first word of the Species value (text before the first space).
sns.kdeplot(
    data=df,
    x='Body Mass (g)',
    hue=df['Species'].map(lambda fullName: fullName.partition(' ')[0]),
    multiple='layer',
)
plt.show()

## Outliers

In [None]:
# Find outliers based on z-score: a value counts as an outlier when |z| > 3
numericOnly = df.select_dtypes(include=NUMERICS)
for column in numericOnly.columns:
    # Drop missing values per attribute. A row-wise dropna() over all numeric
    # columns would also discard valid values of this attribute whenever a
    # different attribute of the same record is empty.
    values = numericOnly[column].dropna()
    outlierCount = int((np.abs(stats.zscore(values)) > 3).sum())
    # Label every result with its attribute name so the output is readable
    print(f'{column}: {outlierCount} outlier(s)')
# NOTE: the authors observed no outliers in this data set

## Analysis of missing values

In [None]:
# Number of records with at least one empty value.
# any(axis=1) flags each record (row) that contains a missing cell; summing the
# flags counts those records. (isna().sum().sum() would instead count every
# empty cell, so a record with several empty attributes would be counted
# several times.)
df.isna().any(axis=1).sum()

In [None]:
# Number of empty (missing) values in each attribute (column)
df.isna().sum()

In [None]:
# Histogram of missing values per record: {number of empty values: record count}
counter = {}
for rowLabel, missingFlags in df.isnull().iterrows():
    # Number of empty attributes in this record
    emptyCells = missingFlags.sum()
    counter[emptyCells] = counter.get(emptyCells, 0) + 1

counter

## Correlation analysis

In [None]:
# Pearson correlation coefficient between every pair of numeric attributes
# (numeric_only=True leaves non-numeric columns out of the computation)
pearson = df.corr(method='pearson', numeric_only=True)
pearson

In [None]:
# Spearman correlation coefficient between every pair of numeric attributes
# (numeric_only=True leaves non-numeric columns out of the computation)
spearman = df.corr(method='spearman', numeric_only=True)
spearman

In [None]:
# Heat map of the Pearson correlation matrix, with attribute names on both axes
sns.heatmap(
    data=pearson,
    xticklabels=pearson.columns,
    yticklabels=pearson.columns,
)
plt.show()

In [None]:
# Heat map of the Spearman correlation matrix, with attribute names on both axes
sns.heatmap(
    data=spearman,
    xticklabels=spearman.columns,
    yticklabels=spearman.columns,
)
plt.show()