# Data Analysis 101 - Data Preparation

Alex Chen

Source:

https://github.com/allisonhorst/palmerpenguins/tree/main

https://www.kaggle.com/datasets/parulpandey/palmer-archipelago-antarctica-penguin-data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the raw penguin table from 'penguins.csv' in the working directory.
df = pd.read_csv(filepath_or_buffer='penguins.csv')

In [3]:
# Show the column labels of the freshly loaded frame.
df.columns

Index(['studyName', 'Sample Number', 'Species', 'Region', 'Island', 'Stage',
       'Individual ID', 'Clutch Completion', 'Date Egg', 'Culmen Length (mm)',
       'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)', 'Sex',
       'Delta 15 N (o/oo)', 'Delta 13 C (o/oo)', 'Comments'],
      dtype='object')

In [4]:
# Discard the columns that are not used further on. `drop` returns a new frame that is
# bound back to `df`, which avoids `inplace=True` mutation.
df = df.drop(
    columns=[
        'studyName',
        'Region',
        'Stage',
        'Individual ID',
        'Date Egg',
        'Comments',
        'Delta 15 N (o/oo)',
        'Delta 13 C (o/oo)',
    ]
)

In [5]:
# Number the rows 1..len(df) in their current order and store that as the 'ID' column.
row_count = len(df)
df['ID'] = range(1, row_count + 1)

In [6]:
# Shorten the species labels by removing the scientific name in parentheses.
# `replace` only touches values that appear as keys, so re-running this cell is a no-op;
# `map` would turn the already-shortened labels (and any unexpected label) into NaN.
species_short_names = {
    'Adelie Penguin (Pygoscelis adeliae)': 'Adelie Penguin',
    'Chinstrap penguin (Pygoscelis antarctica)': 'Chinstrap Penguin',
    'Gentoo penguin (Pygoscelis papua)': 'Gentoo Penguin',
}
df['Species'] = df['Species'].replace(species_short_names)

In [7]:
# Split the frame into two tables that both carry the 'ID' column:
# df1 holds the categorical fields, df2 holds the numeric measurements.
info_columns = ['ID', 'Species', 'Island', 'Sex']
measurement_columns = [
    'ID',
    'Culmen Length (mm)',
    'Culmen Depth (mm)',
    'Flipper Length (mm)',
    'Body Mass (g)',
]
df1 = df.loc[:, info_columns]
df2 = df.loc[:, measurement_columns]

In [8]:
# Display df1 (ID, Species, Island, Sex).
df1

Unnamed: 0,ID,Species,Island,Sex
0,1,Adelie Penguin,Torgersen,MALE
1,2,Adelie Penguin,Torgersen,FEMALE
2,3,Adelie Penguin,Torgersen,FEMALE
3,4,Adelie Penguin,Torgersen,
4,5,Adelie Penguin,Torgersen,FEMALE
...,...,...,...,...
339,340,Gentoo Penguin,Biscoe,
340,341,Gentoo Penguin,Biscoe,FEMALE
341,342,Gentoo Penguin,Biscoe,MALE
342,343,Gentoo Penguin,Biscoe,FEMALE


In [9]:
# Display df2 (ID plus the four numeric measurement columns).
df2

Unnamed: 0,ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,39.1,18.7,181.0,3750.0
1,2,39.5,17.4,186.0,3800.0
2,3,40.3,18.0,195.0,3250.0
3,4,,,,
4,5,36.7,19.3,193.0,3450.0
...,...,...,...,...,...
339,340,,,,
340,341,46.8,14.3,215.0,4850.0
341,342,50.4,15.7,222.0,5750.0
342,343,45.2,14.8,212.0,5200.0


In [10]:
# Exports of the two split tables are disabled (commented out). Uncomment to write
# penguins_info.csv and penguins_measurements.xlsx into the working directory.
# df1.to_csv('penguins_info.csv', index=False)
# df2.to_excel('penguins_measurements.xlsx', index=False)

In [11]:
# Build the first practice sample from `df` in a single chained expression.
# Every run starts from `df`, so the cell is idempotent and always yields the same
# 100 rows; nothing is mutated in place and `df` itself is left untouched.
test = (
    df
    .drop(columns=['Sample Number', 'ID'])   # leave the two numbering columns out
    .dropna(axis=0)                          # keep only rows without missing values
    .sample(100, random_state=24)            # fixed seed -> reproducible draw
    .reset_index(drop=True)                  # relabel the sampled rows 0..99
)

In [15]:
# Overwrite rows 5, 10, 15 and 20 with the values of row 88, leaving five identical rows
# in the frame (presumably planted duplicates for a de-duplication exercise; confirm).
test.loc[[5, 10, 15, 20], :] = test.loc[88, :].values

In [16]:
# Blank out selected cells with NaN: column name -> row labels to overwrite.
cells_to_blank = {
    'Body Mass (g)': [34, 56],
    'Culmen Length (mm)': [67, 69, 73],
    'Flipper Length (mm)': [89],
}
for column_name, row_labels in cells_to_blank.items():
    test.loc[row_labels, column_name] = np.nan

In [17]:
# Display the sample after the row overwrite and NaN injection above.
test

Unnamed: 0,Species,Island,Clutch Completion,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex
0,Gentoo Penguin,Biscoe,Yes,46.5,14.5,213.0,4400.0,FEMALE
1,Chinstrap Penguin,Dream,Yes,40.9,16.6,187.0,3200.0,FEMALE
2,Adelie Penguin,Dream,Yes,39.5,16.7,178.0,3250.0,FEMALE
3,Gentoo Penguin,Biscoe,Yes,42.9,13.1,215.0,5000.0,FEMALE
4,Gentoo Penguin,Biscoe,Yes,45.3,13.7,210.0,4300.0,FEMALE
...,...,...,...,...,...,...,...,...
95,Gentoo Penguin,Biscoe,Yes,49.3,15.7,217.0,5850.0,MALE
96,Gentoo Penguin,Biscoe,Yes,46.2,14.1,217.0,4375.0,FEMALE
97,Chinstrap Penguin,Dream,No,49.6,18.2,193.0,3775.0,MALE
98,Adelie Penguin,Dream,Yes,38.8,20.0,190.0,3950.0,MALE


In [18]:
# Export of the first practice sample is disabled (commented out). Uncomment to write
# penguins_test1.csv into the working directory.
# test.to_csv('penguins_test1.csv', index=False)

In [28]:
# Build the second practice sample from `df` in a single chained expression.
# Every run starts from `df`, so the cell is idempotent and always yields the same
# 200 rows; nothing is mutated in place and `df` itself is left untouched.
test2 = (
    df
    .dropna(axis=0)                       # keep only rows without missing values
    .sample(200, random_state=24)         # fixed seed -> reproducible draw
    .reset_index(drop=True)               # relabel the sampled rows 0..199
    .drop(columns=['Sample Number'])      # 'ID' is kept; it is used to split the sample
)

In [35]:
# IDs of the first 100 sampled rows; these go into both output tables.
sampled_ids = test2['ID']
test2_common_ID = sampled_ids.iloc[:100].to_numpy()

In [40]:
# IDs that belong to only one of the two output tables:
# positions 100-149 for the first table, positions 150 onward for the second.
test2_1_partial_ID = test2['ID'].iloc[100:150].to_numpy()
test2_2_partial_ID = test2['ID'].iloc[150:].to_numpy()

In [41]:
# Full ID list per table: the shared IDs followed by that table's own IDs.
test2_1_ID = np.concatenate((test2_common_ID, test2_1_partial_ID))
test2_2_ID = np.concatenate((test2_common_ID, test2_2_partial_ID))

In [47]:
# Select each table's rows by ID membership and keep only that table's columns:
# test2_1 gets the categorical fields, test2_2 gets the numeric measurements.
in_first_table = test2['ID'].isin(test2_1_ID)
in_second_table = test2['ID'].isin(test2_2_ID)
test2_1 = test2.loc[in_first_table, ['ID', 'Species', 'Island', 'Sex']]
test2_2 = test2.loc[
    in_second_table,
    ['ID', 'Culmen Length (mm)', 'Culmen Depth (mm)', 'Flipper Length (mm)', 'Body Mass (g)'],
]

In [50]:
# Write both tables to CSV in the working directory, without the row index.
for table, csv_name in (
    (test2_1, 'penguins_test2_1.csv'),
    (test2_2, 'penguins_test2_2.csv'),
):
    table.to_csv(csv_name, index=False)