# Data Analysis 101 - Pandas

Alex Chen

Source:

https://github.com/allisonhorst/palmerpenguins/tree/main

https://www.kaggle.com/datasets/parulpandey/palmer-archipelago-antarctica-penguin-data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 1. 讀入資料

In [2]:
# Read the penguin info table from a CSV file into df1.
df1 = pd.read_csv('penguins_info.csv')

In [3]:
# Read the penguin measurements table from an Excel workbook into df2.
df2 = pd.read_excel('penguins_measurements.xlsx')

# 2. 資料概覽

In [4]:
df1  # A bare DataFrame name as the cell's last expression displays the data

Unnamed: 0,ID,Species,Island,Sex
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
3,4,Adelie Penguin (Pygoscelis adeliae),Torgersen,
4,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
...,...,...,...,...
339,340,Gentoo penguin (Pygoscelis papua),Biscoe,
340,341,Gentoo penguin (Pygoscelis papua),Biscoe,FEMALE
341,342,Gentoo penguin (Pygoscelis papua),Biscoe,MALE
342,343,Gentoo penguin (Pygoscelis papua),Biscoe,FEMALE


In [5]:
df1.head()  # Show the first five rows

Unnamed: 0,ID,Species,Island,Sex
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
3,4,Adelie Penguin (Pygoscelis adeliae),Torgersen,
4,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE


In [6]:
df1.shape  # Show the number of rows and the number of columns

(344, 4)

In [7]:
df1.columns  # Show the column names

Index(['ID', 'Species', 'Island', 'Sex'], dtype='object')

In [8]:
df1.describe()  # Descriptive statistics for the numeric columns

Unnamed: 0,ID
count,344.0
mean,172.5
std,99.448479
min,1.0
25%,86.75
50%,172.5
75%,258.25
max,344.0


In [9]:
df1.info()  # Summary of the data: columns, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   ID       344 non-null    int64 
 1   Species  344 non-null    object
 2   Island   344 non-null    object
 3   Sex      334 non-null    object
dtypes: int64(1), object(3)
memory usage: 10.9+ KB


In [10]:
df1.dtypes  # Data type of each column

ID          int64
Species    object
Island     object
Sex        object
dtype: object

In [11]:
df1['Species'].value_counts()  # Count the occurrences of each category

Species
Adelie Penguin (Pygoscelis adeliae)          152
Gentoo penguin (Pygoscelis papua)            124
Chinstrap penguin (Pygoscelis antarctica)     68
Name: count, dtype: int64

In [12]:
# Preview the first five rows of the measurements table.
df2.head()

Unnamed: 0,ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,39.1,18.7,181.0,3750.0
1,2,39.5,17.4,186.0,3800.0
2,3,40.3,18.0,195.0,3250.0
3,4,,,,
4,5,36.7,19.3,193.0,3450.0


In [13]:
# Check the data type of each measurement column.
df2.dtypes

ID                       int64
Culmen Length (mm)     float64
Culmen Depth (mm)      float64
Flipper Length (mm)    float64
Body Mass (g)          float64
dtype: object

In [14]:
# Smallest value in the body-mass column.
df2['Body Mass (g)'].min()

np.float64(2700.0)

In [15]:
# Largest value in the body-mass column.
df2['Body Mass (g)'].max()

np.float64(6300.0)

In [16]:
# Arithmetic mean of the body-mass column.
df2['Body Mass (g)'].mean()

np.float64(4201.754385964912)

In [17]:
# Standard deviation of the body-mass column (pandas uses ddof=1 by default).
df2['Body Mass (g)'].std()

np.float64(801.9545356980956)

In [18]:
# Median of the body-mass column.
df2['Body Mass (g)'].median()

np.float64(4050.0)

# 3. 資料搜索

In [19]:
# Select a single column by its label; the result is a Series.
df2['Culmen Depth (mm)']

0      18.7
1      17.4
2      18.0
3       NaN
4      19.3
       ... 
339     NaN
340    14.3
341    15.7
342    14.8
343    16.1
Name: Culmen Depth (mm), Length: 344, dtype: float64

In [20]:
# Select several columns with a list of labels; the result is a DataFrame.
df2[['Culmen Depth (mm)', 'Body Mass (g)']]

Unnamed: 0,Culmen Depth (mm),Body Mass (g)
0,18.7,3750.0
1,17.4,3800.0
2,18.0,3250.0
3,,
4,19.3,3450.0
...,...,...
339,,
340,14.3,4850.0
341,15.7,5750.0
342,14.8,5200.0


In [21]:
# .loc selects by label: row label 1, column 'Culmen Depth (mm)' -> one scalar.
df2.loc[1, 'Culmen Depth (mm)']

np.float64(17.4)

In [22]:
# One row label with a list of column labels returns a Series.
df2.loc[1, ['Culmen Depth (mm)', 'Body Mass (g)']]

Culmen Depth (mm)      17.4
Body Mass (g)        3800.0
Name: 1, dtype: float64

In [23]:
# Lists for both the row labels and the column labels return a DataFrame.
df2.loc[[1, 4], ['Culmen Depth (mm)', 'Body Mass (g)']]

Unnamed: 0,Culmen Depth (mm),Body Mass (g)
1,17.4,3800.0
4,19.3,3450.0


In [24]:
# Boolean mask as the row selector: keep rows whose body mass is at least 6000,
# and show only the two listed columns.
df2.loc[df2['Body Mass (g)'] >= 6000, ['Culmen Depth (mm)', 'Body Mass (g)']]

Unnamed: 0,Culmen Depth (mm),Body Mass (g)
237,15.2,6300.0
253,17.0,6050.0
297,16.3,6000.0
337,16.2,6000.0


In [25]:
# Re-display the first rows as a reference for the position-based .iloc examples.
df2.head()

Unnamed: 0,ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,39.1,18.7,181.0,3750.0
1,2,39.5,17.4,186.0,3800.0
2,3,40.3,18.0,195.0,3250.0
3,4,,,,
4,5,36.7,19.3,193.0,3450.0


In [26]:
# .iloc selects by integer position: row position 1, column position 2.
df2.iloc[1, 2]

np.float64(17.4)

In [27]:
# Row position 1 with column positions 2 and 4; the result is a Series.
df2.iloc[1, [2, 4]]

Culmen Depth (mm)      17.4
Body Mass (g)        3800.0
Name: 1, dtype: float64

# 4. 資料清理與修改

In [28]:
# Build a new frame without the two culmen columns; df2 itself is not modified.
df_temp = df2.drop(columns=['Culmen Length (mm)', 'Culmen Depth (mm)'])

In [29]:
# Confirm that the culmen columns are gone.
df_temp.head()

Unnamed: 0,ID,Flipper Length (mm),Body Mass (g)
0,1,181.0,3750.0
1,2,186.0,3800.0
2,3,195.0,3250.0
3,4,,
4,5,193.0,3450.0


In [30]:
# Read the ID stored at row label 1.
df_temp.loc[1, 'ID']

np.int64(2)

In [31]:
# Overwrite that ID in place with 1, the same ID as row 0.
df_temp.loc[1, 'ID'] = 1

In [32]:
# Check the modified ID value.
df_temp.head()

Unnamed: 0,ID,Flipper Length (mm),Body Mass (g)
0,1,181.0,3750.0
1,1,186.0,3800.0
2,3,195.0,3250.0
3,4,,
4,5,193.0,3450.0


In [33]:
# Assign several columns of row 1 at once, using row 0's values so that
# row 1 becomes an exact duplicate of row 0.
df_temp.loc[1, ['Flipper Length (mm)', 'Body Mass (g)']] = [181.0, 3750.0]

In [34]:
# Rows 0 and 1 should now be identical.
df_temp.head()

Unnamed: 0,ID,Flipper Length (mm),Body Mass (g)
0,1,181.0,3750.0
1,1,181.0,3750.0
2,3,195.0,3250.0
3,4,,
4,5,193.0,3450.0


In [35]:
# Remove fully duplicated rows, keeping the first occurrence of each.
df_temp2 = df_temp.drop_duplicates(keep='first')

In [36]:
# The duplicate at row label 1 is gone; remaining rows keep their index labels.
df_temp2.head()

Unnamed: 0,ID,Flipper Length (mm),Body Mass (g)
0,1,181.0,3750.0
2,3,195.0,3250.0
3,4,,
4,5,193.0,3450.0
5,6,190.0,3650.0


In [37]:
# Compare the row counts before and after removing duplicates.
print('清理前資料筆數:', len(df_temp), '\n清理後資料筆數:', len(df_temp2))

清理前資料筆數: 344 
清理後資料筆數: 343


In [38]:
# For each column, report whether it contains at least one missing value.
df2.isna().any(axis=0)

ID                     False
Culmen Length (mm)      True
Culmen Depth (mm)       True
Flipper Length (mm)     True
Body Mass (g)           True
dtype: bool

In [39]:
df2.isna().sum(axis=0)  # Count of missing values per column

ID                     0
Culmen Length (mm)     2
Culmen Depth (mm)      2
Flipper Length (mm)    2
Body Mass (g)          2
dtype: int64

In [40]:
# Show every row that has a missing value in at least one column.
df2[df2.isna().any(axis=1)]

Unnamed: 0,ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
3,4,,,,
339,340,,,,


In [41]:
# Drop every row that contains a missing value.
# NOTE: this rebinds df2, so outputs shown by earlier cells no longer match its
# current contents. The surviving rows keep their original index labels, so
# the index now has gaps where rows were removed.
df2 = df2.dropna()

# 5. 資料合併

In [42]:
# Recall the info table before combining it with the measurements.
df1.head()

Unnamed: 0,ID,Species,Island,Sex
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE
3,4,Adelie Penguin (Pygoscelis adeliae),Torgersen,
4,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE


In [46]:
# Row and column counts of the info table.
df1.shape

(344, 4)

In [43]:
# Measurements table after the rows with missing values were dropped.
df2.head()

Unnamed: 0,ID,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,39.1,18.7,181.0,3750.0
1,2,39.5,17.4,186.0,3800.0
2,3,40.3,18.0,195.0,3250.0
4,5,36.7,19.3,193.0,3450.0
5,6,39.3,20.6,190.0,3650.0


In [47]:
# Row and column counts of the cleaned measurements table.
df2.shape

(342, 5)

In [44]:
# Inner join on ID: only IDs present in both tables appear in the result.
df_m1 = df1.merge(df2, on='ID', how='inner')

In [48]:
# Preview the inner-join result.
df_m1.head()

Unnamed: 0,ID,Species,Island,Sex,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE,39.1,18.7,181.0,3750.0
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,39.5,17.4,186.0,3800.0
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,40.3,18.0,195.0,3250.0
3,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,36.7,19.3,193.0,3450.0
4,6,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE,39.3,20.6,190.0,3650.0


In [49]:
# Row and column counts of the inner-join result.
df_m1.shape

(342, 8)

In [50]:
# Left join on ID: every row of df1 is kept; IDs with no match in df2 get
# NaN in the measurement columns.
df_m2 = df1.merge(df2, how='left', on='ID')

In [51]:
# Preview the left-join result.
df_m2.head()

Unnamed: 0,ID,Species,Island,Sex,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE,39.1,18.7,181.0,3750.0
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,39.5,17.4,186.0,3800.0
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,40.3,18.0,195.0,3250.0
3,4,Adelie Penguin (Pygoscelis adeliae),Torgersen,,,,,
4,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,36.7,19.3,193.0,3450.0


In [52]:
# Row and column counts of the left-join result.
df_m2.shape

(344, 8)

In [55]:
# Place the two tables side by side. concat lines rows up by index label, not
# by the ID values, and keeps the ID column from both inputs. Index labels
# missing from df2 produce NaN in its columns.
df_m3 = pd.concat([df1, df2], axis='columns')

In [56]:
# Preview the concatenated result.
df_m3.head()

Unnamed: 0,ID,Species,Island,Sex,ID.1,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g)
0,1,Adelie Penguin (Pygoscelis adeliae),Torgersen,MALE,1.0,39.1,18.7,181.0,3750.0
1,2,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,2.0,39.5,17.4,186.0,3800.0
2,3,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,3.0,40.3,18.0,195.0,3250.0
3,4,Adelie Penguin (Pygoscelis adeliae),Torgersen,,,,,,
4,5,Adelie Penguin (Pygoscelis adeliae),Torgersen,FEMALE,5.0,36.7,19.3,193.0,3450.0


In [57]:
# Row and column counts of the concatenated result.
df_m3.shape

(344, 9)

In [58]:
# Combined table: inner-join the info and measurements tables on ID.
# validate='one_to_one' makes pandas raise MergeError if an ID is repeated in
# either table, instead of silently multiplying rows in the merged result.
df = pd.merge(df1, df2, on='ID', how='inner', validate='one_to_one')

# 6. 資料聚合