In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive at /content/drive; this step requires Google authorization.
drive.mount('/content/drive')
# Show matplotlib figures inline in the notebook output.
%matplotlib inline

## 載入第二部分使用到的資料集

In [5]:
# Load the example dataset: the Titanic data, which shows up frequently in machine-learning material.
# The path points at My Drive/Python-Data-Analysis-master/dataset/titanic/train.csv on the
# mounted Google Drive; adjust it if the files were placed somewhere else.
titanic_csv = '/content/drive/My Drive/Python-Data-Analysis-master/dataset/titanic/train.csv'
df = pd.read_csv(titanic_csv)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## 在DataFrame中進行數值統計

在資料分析的過程中，我們會希望對整份資料或特定欄位的整體趨勢做說明，而不是直接把所有資料湊到別人的鼻子下請他們自己看。要用簡單的方式描述資料，就得利用統計學上的一些統計量。別擔心，其中很多我們應該已經耳熟能詳了，更進階的統計概念也會在之後的統計學單元介紹給大家。現在就讓我們先來看看如何對 DataFrame 這類資料做數值統計吧。

- ### 加總、計數、以及其他統計量

如同numpy中的np.sum(), np.mean()，在Pandas中我們也使用類似的方法做計算。

In [6]:
# Peek at the first five rows before computing any statistics.
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
# Mean of the Survived column. The previewed rows show it coded as 0/1,
# so the mean reads as the share of passengers who survived.
survived = df['Survived']
survived.mean()

0.3838383838383838

In [8]:
# Largest value in the Fare column.
df.loc[:, 'Fare'].max()

512.3292

In [9]:
# For a categorical column (e.g. strings), .value_counts() reports
# how many times each category occurs.
sex_column = df['Sex']
sex_column.value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [10]:
# Instead of inspecting columns one at a time, take the mean of every numeric column at once.
# numeric_only=True is required: since pandas 2.0 a bare df.mean() no longer skips
# non-numeric columns and raises TypeError on string columns such as Name or Ticket.
# Note that categorical variables encoded as numbers (e.g. Pclass) are still treated
# as continuous and therefore included in the result.
df.mean(numeric_only=True)

PassengerId    446.000000
Survived         0.383838
Pclass           2.308642
Age             29.699118
SibSp            0.523008
Parch            0.381594
Fare            32.204208
dtype: float64

In [11]:
# A simpler route is df.describe(), which summarises the columns of the requested type.
# For numeric columns it reports the commonly used statistics
# (count, mean, std, min, quartiles, max).
numeric_summary = df.describe(include='number')
numeric_summary

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [12]:
# 在類別型欄位會呈現計數、有幾個類別、以及出現最多次類別的頻率等資訊
df.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Niskanen, Mr. Juha",male,1601,C23 C25 C27,S
freq,1,577,7,4,644
