# 四則運算

In [None]:
import pandas as pd

# Two integer Series sharing the default 0..2 index.
s1 = pd.Series(data=[1, 2, 3])
s2 = pd.Series(data=[4, 5, 6])

# add(): element-wise s1 + s2
summed = s1.add(s2)
print(summed)

# sub(): element-wise s1 - s2
difference = s1.sub(s2)
print(difference)

0    5
1    7
2    9
dtype: int64
0   -3
1   -3
2   -3
dtype: int64


In [5]:
# Two 2x2 frames with identical column labels ("a", "b") and default index.
df1 = pd.DataFrame(dict(a=[10, 30], b=[20, 40]))
df2 = pd.DataFrame(dict(a=[1, 3], b=[2, 4]))

# mul(): element-wise product
product = df1.mul(df2)
print(product)

# div(): element-wise true division (the result is float)
quotient = df1.div(df2)
print(quotient)

    a    b
0  10   40
1  90  160
      a     b
0  10.0  10.0
1  10.0  10.0


# 邏輯運算

In [10]:
# Element-wise comparison methods of s1, each applied against s2 and
# printed in this order; every call returns a boolean Series.
comparisons = (
    s1.gt,  # greater than
    s1.lt,  # less than
    s1.ge,  # greater than or equal
    s1.le,  # less than or equal
    s1.eq,  # equal
    s1.ne,  # not equal
)

for compare in comparisons:
    print(compare(s2))

0    False
1    False
2    False
dtype: bool
0    True
1    True
2    True
dtype: bool
0    False
1    False
2    False
dtype: bool
0    True
1    True
2    True
dtype: bool
0    False
1    False
2    False
dtype: bool
0    True
1    True
2    True
dtype: bool


# 應用 Numpy 函數

In [None]:
import numpy as np
s = pd.Series(data=[1, 2, 3])

# np.square(): applying a NumPy function to a Series; the printed type
# shows that the result is still a pandas Series.
x = np.square(s)
result_type = type(x)

print(x, result_type)

0    1
1    4
2    9
dtype: int64 <class 'pandas.core.series.Series'>


In [None]:
# np.random.randint(): random integers in [60, 100) arranged as a 2x2 array.

name = ['Benny', 'Juno']      # column labels
score = ["first", "second"]   # row labels

# An ndarray can be passed directly as the table data of a DataFrame.
random_scores = np.random.randint(60, 100, size=(2, 2))
df = pd.DataFrame(data=random_scores, index=score, columns=name)

df


Unnamed: 0,Benny,Juno
first,76,79
second,76,60


# NaN 的處理

In [18]:
# Arithmetic involving NaN yields NaN: any position where either operand
# is NaN is NaN in the result.
s1 = pd.Series(data=[1, np.nan, 5])
s2 = pd.Series(data=[np.nan, 2, 5])

nan_sum = s1.add(s2)
print(nan_sum)

0     NaN
1     NaN
2    10.0
dtype: float64


In [23]:
# dropna(): remove NaN entries and return a new Series or DataFrame
without_nan = s1.dropna()
print(without_nan)

# fillna(value): replace NaN with the given value and return a new
# Series or DataFrame
filled = s1.fillna("補")
print(filled)

# isna(): boolean mask, True where the value is NaN
is_missing = s1.isna()
print(is_missing)

# notna(): boolean mask, True where the value is not NaN
is_present = s1.notna()
print(is_present)

0    1.0
2    5.0
dtype: float64
0    1.0
1      補
2    5.0
dtype: object
0    False
1     True
2    False
dtype: bool
0     True
1    False
2     True
dtype: bool


# 基礎統計
**累積（Cumulative）**：對每一行（或列）中的每個位置，都會將該位置之前的所有數據一併納入計算並逐步累積（例如 `cummax()`、`cummin()`、`cumsum()`）。常用於追蹤隨時間變化的累積值，例如股票價格截至目前為止的最高價。

In [30]:
df = pd.DataFrame(
    {
        'A': [1, 3, 2, 5, 4],
        "B": [2, 1, 1, 1, 1],
        "C": [5, 1, 1, 1, 1],
    }
)

# The first row of the frame, as a Series indexed by column label.
first_row = df.iloc[0]

# cummax(): running maximum across the first row
print(first_row.cummax(), "\n")

# max(): single largest value in the first row
print(first_row.max())

A    1
B    2
C    5
Name: 0, dtype: int64 

5


In [32]:
column_a = df["A"]

# cummin(): running minimum of column "A"
print(column_a.cummin(), "\n")

# min(): single smallest value of column "A"
print(column_a.min())

0    1
1    1
2    1
3    1
4    1
Name: A, dtype: int64 

1


In [33]:
column_a = df["A"]

# cumsum(): running total of column "A"
print(column_a.cumsum(), "\n")

# sum(): grand total of column "A"
print(column_a.sum())

0     1
1     4
2     6
3    11
4    15
Name: A, dtype: int64 

15


In [34]:
column_a = df["A"]

# mean(): arithmetic mean of column "A"
print(column_a.mean(), "\n")

# median(): middle value of column "A"
print(column_a.median(), "\n")

# std(): standard deviation of column "A"
print(column_a.std())

3.0 

3.0 

1.5811388300841898


# 增加 index

In [47]:
# NOTE: each list below is passed as one ROW of the DataFrame; the labels in
# `columns` name the positions inside a row. Despite the variable names,
# `eng` is therefore row 0, `math` is row 1 and `social` is row 2.
eng = [4, 3, 2]
math = [4, 3, 2]
social = [4, 3, 2]

df = pd.DataFrame(
    [eng, math, social],
    columns=["Eng", "Math", "Social"],
    index=range(3),
)

# Row-wise totals. sum(axis=1) adds across the columns of every row, so it
# works for any number of rows instead of a hard-coded range(3) loop.
total = df.sum(axis=1)
df["Total"] = total
print(df, "\n")

# mean() defaults to axis=0: one mean per column (including "Total").
ave = df.mean()
print(ave, "\n")

# Assigning to a new label through .loc appends a row named "Average".
# The printed frame shows the values as floats from this point on.
df.loc["Average"] = ave
print(df, "\n")

# drop(): remove the row labelled "Average" in place.
df.drop(index=["Average"], inplace=True)
print(df)


   Eng  Math  Social  Total
0    4     3       2      9
1    4     3       2      9
2    4     3       2      9 

Eng       4.0
Math      3.0
Social    2.0
Total     9.0
dtype: float64 

         Eng  Math  Social  Total
0        4.0   3.0     2.0    9.0
1        4.0   3.0     2.0    9.0
2        4.0   3.0     2.0    9.0
Average  4.0   3.0     2.0    9.0 

   Eng  Math  Social  Total
0  4.0   3.0     2.0    9.0
1  4.0   3.0     2.0    9.0
2  4.0   3.0     2.0    9.0


# 排序

In [61]:
# Each list is one row of scores in the order Eng, Math, Social.
a = [6, 2, 9]
b = [9, 1, 9]
c = [9, 2, 8]

df = pd.DataFrame(
    [a, b, c],
    columns=["Eng", "Math", "Social"],
    index=range(3),
)

# Row-wise totals. sum(axis=1) adds across the columns of every row, so it
# works for any number of rows instead of a hard-coded range(3) loop.
total = df.sum(axis=1)
df["Total"] = total

# Column means (default axis=0), appended as a summary row named "Average".
ave = df.mean()
df.loc["Average"] = ave

print(df, "\n")

# Remove the summary row before sorting and ranking so that only the real
# rows take part in either step.
df.drop(index=["Average"], inplace=True)

# Sort by Total, highest first. A stable sort keeps rows with equal totals
# in their original relative order, so tied rows print deterministically.
df.sort_values(by='Total', ascending=False, kind="mergesort", inplace=True)

# method="dense": equal values share the same rank and the following rank
# is not skipped (1, 1, 2 rather than 1, 1, 3).
df['Rank'] = df['Total'].rank(method='dense', ascending=False).astype(int)

print(df, "\n")

# Restore the original row order by index label.
df.sort_index(inplace=True)
print(df)


         Eng      Math    Social      Total
0        6.0  2.000000  9.000000  17.000000
1        9.0  1.000000  9.000000  19.000000
2        9.0  2.000000  8.000000  19.000000
Average  8.0  1.666667  8.666667  18.333333 

   Eng  Math  Social  Total  Rank
1  9.0   1.0     9.0   19.0     1
2  9.0   2.0     8.0   19.0     1
0  6.0   2.0     9.0   17.0     2 

   Eng  Math  Social  Total  Rank
0  6.0   2.0     9.0   17.0     2
1  9.0   1.0     9.0   19.0     1
2  9.0   2.0     8.0   19.0     1
