## Pandas

- 파이썬에서 사용하는 데이터 분석 라이브러리
- 행과 열로 이루어진 데이터 객체를 만들어 다룰 수 있게 되며 보다 안정적으로 대용량의 데이터들을 처리하는데 매우 편리
numpy는 배열! pandas는 엑셀!
- https://pandas.pydata.org/docs/


## Series

In [3]:
# Series 하나의 열에 대하여 처리
import pandas as pd
obj = pd.Series([2,4,6,8,10])
print(obj)

0     2
1     4
2     6
3     8
4    10
dtype: int64


In [4]:
print(obj.values)
print(type(obj.values))

print(obj.index)
print(type(obj.index))

print(obj.dtype)
print(type(obj.dtype))

[ 2  4  6  8 10]
<class 'numpy.ndarray'>
RangeIndex(start=0, stop=5, step=1)
<class 'pandas.core.indexes.range.RangeIndex'>
int64
<class 'numpy.dtypes.Int64DType'>


In [5]:
obj = pd.Series([1,3,5,7,9], index = ["a", "b", "c", "d", "e"])
print(obj)

a    1
b    3
c    5
d    7
e    9
dtype: int64


In [6]:
# json 구조에서 키값이 index가 되는 형태
dic_data ={
    "x ":100,
    "y" : 200,
    "z":300
    
}
obj = pd.Series(dic_data)
print(obj)

x     100
y     200
z     300
dtype: int64


In [None]:
# 이미 선언된 index를 변경
obj.index = ["Q", "W", "E"]
print(obj)

Q    100
W    200
E    300
dtype: int64


In [None]:
# 이름 지어주기
obj.index.name = "idx"
obj.name = "my_data"
print(obj)

idx
Q    100
W    200
E    300
Name: my_data, dtype: int64


### Data Frame

In [10]:
# 행렬, Table 모양으로 데이터 처리
# 엑셀과 비슷
data = {
    "name" : ["A","B", "C", "D"],
    "age" : [20,21,22,23],
    "blood": ["B", "B", "A", "O"]
    
    
}
df = pd.DataFrame(data)
df

Unnamed: 0,name,age,blood
0,A,20,B
1,B,21,B
2,C,22,A
3,D,23,O


In [12]:
print(df.index)
print(df.columns)
print(df.values)
print(type(df.values))

RangeIndex(start=0, stop=4, step=1)
Index(['name', 'age', 'blood'], dtype='object')
[['A' 20 'B']
 ['B' 21 'B']
 ['C' 22 'A']
 ['D' 23 'O']]
<class 'numpy.ndarray'>


In [14]:
df.index.name = "No."
df.columns.name = "Info"
df

Info,name,age,blood
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,A,20,B
1,B,21,B
2,C,22,A
3,D,23,O


In [None]:
# 새로운 columns을 추가하고 값을 주지 않으면 NaN이라고 출려됨. 의미는 "없다"
df = pd.DataFrame(data, columns=["name","age", "blood","MBTI"], index=[1,2,3,4])
df

Unnamed: 0,name,age,blood,MBTI
1,A,20,B,
2,B,21,B,
3,C,22,A,
4,D,23,O,


In [None]:
# 중요!
# 해당 데이터에 대해 설명해줘라 : 숫자만 분석가능!
print(df.describe())

             age
count   4.000000
mean   21.500000
std     1.290994
min    20.000000
25%    20.750000
50%    21.500000
75%    22.250000
max    23.000000


In [None]:
# 저장된 데이터 출력하기
print(df["name"])
print(df.name)

1    A
2    B
3    C
4    D
Name: name, dtype: object
1    A
2    B
3    C
4    D
Name: name, dtype: object


In [None]:
# 2개 이상의 column 조회 가능
print(df[["name","MBTI"]])
# print(df[["name"],["MBTI"]]) 이 형태는 오류가 난다...!

  name MBTI
1    A  NaN
2    B  NaN
3    C  NaN
4    D  NaN


InvalidIndexError: (['name'], ['MBTI'])

In [20]:
df["point"] = 0
df

Unnamed: 0,name,age,blood,MBTI,point
1,A,20,B,,0
2,B,21,B,,0
3,C,22,A,,0
4,D,23,O,,0


In [22]:
df ["point"] = [100,200,300,0]
df

Unnamed: 0,name,age,blood,MBTI,point
1,A,20,B,,100
2,B,21,B,,200
3,C,22,A,,300
4,D,23,O,,0


In [23]:
import numpy as np

df["np_idx"] = np.arange(4)
df

Unnamed: 0,name,age,blood,MBTI,point,np_idx
1,A,20,B,,100,0
2,B,21,B,,200,1
3,C,22,A,,300,2
4,D,23,O,,0,3


In [None]:
# 중요!
# 특정 인덱스에 minus 값 입력하기

val = pd.Series([-1.2,-1.5,-1.7], index = [2,3,4])
df["minus"] = val
df

Unnamed: 0,name,age,blood,MBTI,point,np_idx,minus
1,A,20,B,,100,0,
2,B,21,B,,200,1,-1.2
3,C,22,A,,300,2,-1.5
4,D,23,O,,0,3,-1.7


In [28]:
df["np_idx"]= df["age"]
df

Unnamed: 0,name,age,blood,MBTI,point,np_idx,minus
1,A,20,B,,100,20,
2,B,21,B,,200,21,-1.2
3,C,22,A,,300,22,-1.5
4,D,23,O,,0,23,-1.7


In [31]:
df["bool_test"] = df["age"] % 2 == 0
df

Unnamed: 0,name,age,blood,MBTI,point,np_idx,minus,bool_test
1,A,20,B,,100,20,,True
2,B,21,B,,200,21,-1.2,False
3,C,22,A,,300,22,-1.5,True
4,D,23,O,,0,23,-1.7,False


In [32]:
df.index.name = "No."
df.columns.name = "Info"
df

Info,name,age,blood,MBTI,point,np_idx,minus,bool_test
No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,A,20,B,,100,20,,True
2,B,21,B,,200,21,-1.2,False
3,C,22,A,,300,22,-1.5,True
4,D,23,O,,0,23,-1.7,False


In [None]:
print(df[0:2]) # 우와!

Info name  age blood MBTI  point  np_idx  minus  bool_test
No.                                                       
1       A   20     B  NaN    100      20    NaN       True
2       B   21     B  NaN    200      21   -1.2      False


In [None]:
df.index = ["ONE","TWO","THREE","FOUR"]
print(df)
print(df["TWO":"FOUR"])


Info  name  age blood MBTI  point  np_idx  minus  bool_test
ONE      A   20     B  NaN    100      20    NaN       True
TWO      B   21     B  NaN    200      21   -1.2      False
THREE    C   22     A  NaN    300      22   -1.5       True
FOUR     D   23     O  NaN      0      23   -1.7      False
Info  name  age blood MBTI  point  np_idx  minus  bool_test
TWO      B   21     B  NaN    200      21   -1.2      False
THREE    C   22     A  NaN    300      22   -1.5       True
FOUR     D   23     O  NaN      0      23   -1.7      False


In [None]:
# location ⭐️
print(df.loc["TWO"])
print("="*20)
print(df.loc[:,"name":"blood"])

Info
name             B
age             21
blood            B
MBTI           NaN
point          200
np_idx          21
minus         -1.2
bool_test    False
Name: TWO, dtype: object
Info  name  age blood
ONE      A   20     B
TWO      B   21     B
THREE    C   22     A
FOUR     D   23     O


In [38]:
print(df.loc["TWO": "THREE", "point"])
print("="*20)
print(df.loc[:, "name":"blood"])

TWO      200
THREE    300
Name: point, dtype: int64
Info  name  age blood
ONE      A   20     B
TWO      B   21     B
THREE    C   22     A
FOUR     D   23     O


In [39]:
# 칼럼 삭제
del df["np_idx"]
df

Info,name,age,blood,MBTI,point,minus,bool_test
ONE,A,20,B,,100,,True
TWO,B,21,B,,200,-1.2,False
THREE,C,22,A,,300,-1.5,True
FOUR,D,23,O,,0,-1.7,False


In [40]:
df.loc["FIVE",:] = ["E", 30, "AB", "ISTP", 0, 1, False]
df

Info,name,age,blood,MBTI,point,minus,bool_test
ONE,A,20.0,B,,100.0,,True
TWO,B,21.0,B,,200.0,-1.2,False
THREE,C,22.0,A,,300.0,-1.5,True
FOUR,D,23.0,O,,0.0,-1.7,False
FIVE,E,30.0,AB,ISTP,0.0,1.0,False


In [None]:
# iloc : index location
print(df.iloc[1])
print("="*20)
print(df.iloc[1:2])

Info
name             B
age           21.0
blood            B
MBTI           NaN
point        200.0
minus         -1.2
bool_test    False
Name: TWO, dtype: object
Info name   age blood MBTI  point  minus bool_test
TWO     B  21.0     B  NaN  200.0   -1.2     False


In [43]:
print(df.iloc[0:2, 0:2])
print(df.iloc[[0,1,3],
            [0,3]])
print(df.iloc[:,1:4])

Info name   age
ONE     A  20.0
TWO     B  21.0
Info name MBTI
ONE     A  NaN
TWO     B  NaN
FOUR    D  NaN
Info    age blood  MBTI
ONE    20.0     B   NaN
TWO    21.0     B   NaN
THREE  22.0     A   NaN
FOUR   23.0     O   NaN
FIVE   30.0    AB  ISTP


In [None]:
print(df["age"] < 22)

ONE       True
TWO       True
THREE    False
FOUR     False
FIVE     False
Name: age, dtype: bool


In [47]:
print(df.loc[df["age"] < 22,:])

Info name   age blood MBTI  point  minus bool_test
ONE     A  20.0     B  NaN  100.0    NaN      True
TWO     B  21.0     B  NaN  200.0   -1.2     False


In [48]:
print(df.loc[df["name"] == "A", ["name", "age"]])
print("="* 20)
print(df.loc[(df["name"] == "A") | (df["name"] == "B"), ["name","age"]])

Info name   age
ONE     A  20.0
Info name   age
ONE     A  20.0
TWO     B  21.0


In [50]:
df.loc[df["point"] == 0, "point"] = 10000
df

Info,name,age,blood,MBTI,point,minus,bool_test
ONE,A,20.0,B,,100.0,,True
TWO,B,21.0,B,,200.0,-1.2,False
THREE,C,22.0,A,,300.0,-1.5,True
FOUR,D,23.0,O,,10000.0,-1.7,False
FIVE,E,30.0,AB,ISTP,10000.0,1.0,False


### data

In [51]:
df = pd.DataFrame(np.random.randn(6,4))
df

Unnamed: 0,0,1,2,3
0,-0.010583,2.292658,-1.401884,-0.424774
1,0.774703,1.653749,0.35548,0.112402
2,-0.847,-0.301473,1.701972,0.750698
3,0.702933,-0.392122,-0.046122,1.206076
4,-0.397919,0.052,1.166403,1.35371
5,-0.515099,-0.828885,0.954446,1.26524


In [54]:
df.columns = ["A","B","C","D"]
df.index = pd.date_range("20260101", periods=6)
print(df.index)
print("="*100)
print(df)

DatetimeIndex(['2026-01-01', '2026-01-02', '2026-01-03', '2026-01-04',
               '2026-01-05', '2026-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774
2026-01-02  0.774703  1.653749  0.355480  0.112402
2026-01-03 -0.847000 -0.301473  1.701972  0.750698
2026-01-04  0.702933 -0.392122 -0.046122  1.206076
2026-01-05 -0.397919  0.052000  1.166403  1.353710
2026-01-06 -0.515099 -0.828885  0.954446  1.265240


In [None]:
# .nan 없다 표시 가능
df["F"] = [1.0, np.nan, 3.5, 6.1, np.nan, 7.0]
df

Unnamed: 0,A,B,C,D,F
2026-01-01,-0.010583,2.292658,-1.401884,-0.424774,1.0
2026-01-02,0.774703,1.653749,0.35548,0.112402,
2026-01-03,-0.847,-0.301473,1.701972,0.750698,3.5
2026-01-04,0.702933,-0.392122,-0.046122,1.206076,6.1
2026-01-05,-0.397919,0.052,1.166403,1.35371,
2026-01-06,-0.515099,-0.828885,0.954446,1.26524,7.0


In [None]:
# dropna 빼기?
print(df.dropna(how="any")) # nan이 있는 데이터를 제거하고 볼때 사용 실제로 데이터가 지워지는게 아니고 빼고 잠깐 보는 용도로 사용
print("="*100)
print(df)

                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0
                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-02  0.774703  1.653749  0.355480  0.112402  NaN
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  NaN
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0


In [None]:
print(df.dropna(how="all")) # 모든 nan 제거 하고 보여줌.
print("="*100)
print(df)

                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-02  0.774703  1.653749  0.355480  0.112402  NaN
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  NaN
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0
                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-02  0.774703  1.653749  0.355480  0.112402  NaN
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  NaN
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0


In [59]:
# 비어 있는 칼럼은 특정 값으로 채우기
print(df.fillna(value=0.5))

                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-02  0.774703  1.653749  0.355480  0.112402  0.5
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  0.5
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0


In [60]:
print(df.isnull())

                A      B      C      D      F
2026-01-01  False  False  False  False  False
2026-01-02  False  False  False  False   True
2026-01-03  False  False  False  False  False
2026-01-04  False  False  False  False  False
2026-01-05  False  False  False  False   True
2026-01-06  False  False  False  False  False


In [61]:
print(df.loc[df.isnull()["F"],:])

                   A         B         C         D   F
2026-01-02  0.774703  1.653749  0.355480  0.112402 NaN
2026-01-05 -0.397919  0.052000  1.166403  1.353710 NaN


In [62]:
pd.to_datetime("20260102")

Timestamp('2026-01-02 00:00:00')

In [None]:
print(df.drop(pd.to_datetime("20260102")))
print("=" * 100)
df.drop([pd.to_datetime("20260102"),pd.to_datetime("20260104")])

df

                   A         B         C         D    F
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774  1.0
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  NaN
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0


Unnamed: 0,A,B,C,D,F
2026-01-01,-0.010583,2.292658,-1.401884,-0.424774,1.0
2026-01-02,0.774703,1.653749,0.35548,0.112402,
2026-01-03,-0.847,-0.301473,1.701972,0.750698,3.5
2026-01-04,0.702933,-0.392122,-0.046122,1.206076,6.1
2026-01-05,-0.397919,0.052,1.166403,1.35371,
2026-01-06,-0.515099,-0.828885,0.954446,1.26524,7.0


In [None]:
print(df.drop("F", axis=1))
df


                   A         B         C         D
2026-01-01 -0.010583  2.292658 -1.401884 -0.424774
2026-01-02  0.774703  1.653749  0.355480  0.112402
2026-01-03 -0.847000 -0.301473  1.701972  0.750698
2026-01-04  0.702933 -0.392122 -0.046122  1.206076
2026-01-05 -0.397919  0.052000  1.166403  1.353710
2026-01-06 -0.515099 -0.828885  0.954446  1.265240


Unnamed: 0,A,B,C,D,F
2026-01-01,-0.010583,2.292658,-1.401884,-0.424774,1.0
2026-01-02,0.774703,1.653749,0.35548,0.112402,
2026-01-03,-0.847,-0.301473,1.701972,0.750698,3.5
2026-01-04,0.702933,-0.392122,-0.046122,1.206076,6.1
2026-01-05,-0.397919,0.052,1.166403,1.35371,
2026-01-06,-0.515099,-0.828885,0.954446,1.26524,7.0


In [72]:
print(df.drop("20260101",axis=0))

df


                   A         B         C         D    F
2026-01-02  0.774703  1.653749  0.355480  0.112402  NaN
2026-01-03 -0.847000 -0.301473  1.701972  0.750698  3.5
2026-01-04  0.702933 -0.392122 -0.046122  1.206076  6.1
2026-01-05 -0.397919  0.052000  1.166403  1.353710  NaN
2026-01-06 -0.515099 -0.828885  0.954446  1.265240  7.0


Unnamed: 0,A,B,C,D,F
2026-01-01,-0.010583,2.292658,-1.401884,-0.424774,1.0
2026-01-02,0.774703,1.653749,0.35548,0.112402,
2026-01-03,-0.847,-0.301473,1.701972,0.750698,3.5
2026-01-04,0.702933,-0.392122,-0.046122,1.206076,6.1
2026-01-05,-0.397919,0.052,1.166403,1.35371,
2026-01-06,-0.515099,-0.828885,0.954446,1.26524,7.0


### 함수

In [73]:
data = [[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]]
df = pd.DataFrame(data, columns=['one', 'two'], index=['a', 'b', 'c', 'd'])
df


Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [74]:
print(df.head(2))

   one  two
a  1.4  NaN
b  7.1 -4.5


In [75]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, a to d
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   one     3 non-null      float64
 1   two     2 non-null      float64
dtypes: float64(2)
memory usage: 96.0+ bytes
None


In [78]:
print(df.sum(axis=0))
print(df.sum(axis=1))
print(df.sum(axis=0, skipna = False))

one    9.25
two   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64
one   NaN
two   NaN
dtype: float64


In [79]:
print(df['one'].sum())
print(df.loc['b'].sum())

9.25
2.5999999999999996


In [80]:
# count - 전체 성분의(NaN이 아닌)값의 갯수를 계산
# min // max - 전체 성본의 최소, 최댓값을 계산
# argmin // argmax - 전체 성분의 최솟값, 최댓값이 위치한 (정수)인덱스를 반환
# idxmin // idxmax - 전체 인덱스 중 최솟값, 최댓값을 반환
# quantile - 전체 성분의 특정 사분위수에 해당하는 값을 반환
# sum - 전체 성분의 합을 계산
# mean - 전체 성분의 평균을 계산
# median - 전체 성분의 중간값을 반환
# mad - 전체 성분의 평균값으로부터의 절대 편차(absolute deviation)의 평균을 계산
# std // var - 전체 성분의 표준편차, 분산을 계산
# cumsum - 맨 첫 번째 성분부터 각 성분까지의 누적합을 계산(0에서부터 계속 더해짐)
# cumprod - 맨 첫 번째 성분부터 각 성분까지의 누적곱을 계산 (1에서부터 계속 곱해짐)

In [83]:
df = pd.DataFrame(np.random.randn(6,4),
                  columns=["A","b","C","D"],
                  index = pd.date_range("20260101", periods=6))
print(df)
print("="* 100)
dates = df.index
random_dates = np.random.permutation(dates)
df = df.reindex(index=random_dates, columns=["D", "B", "C", "A"])
df

                   A         b         C             D
2026-01-01 -0.163644  2.018152  0.283935 -3.591348e-01
2026-01-02  1.584700  1.271968  0.374535 -1.318984e-07
2026-01-03  1.287597 -2.034548  0.478180  1.775669e+00
2026-01-04 -1.535179 -1.686221 -2.037179  5.474350e-02
2026-01-05  2.639789  0.379181  0.030904  3.329304e-01
2026-01-06 -0.705714  0.065238 -0.410948 -5.372677e-01


Unnamed: 0,D,B,C,A
2026-01-02,-1.318984e-07,,0.374535,1.5847
2026-01-03,1.775669,,0.47818,1.287597
2026-01-01,-0.3591348,,0.283935,-0.163644
2026-01-04,0.0547435,,-2.037179,-1.535179
2026-01-06,-0.5372677,,-0.410948,-0.705714
2026-01-05,0.3329304,,0.030904,2.639789


In [88]:
print(df.sort_index(axis=0))
print(df.sort_index(axis=0).sort_index(axis=1))

print(df.sort_index(axis=1))
print(df.sort_index(axis=1, ascending=False))

                       D   B         C         A
2026-01-01 -3.591348e-01 NaN  0.283935 -0.163644
2026-01-02 -1.318984e-07 NaN  0.374535  1.584700
2026-01-03  1.775669e+00 NaN  0.478180  1.287597
2026-01-04  5.474350e-02 NaN -2.037179 -1.535179
2026-01-05  3.329304e-01 NaN  0.030904  2.639789
2026-01-06 -5.372677e-01 NaN -0.410948 -0.705714
                   A   B         C             D
2026-01-01 -0.163644 NaN  0.283935 -3.591348e-01
2026-01-02  1.584700 NaN  0.374535 -1.318984e-07
2026-01-03  1.287597 NaN  0.478180  1.775669e+00
2026-01-04 -1.535179 NaN -2.037179  5.474350e-02
2026-01-05  2.639789 NaN  0.030904  3.329304e-01
2026-01-06 -0.705714 NaN -0.410948 -5.372677e-01
                   A   B         C             D
2026-01-02  1.584700 NaN  0.374535 -1.318984e-07
2026-01-03  1.287597 NaN  0.478180  1.775669e+00
2026-01-01 -0.163644 NaN  0.283935 -3.591348e-01
2026-01-04 -1.535179 NaN -2.037179  5.474350e-02
2026-01-06 -0.705714 NaN -0.410948 -5.372677e-01
2026-01-05  2.639789

In [86]:
print(df.sort_values(by='D'))

                       D   B         C         A
2026-01-06 -5.372677e-01 NaN -0.410948 -0.705714
2026-01-01 -3.591348e-01 NaN  0.283935 -0.163644
2026-01-02 -1.318984e-07 NaN  0.374535  1.584700
2026-01-04  5.474350e-02 NaN -2.037179 -1.535179
2026-01-05  3.329304e-01 NaN  0.030904  2.639789
2026-01-03  1.775669e+00 NaN  0.478180  1.287597


In [91]:
df["E"] = np.random.randint(0,6, size=6)
df["F"]= ["alpha", "beta", "gamma","gamma", "alpha", "gamma"]
print(df)
print(df.sort_values(by=['E','F']))


                       D   B         C         A  E      F
2026-01-02 -1.318984e-07 NaN  0.374535  1.584700  5  alpha
2026-01-03  1.775669e+00 NaN  0.478180  1.287597  4   beta
2026-01-01 -3.591348e-01 NaN  0.283935 -0.163644  2  gamma
2026-01-04  5.474350e-02 NaN -2.037179 -1.535179  4  gamma
2026-01-06 -5.372677e-01 NaN -0.410948 -0.705714  4  alpha
2026-01-05  3.329304e-01 NaN  0.030904  2.639789  0  gamma
                       D   B         C         A  E      F
2026-01-05  3.329304e-01 NaN  0.030904  2.639789  0  gamma
2026-01-01 -3.591348e-01 NaN  0.283935 -0.163644  2  gamma
2026-01-06 -5.372677e-01 NaN -0.410948 -0.705714  4  alpha
2026-01-03  1.775669e+00 NaN  0.478180  1.287597  4   beta
2026-01-04  5.474350e-02 NaN -2.037179 -1.535179  4  gamma
2026-01-02 -1.318984e-07 NaN  0.374535  1.584700  5  alpha


In [92]:
print(df['F'].unique())
print(df['F'].value_counts())

['alpha' 'beta' 'gamma']
F
gamma    3
alpha    2
beta     1
Name: count, dtype: int64
