In [1]:
import pandas as pd

### Step1. 將TEJ下載下來的數據進行處理

In [2]:
# Load the TEJ export; the .txt dump is small and easy to work with.
# The file is assumed to be tab-separated.
# 'ANSI' is only a codec alias on Windows (for the locale-dependent 'mbcs' codec), so it
# raises LookupError on Linux/macOS and decodes differently on non-Chinese Windows locales.
# 'cp950' is the code page ANSI resolves to on a Traditional Chinese Windows install and
# is available on every platform.
# NOTE(review): assumes Vol.txt was exported on a zh-TW Windows machine — confirm.
df = pd.read_table('Vol.txt', encoding='cp950', sep='\t')
df

Unnamed: 0,證券代碼,簡稱,年月,成交量(百萬股)_月
0,1101,台泥,201301,171
1,1102,亞泥,201301,208
2,1103,嘉泥,201301,9
3,1104,環泥,201301,17
4,1108,幸福,201301,11
...,...,...,...,...
118886,9945,潤泰新,202406,282
118887,9946,三發地產,202406,71
118888,9955,佳龍,202406,23
118889,9958,世紀鋼,202406,193


### Step2. 將Columns名稱更改

In [3]:
# Swap the original TEJ headers for short English names. The assignment is positional,
# so the list must follow the column order of the loaded file.
# ('Volumn' is kept exactly as spelled because later cells refer to the column by this name.)
renamed_columns = ['Index', 'Chinese_Name', 'Date', 'Volumn']
df.columns = renamed_columns
df

Unnamed: 0,Index,Chinese_Name,Date,Volumn
0,1101,台泥,201301,171
1,1102,亞泥,201301,208
2,1103,嘉泥,201301,9
3,1104,環泥,201301,17
4,1108,幸福,201301,11
...,...,...,...,...
118886,9945,潤泰新,202406,282
118887,9946,三發地產,202406,71
118888,9955,佳龍,202406,23
118889,9958,世紀鋼,202406,193


### Step3. 資料前處理(處理空格、日期格式轉換、轉換成數值)

In [4]:
# TEJ exports carry stray whitespace around values, so cast every column to text
# and trim both ends before any further parsing.
for col_name in df.columns.tolist():
    as_text = df[col_name].astype(str)
    df[col_name] = as_text.str.strip()

In [5]:
# Date conversion for monthly data: parse the YYYYMM text, then move each timestamp
# to the last calendar day of its month.
parsed_month = pd.to_datetime(df['Date'], format='%Y%m')
df['Date'] = parsed_month + pd.offsets.MonthEnd(0)

In [6]:
# Convert the value columns to numbers; entries that cannot be parsed become NaN
# because of errors='coerce'.
numeric_columns = ['Volumn']
df = df.assign(
    **{name: pd.to_numeric(df[name], errors='coerce') for name in numeric_columns}
)

In [7]:
# Inspect the frame after cleaning: Date now holds month-end timestamps and Volumn is numeric.
df

Unnamed: 0,Index,Chinese_Name,Date,Volumn
0,1101,台泥,2013-01-31,171
1,1102,亞泥,2013-01-31,208
2,1103,嘉泥,2013-01-31,9
3,1104,環泥,2013-01-31,17
4,1108,幸福,2013-01-31,11
...,...,...,...,...
118886,9945,潤泰新,2024-06-30,282
118887,9946,三發地產,2024-06-30,71
118888,9955,佳龍,2024-06-30,23
118889,9958,世紀鋼,2024-06-30,193


### Step4. 將長表轉成慣用的寬表形式(以日期為索引、證券代碼為欄位)

In [8]:
# Reshape long -> wide: one row per Date, one column per Index (security code),
# with Volumn as the cell values.
# The result gets its own variable name so the long-format `df` is not overwritten by mistake.
df2 = df.pivot(index='Date', columns='Index', values='Volumn')
df2

Index,1101,1102,1103,1104,1108,1109,1110,1201,1203,1210,...,9939,9940,9941,9942,9943,9944,9945,9946,9955,9958
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-01-31,171.0,208.0,9.0,17.0,11.0,2.0,4.0,102.0,3.0,29.0,...,66.0,8.0,33.0,2.0,4.0,1.0,167.0,18.0,2.0,29.0
2013-02-28,141.0,68.0,4.0,11.0,9.0,2.0,2.0,47.0,2.0,13.0,...,28.0,5.0,26.0,1.0,3.0,1.0,81.0,10.0,1.0,13.0
2013-03-31,146.0,77.0,9.0,13.0,26.0,2.0,4.0,46.0,2.0,21.0,...,24.0,6.0,39.0,2.0,5.0,1.0,137.0,37.0,2.0,8.0
2013-04-30,112.0,71.0,5.0,8.0,7.0,1.0,26.0,48.0,2.0,28.0,...,41.0,10.0,18.0,2.0,5.0,3.0,128.0,61.0,2.0,12.0
2013-05-31,122.0,65.0,17.0,30.0,11.0,3.0,20.0,49.0,6.0,31.0,...,32.0,13.0,33.0,8.0,3.0,1.0,139.0,42.0,1.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-29,184.0,38.0,5.0,9.0,7.0,3.0,4.0,4.0,0.0,22.0,...,15.0,3.0,17.0,3.0,2.0,1.0,51.0,9.0,3.0,75.0
2024-03-31,348.0,95.0,7.0,36.0,28.0,14.0,2.0,10.0,1.0,58.0,...,18.0,7.0,45.0,6.0,8.0,2.0,150.0,20.0,19.0,152.0
2024-04-30,411.0,108.0,14.0,54.0,46.0,14.0,6.0,8.0,0.0,48.0,...,24.0,15.0,19.0,4.0,7.0,9.0,200.0,44.0,132.0,132.0
2024-05-31,486.0,204.0,13.0,33.0,27.0,14.0,9.0,18.0,0.0,34.0,...,24.0,14.0,35.0,3.0,5.0,21.0,784.0,59.0,53.0,262.0


### Step5. 存成pkl檔案(如果存成csv、excel檔案也可以，只是檔案比較大)

In [9]:
# Write the wide Date x Index volume table to disk in pickle format.
pickle_path = 'Volumn.pkl'
df2.to_pickle(pickle_path)