## 数据预处理

In [1]:
import pandas as pd

# Input export and the CSV file it gets converted to.
txt_file = 'HSI_日线数据.txt'
csv_file = 'HSI_日线数据.csv'

# The export is tab-separated and GBK-encoded; skiprows=1 skips the first
# line of the file so that the following line is used as the header row.
raw_export = pd.read_csv(txt_file, sep='\t', encoding='gbk', skiprows=1)

# Drop the final row of the export.
# NOTE(review): presumably a non-data footer line — confirm against the raw file.
df = raw_export.iloc[:-1]

# Quick sanity check that the file was parsed as expected.
print(df.head())

# Persist the frame as CSV; the default integer index is not written.
df.to_csv(csv_file, index=False)

           日期       开盘       最高       最低       收盘      成交量      成交额
0  1994-07-12  8591.45  8608.12  8398.43  8591.45      0.0      0.0
1  1994-07-13  8828.91  8828.91  8606.51  8828.91      0.0      0.0
2  1994-07-14  8808.28  8944.31  8790.79  8808.28      0.0      0.0
3  1994-07-15  9117.02  9140.84  8863.83  9117.02      0.0      0.0
4  1994-07-18  9193.83  9228.50  9131.54  9193.83      0.0      0.0


In [2]:
# Reload the intermediate CSV produced by the conversion step.
df = pd.read_csv('HSI_日线数据.csv', encoding='utf-8')

# Header names may carry leading/trailing whitespace; strip it and show the result.
df.columns = df.columns.str.strip()
print("修正后的列名：", df.columns)

# Parse the date column into datetime values.
df['日期'] = pd.to_datetime(df['日期'])

# Force every price/volume column to a numeric dtype; values that cannot be
# parsed become NaN (errors='coerce') instead of raising.
num_cols = ['开盘', '最高', '最低', '收盘', '成交量', '成交额']
for col in num_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Report the number of missing values per column.
print("缺失值情况：")
print(df.isna().sum())

# Missing volume/turnover values are replaced with 0; price columns are left untouched.
volume_cols = ['成交量', '成交额']
df[volume_cols] = df[volume_cols].fillna(0)


修正后的列名： Index(['日期', '开盘', '最高', '最低', '收盘', '成交量', '成交额'], dtype='object')
缺失值情况：
日期     0
开盘     0
最高     0
最低     0
收盘     0
成交量    0
成交额    0
dtype: int64


In [4]:
# Rename the date column to 'Date', order rows chronologically (oldest first),
# and use 'Date' as the index.
df = (
    df.rename(columns={'日期': 'Date'})
      .sort_values('Date', ascending=True)
      .set_index('Date')
)
df.head()

Unnamed: 0_level_0,开盘,最高,最低,收盘,成交量,成交额
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1994-07-12,8591.45,8608.12,8398.43,8591.45,0.0,0.0
1994-07-13,8828.91,8828.91,8606.51,8828.91,0.0,0.0
1994-07-14,8808.28,8944.31,8790.79,8808.28,0.0,0.0
1994-07-15,9117.02,9140.84,8863.83,9117.02,0.0,0.0
1994-07-18,9193.83,9228.5,9131.54,9193.83,0.0,0.0


In [5]:
# Translate the remaining column names to English OHLCV-style names.
df = df.rename(columns={
    '开盘': 'Open',
    '最高': 'High',
    '最低': 'Low',
    '收盘': 'Close',
    '成交量': 'Volume',
    '成交额': 'Turnover',
})

# 'Date' is the index at this point (set via set_index earlier), so the index
# must be written out; index=False would silently drop the dates from the file.
df.to_csv('../data/HSI_日线数据_处理后.csv', index=True, encoding='utf-8')
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Turnover
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1994-07-12,8591.45,8608.12,8398.43,8591.45,0.0,0.0
1994-07-13,8828.91,8828.91,8606.51,8828.91,0.0,0.0
1994-07-14,8808.28,8944.31,8790.79,8808.28,0.0,0.0
1994-07-15,9117.02,9140.84,8863.83,9117.02,0.0,0.0
1994-07-18,9193.83,9228.5,9131.54,9193.83,0.0,0.0


In [6]:
# Save the processed data again, this time into the working directory.
# 'Date' lives in the index (set via set_index earlier), so the index has to
# be written; index=False would produce a file without any date column.
df.to_csv('HSI_日线数据_处理后.csv', index=True, encoding='utf-8')

## 环境搭建