In [1]:
import pandas as pd
import polars as pl
import utils
from datetime import datetime


In [4]:
def data_info(df):
    """Print a quick summary of a panel DataFrame.

    Writes to stdout: the frame's shape, the smallest and largest ``DATE``
    value (as stored, and parsed as ``%Y%m%d`` dates) and the number of
    distinct ``permno`` values.

    Args:
        df: Polars DataFrame with a ``DATE`` column whose values render as
            ``YYYYMMDD`` through ``str()`` and a ``permno`` column.

    Returns:
        None.
    """
    print(df.shape)

    # DATE range in its stored (integer) representation.
    min_date = df["DATE"].min()
    max_date = df["DATE"].max()
    print(f"\n过滤后的数据 DATE 范围（整数格式）：最小值 = {min_date}, 最大值 = {max_date}")

    def int_to_date(date_int):
        # Parse a YYYYMMDD value into a datetime.
        return datetime.strptime(str(date_int), "%Y%m%d")

    # min()/max() are None when DATE has no non-null values; parsing the
    # string "None" would raise ValueError, so report that case instead.
    if min_date is None or max_date is None:
        print("转换后：DATE 列没有可用的日期值")
    else:
        print(f"转换后：最小日期 = {int_to_date(min_date)}, 最大日期 = {int_to_date(max_date)}")

    # Number of distinct permno identifiers.
    permno_count = df["permno"].n_unique()
    print(f"\n唯一 permno 数量：{permno_count}")

In [123]:
# Load data/sub6_data.csv through utils.load_data and print its summary
# (shape, DATE range, number of distinct permno values).
sub6_data = utils.load_data('data/sub6_data.csv')
data_info(sub6_data)

(762136, 97)

过滤后的数据 DATE 范围（整数格式）：最小值 = 20110131, 最大值 = 20211231
转换后：最小日期 = 2011-01-31 00:00:00, 最大日期 = 2021-12-31 00:00:00

唯一 permno 数量：10734


In [68]:
# Load data/saved_data.csv through utils.load_data and print its summary
# (shape, DATE range, number of distinct permno values).
vgnn_data = utils.load_data('data/saved_data.csv')
data_info(vgnn_data)

(109800, 98)

过滤后的数据 DATE 范围（整数格式）：最小值 = 20180131, 最大值 = 20201231
转换后：最小日期 = 2018-01-31 00:00:00, 最大日期 = 2020-12-31 00:00:00

唯一 permno 数量：3050


In [69]:
# Assumes vgnn_data and sub6_data are both Polars DataFrames.
# Unique permno values present in the small table (vgnn_data).
permno_list = vgnn_data['permno'].unique()

# Keep only the rows of the large table whose permno occurs in the small table.
filtered_large_df = sub6_data.filter(pl.col('permno').is_in(permno_list))

# Number of distinct permno values left in the filtered large table.
filtered_permno_count = filtered_large_df['permno'].n_unique()
print(f"筛选后大表中 permno 的数量: {filtered_permno_count}")


筛选后大表中 permno 的数量: 3050


In [70]:
# Compare the column sets of the small table (vgnn_data) and the filtered large table.
small_columns = set(vgnn_data.columns)
large_columns = set(filtered_large_df.columns)

# Columns present only in the small table (i.e. missing from the large one).
columns_only_in_small = small_columns.difference(large_columns)
# Columns present only in the large table (i.e. missing from the small one).
columns_only_in_big = large_columns.difference(small_columns)

print("小表中有而大表中没有的列：", columns_only_in_small)
print("大表中有而小表中没有的列：", columns_only_in_big)

小表中有而大表中没有的列： {'RET_target'}
大表中有而小表中没有的列： set()


In [10]:
# Inspect the rows of one permno in the filtered large table.
specific_permno = 10026

filtered_df = filtered_large_df.filter(pl.col("permno") == specific_permno)

print(f"\npermno {specific_permno} 的数据:")
print(filtered_df)

# Number of rows for this permno.
print(f"\npermno {specific_permno} 的数据行数:", filtered_df.shape[0])

# To look at specific columns only, select them explicitly,
# e.g. DATE plus any other columns of interest.
selected_columns = ["permno", "DATE", "mvel1"]  # add further columns of interest here
filtered_df_selected = filtered_df.select(selected_columns)
print(f"\npermno {specific_permno} 的选定列数据:")
print(filtered_df_selected)


permno 10026 的数据:
shape: (132, 97)
┌────────┬──────────┬──────────────┬──────────┬───┬────────────┬──────────┬───────────┬──────┐
│ permno ┆ DATE     ┆ mvel1        ┆ beta     ┆ … ┆ std_dolvol ┆ std_turn ┆ zerotrade ┆ sic2 │
│ ---    ┆ ---      ┆ ---          ┆ ---      ┆   ┆ ---        ┆ ---      ┆ ---       ┆ ---  │
│ i64    ┆ i64      ┆ f64          ┆ f64      ┆   ┆ f64        ┆ f64      ┆ f64       ┆ f64  │
╞════════╪══════════╪══════════════╪══════════╪═══╪════════════╪══════════╪═══════════╪══════╡
│ 10026  ┆ 20110131 ┆ 894466.11112 ┆ 0.563942 ┆ … ┆ 0.651141   ┆ 1.825713 ┆ 3.2199e-8 ┆ 20.0 │
│ 10026  ┆ 20110228 ┆ 788710.39267 ┆ 0.564159 ┆ … ┆ 0.368276   ┆ 1.370279 ┆ 3.1404e-8 ┆ 20.0 │
│ 10026  ┆ 20110331 ┆ 816566.89267 ┆ 0.581094 ┆ … ┆ 0.576531   ┆ 2.265086 ┆ 3.1326e-8 ┆ 20.0 │
│ 10026  ┆ 20110429 ┆ 874513.52433 ┆ 0.581966 ┆ … ┆ 0.509962   ┆ 2.176758 ┆ 2.4832e-8 ┆ 20.0 │
│ 10026  ┆ 20110531 ┆ 944438.87433 ┆ 0.57539  ┆ … ┆ 0.325939   ┆ 0.763993 ┆ 4.6153e-8 ┆ 20.0 │
│ …      ┆ …  

In [72]:
# Compare rows from 2018-01-31 onwards with the same permno in vgnn_data.
# NOTE(review): result_df is only assigned further down in this notebook
# (result_df = merge_with_lag(sub6_data, ret)), so this cell fails on a fresh
# top-to-bottom run. The saved output below shows a 4-column frame with an
# mvel1_return column, which no visible cell produces, so it presumably came
# from an earlier, since-removed definition of result_df. TODO confirm and
# move or delete this cell.
date_filtered = result_df.filter(pl.col("DATE") >= 20180131)
print(date_filtered.head(10))
filtered_vgnn_data_df = vgnn_data.filter(pl.col("permno") == specific_permno)
print(filtered_vgnn_data_df[['permno', 'DATE', 'mvel1', 'RET_target']].head(10))

shape: (10, 4)
┌────────┬──────────┬────────────┬──────────────┐
│ permno ┆ DATE     ┆ mvel1      ┆ mvel1_return │
│ ---    ┆ ---      ┆ ---        ┆ ---          │
│ i64    ┆ i64      ┆ f64        ┆ f64          │
╞════════╪══════════╪════════════╪══════════════╡
│ 10026  ┆ 20180131 ┆ 2.8344e6   ┆ -0.00498     │
│ 10026  ┆ 20180228 ┆ 2.5858e6   ┆ 0.087702     │
│ 10026  ┆ 20180329 ┆ 2.5090e6   ┆ 0.029688     │
│ 10026  ┆ 20180430 ┆ 2.5533e6   ┆ -0.017635    │
│ 10026  ┆ 20180531 ┆ 2.5698e6   ┆ -0.006494    │
│ 10026  ┆ 20180629 ┆ 2.6486e6   ┆ -0.030638    │
│ 10026  ┆ 20180731 ┆ 2.8507e6   ┆ -0.076326    │
│ 10026  ┆ 20180831 ┆ 2.7131e6   ┆ 0.048289     │
│ 10026  ┆ 20180928 ┆ 2.723178e6 ┆ -0.003725    │
│ 10026  ┆ 20181031 ┆ 2.8298e6   ┆ -0.03915     │
└────────┴──────────┴────────────┴──────────────┘
shape: (10, 4)
┌────────┬──────────┬──────────┬────────────┐
│ permno ┆ DATE     ┆ mvel1    ┆ RET_target │
│ ---    ┆ ---      ┆ ---      ┆ ---        │
│ i64    ┆ i64      ┆ f64      ┆

In [56]:
# Load the raw return file with pandas.
ret_data = pd.read_csv("data/ret_data.csv")
# Rows whose PERMNO equals 15113.
# NOTE(review): this rebinds filtered_df, which an earlier cell bound to a
# Polars frame, to a pandas frame; later cells that read filtered_df see
# whichever assignment ran last.
filtered_df = ret_data[ret_data['PERMNO'] == 15113]

# Show the selection and the first rows of the full file.
print(filtered_df)
print(ret_data.head(10))

        PERMNO      date TICKER     CUSIP        RET
159890   15113  20141128    NaN  76131D10        NaN
159891   15113  20141231    QSR  76131D10          C
159892   15113  20150130    QSR  76131D10  -0.009221
159893   15113  20150227    QSR  76131D10   0.145812
159894   15113  20150331    QSR  76131D10  -0.131585
...        ...       ...    ...       ...        ...
159972   15113  20210930    QSR  76131D10  -0.038779
159973   15113  20211029    QSR  76131D10  -0.074359
159974   15113  20211130    QSR  76131D10  -0.010770
159975   15113  20211231    QSR  76131D10   0.092450
159976   15113  20220131    QSR  76131D10  -0.077620

[87 rows x 5 columns]
   PERMNO      date TICKER     CUSIP        RET
0   10001  20110131   EGAS  36720410   0.028992
1   10001  20110228   EGAS  36720410   0.022727
2   10001  20110331   EGAS  36720410   0.072404
3   10001  20110429   EGAS  36720410  -0.038789
4   10001  20110531   EGAS  36720410   0.028050
5   10001  20110630   EGAS  36720410   0.008261
6   1

In [51]:
def convert_file_for_polars(input_path, output_path, text_columns=('TICKER',)):
    """Rewrite a CSV with numeric non-text columns and renamed key columns.

    Reads ``input_path`` with pandas, converts every non-numeric column that
    is not listed in ``text_columns`` to numbers (values that cannot be
    parsed, e.g. letter codes such as ``'B'`` or ``'C'``, become NaN), renames
    ``PERMNO`` -> ``permno``, ``date`` -> ``DATE`` and ``RET`` ->
    ``RET_target``, and writes the result to ``output_path`` without the row
    index, with missing values written as empty fields.

    Args:
        input_path: Path of the CSV file to read.
        output_path: Path of the CSV file to write.
        text_columns: Collection of column names (as spelled in the input
            file) that are left untouched. Defaults to ``('TICKER',)``.

    Returns:
        None.

    NOTE(review): any other alphanumeric column (e.g. a CUSIP such as
    ``76131D10``) is coerced as well, so non-numeric identifiers become NaN
    and leading zeros are lost. Pass such columns in ``text_columns`` if they
    are needed downstream.
    """
    df = pd.read_csv(input_path)

    for column in df.columns:
        if column in text_columns:
            continue
        # Covers both object and dedicated string dtypes; columns that are
        # already numeric are left alone.
        if not pd.api.types.is_numeric_dtype(df[column]):
            # errors='coerce' already maps 'B' and every other unparseable
            # value to NaN. The previous ``.replace('B', None)`` was redundant
            # and, on pandas < 1.4, forward-filled the preceding value instead
            # of inserting a missing value.
            df[column] = pd.to_numeric(df[column], errors='coerce')

    # Rename the key columns to the names used by the rest of the notebook.
    column_mapping = {
        'PERMNO': 'permno',
        'date': 'DATE',
        'RET': 'RET_target'
    }
    df = df.rename(columns=column_mapping)

    # na_rep='' writes missing values as empty fields.
    df.to_csv(output_path, index=False, na_rep='')

# 示例调用
# convert_file_for_polars("input.csv", "output.csv")


In [52]:
# Convert the raw return file, then load the converted CSV through utils.load_data.
convert_file_for_polars('data/ret_data.csv', 'data/raw_data/return_processed.csv')
ret = utils.load_data('data/raw_data/return_processed.csv')

In [53]:
# Reload the converted return file (same path as the load in the previous cell).
ret = utils.load_data('data/raw_data/return_processed.csv')

# Print its summary and the first ten rows.
data_info(ret)
print(ret.head(10))

(793967, 5)

过滤后的数据 DATE 范围（整数格式）：最小值 = 20110131, 最大值 = 20220131
转换后：最小日期 = 2011-01-31 00:00:00, 最大日期 = 2022-01-31 00:00:00

唯一 permno 数量：10734
shape: (10, 5)
┌────────┬──────────┬────────┬────────────┬────────────┐
│ permno ┆ DATE     ┆ TICKER ┆ CUSIP      ┆ RET_target │
│ ---    ┆ ---      ┆ ---    ┆ ---        ┆ ---        │
│ i64    ┆ i64      ┆ str    ┆ f64        ┆ f64        │
╞════════╪══════════╪════════╪════════════╪════════════╡
│ 10001  ┆ 20110131 ┆ EGAS   ┆ 3.672041e7 ┆ 0.028992   │
│ 10001  ┆ 20110228 ┆ EGAS   ┆ 3.672041e7 ┆ 0.022727   │
│ 10001  ┆ 20110331 ┆ EGAS   ┆ 3.672041e7 ┆ 0.072404   │
│ 10001  ┆ 20110429 ┆ EGAS   ┆ 3.672041e7 ┆ -0.038789  │
│ 10001  ┆ 20110531 ┆ EGAS   ┆ 3.672041e7 ┆ 0.02805    │
│ 10001  ┆ 20110630 ┆ EGAS   ┆ 3.672041e7 ┆ 0.008261   │
│ 10001  ┆ 20110729 ┆ EGAS   ┆ 3.672041e7 ┆ -0.028139  │
│ 10001  ┆ 20110831 ┆ EGAS   ┆ 3.672041e7 ┆ -0.013864  │
│ 10001  ┆ 20110930 ┆ EGAS   ┆ 3.672041e7 ┆ 0.005009   │
│ 10001  ┆ 20111031 ┆ EGAS   ┆ 3.672041e7 ┆

In [57]:
# Inspect the return rows of one permno.
specific_permno = 10026

ret_f = ret.filter(pl.col("permno") == specific_permno)

print(f"\npermno {specific_permno} 的数据:")
print(ret_f)

# Number of rows for this permno.
print(f"\npermno {specific_permno} 的数据行数:", ret_f.shape[0])


permno 10026 的数据:
shape: (133, 5)
┌────────┬──────────┬────────┬────────────┬────────────┐
│ permno ┆ DATE     ┆ TICKER ┆ CUSIP      ┆ RET_target │
│ ---    ┆ ---      ┆ ---    ┆ ---        ┆ ---        │
│ i64    ┆ i64      ┆ str    ┆ f64        ┆ f64        │
╞════════╪══════════╪════════╪════════════╪════════════╡
│ 10026  ┆ 20110131 ┆ JJSF   ┆ 4.660321e7 ┆ -0.11961   │
│ 10026  ┆ 20110228 ┆ JJSF   ┆ 4.660321e7 ┆ 0.035319   │
│ 10026  ┆ 20110331 ┆ JJSF   ┆ 4.660321e7 ┆ 0.073175   │
│ 10026  ┆ 20110429 ┆ JJSF   ┆ 4.660321e7 ┆ 0.079669   │
│ 10026  ┆ 20110531 ┆ JJSF   ┆ 4.660321e7 ┆ 0.013774   │
│ …      ┆ …        ┆ …      ┆ …          ┆ …          │
│ 10026  ┆ 20210930 ┆ JJSF   ┆ 4.660321e7 ┆ -0.06294   │
│ 10026  ┆ 20211029 ┆ JJSF   ┆ 4.660321e7 ┆ -0.034485  │
│ 10026  ┆ 20211130 ┆ JJSF   ┆ 4.660321e7 ┆ -0.074348  │
│ 10026  ┆ 20211231 ┆ JJSF   ┆ 4.660321e7 ┆ 0.161173   │
│ 10026  ┆ 20220131 ┆ JJSF   ┆ 4.660321e7 ┆ -0.039694  │
└────────┴──────────┴────────┴────────────┴──────────

In [116]:
# Keep rows dated 2018-01-31 or later and compare against the same permno in vgnn_data.
# NOTE(review): ret_f is rebound to its own filtered version, so the unfiltered
# frame from the previous cell is no longer available after this cell runs.
ret_f = ret_f.filter(pl.col("DATE") >= 20180131)
print(ret_f.head(10))
filtered_vgnn_data_df = vgnn_data.filter(pl.col("permno") == specific_permno)
print(filtered_vgnn_data_df[['permno', 'DATE', 'mvel1', 'RET_target']].head(10))

shape: (10, 5)
┌────────┬──────────┬────────┬────────────┬────────────┐
│ permno ┆ DATE     ┆ TICKER ┆ CUSIP      ┆ RET_target │
│ ---    ┆ ---      ┆ ---    ┆ ---        ┆ ---        │
│ i64    ┆ i64      ┆ str    ┆ f64        ┆ f64        │
╞════════╪══════════╪════════╪════════════╪════════════╡
│ 10026  ┆ 20180131 ┆ null   ┆ 4.660321e7 ┆ -0.088191  │
│ 10026  ┆ 20180228 ┆ null   ┆ 4.660321e7 ┆ -0.029688  │
│ 10026  ┆ 20180329 ┆ null   ┆ 4.660321e7 ┆ 0.019951   │
│ 10026  ┆ 20180430 ┆ null   ┆ 4.660321e7 ┆ 0.006224   │
│ 10026  ┆ 20180531 ┆ null   ┆ 4.660321e7 ┆ 0.030638   │
│ 10026  ┆ 20180629 ┆ null   ┆ 4.660321e7 ┆ 0.079791   │
│ 10026  ┆ 20180731 ┆ null   ┆ 4.660321e7 ┆ -0.049256  │
│ 10026  ┆ 20180831 ┆ null   ┆ 4.660321e7 ┆ 0.003725   │
│ 10026  ┆ 20180928 ┆ null   ┆ 4.660321e7 ┆ 0.040137   │
│ 10026  ┆ 20181031 ┆ null   ┆ 4.660321e7 ┆ 0.034926   │
└────────┴──────────┴────────┴────────────┴────────────┘
shape: (10, 4)
┌────────┬──────────┬──────────┬────────────┐
│ permno ┆ D

In [149]:
def merge_with_lag(sub6_data, ret_data):
    """Left-join ``RET_target`` from ``ret_data`` onto ``sub6_data``.

    Rows are matched on ``(permno, DATE)``. Despite the name, this version
    applies no lag or shift: each row receives the ``RET_target`` recorded for
    its own ``DATE``. A later cell of this notebook defines ``merge_with_lag``
    again; whichever definition ran last is the one in effect.

    Args:
        sub6_data: Polars DataFrame with ``permno`` and ``DATE`` columns.
        ret_data: Polars DataFrame with ``permno``, ``DATE`` and
            ``RET_target`` columns.

    Returns:
        ``sub6_data`` with an added ``RET_target`` column (null where
        ``ret_data`` has no matching row). Summary counts are printed.
    """
    key_cols = ['permno', 'DATE']

    # One row per (permno, DATE) so the left join cannot multiply rows.
    deduped_ret = ret_data.unique(subset=key_cols, keep='first')
    ret_slice = deduped_ret.select(key_cols + ['RET_target'])

    merged_df = sub6_data.join(ret_slice, on=key_cols, how='left')

    # Summary statistics for a quick sanity check.
    print(f"\n合并后的结果：")
    print(f"sub6_data行数: {sub6_data.shape[0]}")
    print(f"合并后行数: {merged_df.shape[0]}")
    non_null_rows = merged_df.filter(pl.col('RET_target').is_not_null())
    print(f"RET_target非空值数量: {non_null_rows.shape[0]}")

    return merged_df

In [160]:
def merge_with_lag(sub6_data, ret_data):
    """Attach each permno's following-row ``RET_target`` to ``sub6_data``.

    ``ret_data`` is de-duplicated on ``(permno, DATE)`` and sorted by those
    keys. Within each ``permno``, ``RET_target`` is then replaced by the value
    of the next row, and the result is left-joined onto ``sub6_data`` on
    ``(permno, DATE)``.

    Args:
        sub6_data: Polars DataFrame with ``permno`` and ``DATE`` columns.
        ret_data: Polars DataFrame with ``permno``, ``DATE`` and
            ``RET_target`` columns.

    Returns:
        ``sub6_data`` with an added ``RET_target`` column. It is null where
        ``ret_data`` has no matching row and for each permno's last row.
        Summary counts are printed.

    NOTE(review): "next row" is the next available DATE for that permno, not
    necessarily the next calendar month; verify there are no gaps if that
    matters.
    """
    keys = ['permno', 'DATE']

    # One row per (permno, DATE).
    unique_ret = ret_data.unique(subset=keys, keep='first')

    # shift(-1) within each permno pulls the following row's value up by one.
    next_ret = unique_ret.sort(keys).with_columns([
        pl.col('RET_target').shift(-1).over('permno')
    ])

    merged_df = sub6_data.join(
        next_ret.select(keys + ['RET_target']),
        on=keys,
        how='left',
    )

    # Summary statistics for a quick sanity check.
    print(f"\n合并后的结果：")
    print(f"sub6_data行数: {sub6_data.shape[0]}")
    print(f"合并后行数: {merged_df.shape[0]}")
    non_null_rows = merged_df.filter(pl.col('RET_target').is_not_null())
    print(f"RET_target非空值数量: {non_null_rows.shape[0]}")

    return merged_df

In [171]:
# Merge the returns onto the characteristics panel.
# NOTE(review): merge_with_lag is defined twice in this notebook; this call
# uses whichever definition was executed last.
result_df = merge_with_lag(sub6_data, ret)


合并后的结果：
sub6_data行数: 762136
合并后行数: 762136
RET_target非空值数量: 754961


In [172]:
# Summary of the merged panel (shape, DATE range, number of distinct permno values).
data_info(result_df)

(762136, 98)

过滤后的数据 DATE 范围（整数格式）：最小值 = 20110131, 最大值 = 20211231
转换后：最小日期 = 2011-01-31 00:00:00, 最大日期 = 2021-12-31 00:00:00

唯一 permno 数量：10734


In [163]:
# Inspect the merged rows of one permno.
specific_permno = 10026

result_f = result_df.filter(pl.col("permno") == specific_permno)

print(f"\npermno {specific_permno} 的数据:")
print(result_f)

# Number of rows for this permno. Report result_f (the frame printed above);
# the previous code printed filtered_df.shape[0], a leftover name bound by an
# unrelated earlier cell.
print(f"\npermno {specific_permno} 的数据行数:", result_f.shape[0])


permno 10026 的数据:
shape: (132, 98)
┌────────┬──────────┬──────────────┬──────────┬───┬──────────┬───────────┬──────┬────────────┐
│ permno ┆ DATE     ┆ mvel1        ┆ beta     ┆ … ┆ std_turn ┆ zerotrade ┆ sic2 ┆ RET_target │
│ ---    ┆ ---      ┆ ---          ┆ ---      ┆   ┆ ---      ┆ ---       ┆ ---  ┆ ---        │
│ i64    ┆ i64      ┆ f64          ┆ f64      ┆   ┆ f64      ┆ f64       ┆ f64  ┆ f64        │
╞════════╪══════════╪══════════════╪══════════╪═══╪══════════╪═══════════╪══════╪════════════╡
│ 10026  ┆ 20110131 ┆ 894466.11112 ┆ 0.563942 ┆ … ┆ 1.825713 ┆ 3.2199e-8 ┆ 20.0 ┆ 0.035319   │
│ 10026  ┆ 20110228 ┆ 788710.39267 ┆ 0.564159 ┆ … ┆ 1.370279 ┆ 3.1404e-8 ┆ 20.0 ┆ 0.073175   │
│ 10026  ┆ 20110331 ┆ 816566.89267 ┆ 0.581094 ┆ … ┆ 2.265086 ┆ 3.1326e-8 ┆ 20.0 ┆ 0.079669   │
│ 10026  ┆ 20110429 ┆ 874513.52433 ┆ 0.581966 ┆ … ┆ 2.176758 ┆ 2.4832e-8 ┆ 20.0 ┆ 0.013774   │
│ 10026  ┆ 20110531 ┆ 944438.87433 ┆ 0.57539  ┆ … ┆ 0.763993 ┆ 4.6153e-8 ┆ 20.0 ┆ -0.030134  │
│ …      ┆ …  

In [164]:
# Keep rows dated 2018-01-31 or later and compare against the same permno in vgnn_data.
# NOTE(review): result_f is rebound to its own filtered version, so the
# unfiltered frame from the previous cell is no longer available afterwards.
result_f = result_f.filter(pl.col("DATE") >= 20180131)
print(result_f.head(10))
filtered_vgnn_data_df = vgnn_data.filter(pl.col("permno") == specific_permno)
print(filtered_vgnn_data_df[['permno', 'DATE', 'mvel1', 'RET_target']].head(10))

shape: (10, 98)
┌────────┬──────────┬────────────┬──────────┬───┬──────────┬───────────┬──────┬────────────┐
│ permno ┆ DATE     ┆ mvel1      ┆ beta     ┆ … ┆ std_turn ┆ zerotrade ┆ sic2 ┆ RET_target │
│ ---    ┆ ---      ┆ ---        ┆ ---      ┆   ┆ ---      ┆ ---       ┆ ---  ┆ ---        │
│ i64    ┆ i64      ┆ f64        ┆ f64      ┆   ┆ f64      ┆ f64       ┆ f64  ┆ f64        │
╞════════╪══════════╪════════════╪══════════╪═══╪══════════╪═══════════╪══════╪════════════╡
│ 10026  ┆ 20180131 ┆ 2.8344e6   ┆ 0.643161 ┆ … ┆ 1.670387 ┆ 3.4546e-8 ┆ 20.0 ┆ -0.029688  │
│ 10026  ┆ 20180228 ┆ 2.5858e6   ┆ 0.617265 ┆ … ┆ 2.317696 ┆ 2.2002e-8 ┆ 20.0 ┆ 0.019951   │
│ 10026  ┆ 20180329 ┆ 2.5090e6   ┆ 0.585592 ┆ … ┆ 1.602548 ┆ 2.5811e-8 ┆ 20.0 ┆ 0.006224   │
│ 10026  ┆ 20180430 ┆ 2.5533e6   ┆ 0.649596 ┆ … ┆ 2.013959 ┆ 2.7490e-8 ┆ 20.0 ┆ 0.030638   │
│ 10026  ┆ 20180531 ┆ 2.5698e6   ┆ 0.673609 ┆ … ┆ 1.433237 ┆ 3.1137e-8 ┆ 20.0 ┆ 0.079791   │
│ 10026  ┆ 20180629 ┆ 2.6486e6   ┆ 0.671337 ┆ … ┆ 1.12

In [173]:
# Assumes vgnn_data and result_df are both Polars DataFrames.
# Unique permno values present in the small table (vgnn_data).
permno_list = vgnn_data['permno'].unique()

# Keep only the rows of the merged table whose permno occurs in the small table.
# NOTE(review): this rebinds filtered_large_df, which an earlier cell derived
# from sub6_data (without RET_target).
filtered_large_df = result_df.filter(pl.col('permno').is_in(permno_list))

# Number of distinct permno values left in the filtered table.
filtered_permno_count = filtered_large_df['permno'].n_unique()
print(f"筛选后大表中 permno 的数量: {filtered_permno_count}")

筛选后大表中 permno 的数量: 3050


In [179]:
# Summary of the filtered, merged panel (shape, DATE range, number of distinct permno values).
data_info(filtered_large_df)

(401830, 98)

过滤后的数据 DATE 范围（整数格式）：最小值 = 20110131, 最大值 = 20211231
转换后：最小日期 = 2011-01-31 00:00:00, 最大日期 = 2021-12-31 00:00:00

唯一 permno 数量：3050


In [None]:
# Persist the filtered, merged panel as CSV.
filtered_large_df.write_csv("data/preprocess_dataset.csv")

In [3]:
# NOTE(review): pandas is already imported in the first cell; this repeated
# import is redundant when the notebook is run top to bottom.
import pandas as pd

# Read the Excel file; by default the first worksheet is read.
df = pd.read_excel("data/company_relationships2.xls")

# Write the data to CSV without the row index (set index=True to keep it).
df.to_csv("data/preprocess_data/firms_relation.csv", index=False)


In [26]:
# Load the converted firm-relationship CSV through utils.load_data.
relation_data = utils.load_data('data/preprocess_data/firms_relation.csv')

In [28]:
# Overview of the relationship table: shape, column names and the raw "Exchange:Ticker" column.
print(relation_data.shape)
print(relation_data.columns)
print(relation_data[['Exchange:Ticker']])

(5177, 12)
['Company Name', 'Exchange:Ticker', 'Company Type', 'Exchanges [Primary Listing]', 'Business Relationships (All History)', 'Strategic Alliances', 'Suppliers', 'Competitors', 'Industry Classifications', 'Headquarters - Country/Region', 'Security Tickers', 'Excel Company ID']
shape: (5_177, 1)
┌─────────────────┐
│ Exchange:Ticker │
│ ---             │
│ str             │
╞═════════════════╡
│ NasdaqGS:FLWS   │
│ NasdaqGS:TXG    │
│ NasdaqGM:YI     │
│ NasdaqGS:YQ     │
│ NasdaqCM:ATNF   │
│ …               │
│ NYSE:ZWS        │
│ NasdaqCM:CNET   │
│ NasdaqGS:ZYME   │
│ NasdaqGS:ZYXI   │
│ NasdaqCM:ZVSA   │
└─────────────────┘


In [29]:

# Assumes the column is named "Exchange:Ticker".
# Earlier variant, kept for reference:
# relation_data = relation_data.with_columns(
#     pl.col("Exchange:Ticker")
#     .str.extract(r":([\w]+)")   # capture the letters/digits after the colon
#     .alias("Exchange:Ticker")
# )

# Keep only the part after the colon, i.e. the ticker symbol.
# NOTE(review): the column is overwritten in place, so the cell is not
# idempotent. Re-running it on already-stripped values finds no colon and,
# since str.extract yields null when the pattern does not match, would blank
# the column. Reload relation_data before re-running.
relation_data = relation_data.with_columns(
    pl.col("Exchange:Ticker")
    .str.extract(r":(.*)")   # regex capturing everything after the colon
    .alias("Exchange:Ticker")
)

print(relation_data[['Exchange:Ticker']])
# # Save the processed CSV file
# df.to_csv("processed_file.csv", index=False)


shape: (5_177, 1)
┌─────────────────┐
│ Exchange:Ticker │
│ ---             │
│ str             │
╞═════════════════╡
│ FLWS            │
│ TXG             │
│ YI              │
│ YQ              │
│ ATNF            │
│ …               │
│ ZWS             │
│ CNET            │
│ ZYME            │
│ ZYXI            │
│ ZVSA            │
└─────────────────┘


In [None]:
# First ten rows of the relationship table after the ticker extraction.
print(relation_data.head(10))
# Optional export, currently disabled:
# relation_data.write_csv('data/preprocess_data/firms_relation_ticker.csv')

shape: (10, 12)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Company   ┆ Exchange: ┆ Company   ┆ Exchanges ┆ … ┆ Industry  ┆ Headquart ┆ Security  ┆ Excel    │
│ Name      ┆ Ticker    ┆ Type      ┆ [Primary  ┆   ┆ Classific ┆ ers - Cou ┆ Tickers   ┆ Company  │
│ ---       ┆ ---       ┆ ---       ┆ Listing]  ┆   ┆ ations    ┆ ntry/Regi ┆ ---       ┆ ID       │
│ str       ┆ str       ┆ str       ┆ ---       ┆   ┆ ---       ┆ on        ┆ str       ┆ ---      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ ---       ┆           ┆ str      │
│           ┆           ┆           ┆           ┆   ┆           ┆ str       ┆           ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 1-800-FLO ┆ FLWS      ┆ Public    ┆ Nasdaq    ┆ … ┆ Catalog   ┆ United    ┆ NasdaqGS: ┆ IQ24085  │
│ WERS.COM, ┆           ┆ Company   ┆ Global    ┆   ┆ Flowers,  ┆ States   