# 資料前處理：
1. 將 CSR embedding 分組成 [company × year] 的序列資料。
2. 每家公司依年份排序，形成一條 CSR 時序（例如 CSR1, CSR2, ..., CSR6）；各公司的序列長度可能不同，後續以 padding 與 mask 對齊。

In [1]:
from pathlib import Path

import pandas as pd
import torch
from torch.nn.utils.rnn import pad_sequence

In [2]:
# Load the merged CSV of CSR embeddings and forward-citation counts
# into a DataFrame.
csr_embeddings_forward_count = pd.read_csv(
    "/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/csr_embeddings_forward_count.csv"
)

In [3]:
# Preview the first five rows to sanity-check the loaded columns.
csr_embeddings_forward_count.head(n=5)

Unnamed: 0,folder_name,ticker,year,total_5yr_forward_citations,within_5_years,patents_count,dim_0,dim_1,dim_2,dim_3,...,dim_1014,dim_1015,dim_1016,dim_1017,dim_1018,dim_1019,dim_1020,dim_1021,dim_1022,dim_1023
0,NASDAQ_AAPL_2014,AAPL,2014,910.0,121.0,122.0,-0.09183,0.29304,-0.179719,0.083607,...,-0.086017,-0.40382,0.282563,0.331965,0.290067,-0.223406,-0.325722,0.009091,-0.170071,0.163041
1,NASDAQ_AAPL_2015,AAPL,2015,892.0,120.0,123.0,-0.085034,0.276596,-0.139275,0.068203,...,-0.090395,-0.406836,0.278167,0.320248,0.292318,-0.212523,-0.331629,-0.030166,-0.150414,0.180043
2,NASDAQ_AAPL_2016,AAPL,2016,801.0,93.0,96.0,-0.067159,0.277524,-0.148442,0.092723,...,-0.109939,-0.41627,0.282748,0.334449,0.296767,-0.216122,-0.327691,-0.037364,-0.151454,0.178229
3,NASDAQ_AAPL_2017,AAPL,2017,381.0,86.0,88.0,-0.053141,0.259755,-0.160064,0.098964,...,-0.110439,-0.411443,0.307153,0.298774,0.284501,-0.213012,-0.317366,-0.034005,-0.147262,0.17549
4,NASDAQ_AAPL_2018,AAPL,2018,144.0,26.0,29.0,-0.070878,0.338528,-0.095608,0.093072,...,-0.099431,-0.410019,0.305583,0.368817,0.274357,-0.190008,-0.347579,-0.00454,-0.125971,0.124562


In [4]:
# Split the table by company: each group holds every row that shares a ticker.
grouped = csr_embeddings_forward_count.groupby(by='ticker')

# Per-company accumulators, filled by the loop in the following cell:
#   x_seq   -> one embedding tensor per company
#   y_seq   -> one target tensor per company
#   seq_len -> number of rows (time steps) per company
x_seq, y_seq, seq_len = [], [], []

In [5]:
# Re-create the accumulators in this cell so that running it more than once
# does not append every company a second time (the cell is idempotent).
x_seq = []
y_seq = []
seq_len = []

# The column selections are identical for every company, so build them once
# instead of on each loop iteration.
embedding_cols = [f'dim_{i}' for i in range(1024)]
target_cols = ['total_5yr_forward_citations', 'within_5_years', 'patents_count']

for ticker, group in grouped:
    # Order this company's rows chronologically before turning them into a sequence.
    group = group.sort_values('year')

    # embeddings: (rows for this company, 1024); targets: (rows for this company, 3)
    embeddings = torch.tensor(group[embedding_cols].values, dtype=torch.float32)
    targets = torch.tensor(group[target_cols].values, dtype=torch.float32)

    x_seq.append(embeddings)
    y_seq.append(targets)
    # Keep the true length so padded steps can be masked out later.
    seq_len.append(len(group))

In [6]:
# Pad every company's sequence to the longest one; batch is the first axis.
x_padded = pad_sequence(x_seq, batch_first=True)
y_padded = pad_sequence(y_seq, batch_first=True)

# Validity mask: 1.0 where a time step holds real data, 0.0 where it is padding.
# A step index is valid when it is smaller than that company's true length.
mask = (
    torch.arange(x_padded.size(1)).unsqueeze(0)
    < torch.tensor(seq_len).unsqueeze(1)
).to(torch.float32)

In [7]:
# Persist the padded tensors and the validity mask in a single file.
# torch.save does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails at this cell.
output_path = Path('data/processed_data.pt')
output_path.parent.mkdir(parents=True, exist_ok=True)

torch.save(
    {
        'x': x_padded,   # padded embedding sequences
        'y': y_padded,   # padded target sequences
        'mask': mask,    # 1.0 for real time steps, 0.0 for padding
    },
    output_path,
)

In [8]:
# Year coverage per company: first year, last year, and the inclusive span
# between them (max - min + 1).
# NOTE(review): the span counts calendar years, not rows, so missing years
# inside the range would not be visible here.
year_span = (
    csr_embeddings_forward_count
    .groupby('ticker')['year']
    .agg(['min', 'max'])
    .assign(span=lambda bounds: bounds['max'] - bounds['min'] + 1)
)

# Rank companies from the longest year span to the shortest.
longest_span = year_span.sort_values('span', ascending=False)

# Show the top of the ranking.
print("📈 年份跨度最長的前幾家公司：")
print(longest_span.head())


📈 年份跨度最長的前幾家公司：
         min   max  span
ticker                  
DELL    1998  2021    24
PG      1999  2021    23
A       2000  2021    22
SBUX    2001  2021    21
CVX     2002  2021    20
