# Delete report with year2 (overlapping)

In [48]:
import pandas as pd

In [49]:
report_all = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_all.csv")

In [50]:
report_all_without_overlap = report_all[report_all['year2'].isna()]
report_all_without_overlap.drop(columns=['year2'], inplace=True)
report_all_without_overlap.to_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_all_without_overlap.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_all_without_overlap.drop(columns=['year2'], inplace=True)


# Merge csr embeddings

In [10]:
import pandas as pd
import re

In [11]:
csr = pd.read_csv("../output_dataset/csr_embeddings_all.csv")

In [12]:
pattern = re.compile(r'_(\d+)\b')  # 抓出底線開頭的數字，例如 _2018 -> 2018

def has_invalid_year_format(name):
    matches = pattern.findall(name)
    return any(len(m) != 4 for m in matches)

# 找出不符合的行
mask = csr['file_name'].apply(has_invalid_year_format)

# 印出不合規的 file_name
print(csr[mask]['file_name'])

Series([], Name: file_name, dtype: object)


In [13]:
pattern = re.compile(r'_(\d+)\b')

def fix_invalid_year_format(name):
    # 找出所有底線後的數字
    matches = re.findall(r'_(\d+)', name)
    
    if not matches:
        return name

    # 將每個數字轉成前4位
    trimmed_years = [m[:4] for m in matches]

    # 用前面的部分（不含年份）+ 修正後的年份串起來
    prefix = re.split(r'_(\d+)', name)[0]  # 保留名稱前綴（如 NASDAQ_AMZN）
    return f"{prefix}_" + "_".join(trimmed_years)

# 套用修正
csr['file_name'] = csr['file_name'].apply(fix_invalid_year_format)

In [14]:
# 找出包含兩個年份的
# 定義正則表達式：找出 file_name 中出現兩個年份（例如 2007_2008）
pattern = re.compile(r'.*\d{4}_\d{4}.*')

# 找出 file_name 符合條件的行
mask = csr['file_name'].apply(lambda x: bool(pattern.match(x)))

# 印出要刪除的行（可選）
print("將被刪除的檔案：")
print(csr[mask]['file_name'])

# 刪除這些行
csr = csr[~mask].reset_index(drop=True)

將被刪除的檔案：
165      NASDAQ_AAON_2018_2019
166      NASDAQ_AAON_2019_2020
167      NASDAQ_AAON_2020_2021
168      NASDAQ_AAON_2021_2022
176      NASDAQ_AAWW_2018_2019
191       NASDAQ_ADI_2012_2013
192       NASDAQ_ADI_2014_2015
196      NASDAQ_ADSK_2010_2011
197      NASDAQ_ADSK_2011_2012
198      NASDAQ_ADSK_2012_2013
199      NASDAQ_ADSK_2013_2014
200      NASDAQ_ADSK_2014_2015
238      NASDAQ_ALKS_2018_2019
240      NASDAQ_ALKS_2019_2020
257       NASDAQ_AMD_2010_2011
258       NASDAQ_AMD_2011_2012
259       NASDAQ_AMD_2012_2013
260       NASDAQ_AMD_2013_2014
261       NASDAQ_AMD_2014_2015
262       NASDAQ_AMD_2019_2020
263       NASDAQ_AMD_2020_2021
264       NASDAQ_AMD_2021_2022
280      NASDAQ_ANSS_2017_2018
334      NASDAQ_BRKS_2018_2019
335      NASDAQ_BRKS_2019_2020
336      NASDAQ_BRKS_2020_2021
353      NASDAQ_CBSH_2016_2017
354      NASDAQ_CBSH_2018_2019
414      NASDAQ_CONE_2019_2020
416      NASDAQ_CONE_2021_2022
431      NASDAQ_CRMT_2020_2021
432      NASDAQ_CRMT_2021_2022

In [16]:
# 抓出所有 file_name 中的數字（不限底線），再檢查是否都 <= 2019
def all_years_leq_2019(name):
    years = re.findall(r'\d+', name)  # 抓出所有數字
    return all(int(y) <= 2019 for y in years)

# 建立條件 mask
mask = csr['file_name'].apply(all_years_leq_2019)

# 篩選出符合條件的資料
csr_filtered = csr[mask].reset_index(drop=True)

# 可選：印出被保留的 file_name
print(csr_filtered['file_name'])

0        NASDAQ_AMZN_2019
1        NASDAQ_BRKS_2019
2        NASDAQ_GILD_2016
3        NASDAQ_GILD_2017
4        NASDAQ_GILD_2018
5        NASDAQ_GILD_2019
6        NASDAQ_MIDD_2019
7           NYSE_AES_2015
8           NYSE_AES_2016
9           NYSE_AES_2017
10        NYSE_AKO-B_2014
11        NYSE_AKO-B_2015
12        NYSE_AKO-B_2016
13        NYSE_AKO-B_2017
14        NYSE_AKO-B_2018
15        NYSE_AKO-B_2019
16          NYSE_AVY_2019
17          NYSE_CDP_2014
18          NYSE_CDP_2016
19          NYSE_CDP_2017
20          NYSE_CDP_2018
21          NYSE_CDP_2019
22          NYSE_CHU_2016
23          NYSE_CHU_2017
24          NYSE_CHU_2018
25          NYSE_CHU_2019
26          NYSE_COR_2016
27          NYSE_COR_2017
28          NYSE_COR_2018
29          NYSE_COR_2019
30          NYSE_CRM_2012
31          NYSE_CRM_2014
32          NYSE_CRM_2016
33          NYSE_CRM_2017
34          NYSE_CRM_2018
35          NYSE_CRM_2019
36          NYSE_CTB_2014
37          NYSE_CTB_2016
38          

In [17]:
csr_filtered.to_csv("../output_dataset/csr_embeddings_leq2019.csv", index=False)

# Filing csr report

In [18]:
import pandas as pd
pd.set_option('display.max_rows', None)

In [19]:
csr = pd.read_csv("../output_dataset/csr_embeddings_leq2019.csv")

In [20]:
csr.head()

Unnamed: 0,file_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_1014,dim_1015,dim_1016,dim_1017,dim_1018,dim_1019,dim_1020,dim_1021,dim_1022,dim_1023
0,NASDAQ_AMZN_2019,-0.013089,0.263785,-0.097623,-0.03307,0.000627,0.072135,-0.044153,0.016454,0.148158,...,-0.089429,-0.371656,0.322189,0.287161,0.36437,-0.222883,-0.286277,-0.081639,-0.161185,0.141298
1,NASDAQ_BRKS_2019,-0.030708,0.24946,-0.048373,-0.038293,-0.003813,0.086812,-0.120732,-0.073833,0.102131,...,0.048801,-0.301498,0.290007,0.362251,0.411211,-0.097284,-0.267405,-0.008784,-0.136006,0.132663
2,NASDAQ_GILD_2016,-0.086015,0.255328,-0.011454,0.026999,-0.120548,0.098053,-0.151887,-0.091115,-0.018867,...,-0.006605,-0.384335,0.256266,0.356898,0.28226,-0.178336,-0.302124,-0.008873,-0.080819,0.121339
3,NASDAQ_GILD_2017,-0.065964,0.268716,0.020811,0.020799,-0.161803,0.068956,-0.13644,-0.065431,-0.03825,...,-0.014895,-0.407098,0.259158,0.357559,0.246257,-0.187174,-0.374117,0.010569,-0.065381,0.150525
4,NASDAQ_GILD_2018,-0.022194,0.237016,-0.015546,-0.03613,-0.12012,0.11003,-0.120481,-0.068459,0.011013,...,-0.030991,-0.394415,0.276254,0.393782,0.290036,-0.19376,-0.362832,0.005057,-0.076889,0.169497


In [32]:
# 假設你的原始 DataFrame 是 csr
df = csr.copy()

# 拆分 file_name 成 exchange、ticker、year
df[['exchange', 'ticker', 'year']] = df['file_name'].str.extract(r'([^_]+)_([^_]+)_(\d+)$')
df = df[df['year'].notna()].copy()
df['year'] = df['year'].astype(int)

# 標記原始資料為非遞補
df['is_imputed'] = False

# 建立一個新表用來儲存補齊的 rows
rows_to_add = []

# 依據 exchange + ticker 分組處理
for (exchange, ticker), group in df.groupby(['exchange', 'ticker']):
    years = sorted(group['year'].tolist())
    all_years = list(range(min(years), max(years) + 1))

    existing_years = set(years)
    for y in all_years:
        if y not in existing_years:
            prev_year = y - 1
            prev_row = group[group['year'] == prev_year]
            if not prev_row.empty:
                # 複製前一年資料並更新年份與 file_name
                new_row = prev_row.copy()
                new_row['year'] = y
                new_row['file_name'] = f'{exchange}_{ticker}_{y}'
                new_row['is_imputed'] = True
                rows_to_add.append(new_row)

# 合併所有資料（原始 + 遞補）
if rows_to_add:
    imputed_df = pd.concat(rows_to_add, ignore_index=True)
    result_df = pd.concat([df, imputed_df], ignore_index=True)
else:
    result_df = df.copy()

# 最後依照 exchange、ticker、year 排序並清理暫時欄位
result_df = result_df.sort_values(by=['exchange', 'ticker', 'year']).reset_index(drop=True)
result_df = result_df.drop(columns=['exchange', 'ticker', 'year'])

# 顯示結果（包括 file_name、is_imputed 和所有維度）
print(result_df[['file_name', 'is_imputed']])

# 儲存標註檔（file_name + is_imputed）
result_df[['file_name', 'is_imputed']].to_csv('../output_dataset/csr_embeddings_leq2019_flag.csv', index=False)

# 儲存向量資料檔（file_name + dim_0 ~ dim_1023）
dim_cols = ['file_name'] + [col for col in result_df.columns if col.startswith('dim_')]
result_df[dim_cols].to_csv('../output_dataset/csr_embeddings_leq2019_filled.csv', index=False)


              file_name  is_imputed
0       NASDAQ_AAL_2007       False
1       NASDAQ_AAL_2008       False
2       NASDAQ_AAL_2009       False
3       NASDAQ_AAL_2010        True
4       NASDAQ_AAL_2011       False
5       NASDAQ_AAL_2012       False
6       NASDAQ_AAL_2013       False
7       NASDAQ_AAL_2014       False
8       NASDAQ_AAL_2015       False
9       NASDAQ_AAL_2016       False
10      NASDAQ_AAL_2017       False
11      NASDAQ_AAL_2018       False
12      NASDAQ_AAL_2019       False
13     NASDAQ_AAPL_2014       False
14     NASDAQ_AAPL_2015       False
15     NASDAQ_AAPL_2016       False
16     NASDAQ_AAPL_2017       False
17     NASDAQ_AAPL_2018       False
18     NASDAQ_AAPL_2019       False
19     NASDAQ_ACIW_2019       False
20     NASDAQ_ADBE_2013       False
21     NASDAQ_ADBE_2014       False
22     NASDAQ_ADBE_2015       False
23     NASDAQ_ADBE_2016       False
24     NASDAQ_ADBE_2017       False
25     NASDAQ_ADBE_2018       False
26     NASDAQ_ADBE_2019     