# 看年份有重疊的企業 CSR 報告，對應的 CSR 分數差距之統計數據

## 分割名稱

In [1]:
import pandas as pd

In [2]:
hm_NASDAQ_divided = pd.read_csv('../output_dataset/handmade_features_all_v1_corrected_txt.csv')

In [4]:
# 分割 folder_name 欄位
split_cols = hm_NASDAQ_divided['folder_name'].str.split('_', expand=True)

# 指定新欄位名稱
hm_NASDAQ_divided['exchange'] = split_cols[0]
hm_NASDAQ_divided['ticker'] = split_cols[1]
hm_NASDAQ_divided['year1'] = split_cols[2]
hm_NASDAQ_divided['year2'] = split_cols[3] if split_cols.shape[1] > 3 else ''

# 如果只有三個部分的話，補空值
if split_cols.shape[1] == 3:
    hm_NASDAQ_divided['year2'] = ''
hm_NASDAQ_divided.to_csv('../output_dataset/handmade_features_all_v1_corrected_txt_divided.csv', index=False)

## 找出前後報告有重疊年份的

In [9]:
import pandas as pd

In [10]:
hm_divided = pd.read_csv('../output_dataset/handmade_features_all_v1_corrected_txt_divided.csv')

In [11]:
# drop those rows with NaN
hm_divided_cover = hm_divided.dropna()[['exchange', 'ticker', 'year1', 'year2']]
hm_divided_cover.sort_values(by=['exchange', 'ticker', 'year1', 'year2'], inplace=True)

In [12]:
def get_continuous_coverage(df):
    df = df.sort_values(by=['ticker', 'year1']).copy()

    df['year1'] = pd.to_numeric(df['year1'], errors='coerce')
    df['year1'] = pd.to_numeric(df['year1'], errors='coerce')
    
    df['prev_year'] = df.groupby('ticker')['year1'].shift(1)
    df['year_diff'] = df['year1'] - df['prev_year']

    # 找出連續區段的編號
    df['continuous_group'] = (df['year_diff'] != 1).cumsum()

    # 針對每個 ticker，再依據 continuous_group 分群，保留群組長度 ≥ 2 的
    def filter_group(g):
        group_counts = g.groupby('continuous_group').size()
        valid_groups = group_counts[group_counts >= 2].index
        return g[g['continuous_group'].isin(valid_groups)]

    df = df.groupby('ticker').apply(filter_group).reset_index(drop=True)
    return df.drop(columns=['prev_year', 'year_diff', 'continuous_group'])

# 使用在你的 dataframe 上
hm_divided_overlapping = get_continuous_coverage(hm_divided_cover)

  df = df.groupby('ticker').apply(filter_group).reset_index(drop=True)


In [13]:
hm_divided_overlapping.to_csv('../output_dataset/overlapping_report_v1.csv', index=False)

## 算出有重疊年份報告的後年-前年分數差

In [14]:
import pandas as pd
overlapping_report = pd.read_csv('../output_dataset/overlapping_report_v1.csv')
overlapping_report.head(5)

Unnamed: 0,exchange,ticker,year1,year2
0,NASDAQ,AAON,2018,2019
1,NASDAQ,AAON,2019,2020
2,NASDAQ,AAON,2020,2021
3,NASDAQ,AAON,2021,2022
4,NASDAQ,ADSK,2010,2011


In [15]:
common_score = pd.read_csv('../CSR_score/common_score_ESG_all.csv')
common_score.head(5)

Unnamed: 0,ticker,2005,2006,2007,2008,2009,2010,2011,2012,2013,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,A,0.0,-23.470974,-26.148861,-29.756141,-22.694779,-23.446583,-22.304174,-18.211183,-13.161583,...,-14.982541,-25.223024,-24.518057,-32.378512,-20.763049,-22.644505,-27.349438,-26.377082,0.0,0.0
1,AA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-20.573605,-19.01964,-16.708301,-12.113294,-7.47721,-8.461853,-7.637923,-4.331176,0.0
2,AAL,-21.859142,-3.090015,-32.270374,-42.85253,-38.978609,-48.721519,-51.990725,-55.278625,-51.441634,...,-32.727042,-29.976215,-19.834347,-20.556871,-19.166368,-20.265235,-18.718876,-14.643682,0.0,0.0
3,AAOI,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,12.733437,14.191309,14.873884,14.947325,19.507328,25.54949,18.07804,0.0,0.0
4,AAON,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.1096,9.907815,12.791927,8.343231,-3.828476,-6.211471,-3.948939,-8.713777,1.430064,0.0


In [16]:
import pandas as pd

# 確保 ticker 格式一致（去除空格 & 轉大寫）
overlapping_report['ticker'] = overlapping_report['ticker'].str.strip().str.upper()
common_score['ticker'] = common_score['ticker'].str.strip().str.upper()

# 找出 missing_tickers
missing_tickers = set(overlapping_report['ticker'].unique()) - set(common_score['ticker'].unique())
print("\n🚨 無法在 common_score 中找到的 ticker:", missing_tickers)

# 移除 `missing_tickers`，只保留 common_score 有的 ticker
filtered_overlapping_report = overlapping_report[~overlapping_report['ticker'].isin(missing_tickers)].copy()
filtered_overlapping_report


🚨 無法在 common_score 中找到的 ticker: set()


Unnamed: 0,exchange,ticker,year1,year2
0,NASDAQ,AAON,2018,2019
1,NASDAQ,AAON,2019,2020
2,NASDAQ,AAON,2020,2021
3,NASDAQ,AAON,2021,2022
4,NASDAQ,ADSK,2010,2011
...,...,...,...,...
114,NASDAQ,VRTU,2021,2022
115,NASDAQ,XLNX,2015,2016
116,NASDAQ,XLNX,2016,2017
117,NASDAQ,XLNX,2018,2019


In [17]:
# 轉換 common_score 為 long format
common_score_long = common_score.melt(id_vars=['ticker'], var_name='year', value_name='GWscore')
common_score_long['year'] = common_score_long['year'].astype(int)  # 確保 year 是 int

# 合併 year1 的 GWscore
filtered_overlapping_report = filtered_overlapping_report.merge(
    common_score_long, left_on=['ticker', 'year1'], right_on=['ticker', 'year'], how='left'
)
filtered_overlapping_report.rename(columns={'GWscore': 'GWscore_year1'}, inplace=True)
filtered_overlapping_report.drop(columns=['year'], inplace=True)  # 刪除額外的 year 欄位

# 合併 year2 的 GWscore
filtered_overlapping_report = filtered_overlapping_report.merge(
    common_score_long, left_on=['ticker', 'year2'], right_on=['ticker', 'year'], how='left'
)
filtered_overlapping_report.rename(columns={'GWscore': 'GWscore_year2'}, inplace=True)
filtered_overlapping_report.drop(columns=['year'], inplace=True)  # 刪除額外的 year 欄位

# 計算 GWscore_diff
filtered_overlapping_report.loc[:, 'GWscore_diff'] = (
    filtered_overlapping_report['GWscore_year2'] - filtered_overlapping_report['GWscore_year1']
).abs()

filtered_overlapping_report

Unnamed: 0,exchange,ticker,year1,year2,GWscore_year1,GWscore_year2,GWscore_diff
0,NASDAQ,AAON,2018,2019,8.343231,-3.828476,12.171707
1,NASDAQ,AAON,2019,2020,-3.828476,-6.211471,2.382995
2,NASDAQ,AAON,2020,2021,-6.211471,-3.948939,2.262532
3,NASDAQ,AAON,2021,2022,-3.948939,-8.713777,4.764838
4,NASDAQ,ADSK,2010,2011,-16.220779,-12.434045,3.786734
...,...,...,...,...,...,...,...
114,NASDAQ,VRTU,2021,2022,0.000000,0.000000,0.000000
115,NASDAQ,XLNX,2015,2016,-6.982054,-9.695660,2.713606
116,NASDAQ,XLNX,2016,2017,-9.695660,-6.623809,3.071851
117,NASDAQ,XLNX,2018,2019,-8.218435,-8.202509,0.015925


In [18]:
filtered_overlapping_report['GWscore_diff'].describe()

count    119.000000
mean       6.396703
std        7.073971
min        0.000000
25%        1.981817
50%        3.755626
75%        8.337943
max       41.766601
Name: GWscore_diff, dtype: float64

In [19]:
filtered_overlapping_report.to_csv('../output_dataset/overlapping_report_score_v1.csv', index=False)

# Describe of all the greenwashing score

In [12]:
import pandas as pd

In [13]:
common_score = pd.read_csv('../CSR_score/common_score_ESG.csv')

In [14]:
common_score.head(5)

Unnamed: 0,ticker,2005,2006,2007,2008,2009,2010,2011,2012,2013,...,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
0,PMT,0.0,0.0,-14.460174,-28.895211,-27.795259,3.226296,8.438004,5.548695,6.044652,...,15.010161,19.20571,17.039996,18.401621,14.321301,18.76127,25.068604,23.502445,-10.68042,0.0
1,ACVA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.787916,-28.256582,0.0
2,TRP,-1.973488,5.840225,-5.171804,-24.878843,-28.903738,-28.397471,-32.70743,-24.143747,-24.73266,...,-8.441327,2.133232,13.172975,11.473292,15.849688,14.023199,18.216672,24.16478,-51.588543,0.0
3,VRTX,-8.013624,-31.797662,-7.561376,1.085975,8.569791,9.99545,7.01022,11.440721,13.915639,...,5.861065,-0.139335,-6.616973,-25.312459,-14.515943,1.044804,-8.244195,-6.610504,0.0,0.0
4,VTR,0.0,0.0,-3.601887,-3.202949,-4.292841,1.42662,1.760391,-9.865475,-23.884559,...,-25.346508,-34.884566,-28.779756,-26.241391,-24.608606,-23.180766,-21.341284,-23.251231,-79.003113,0.0


In [15]:
common_score_abs = common_score.set_index('ticker').abs()
common_score_abs.head(5)

Unnamed: 0_level_0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
PMT,0.0,0.0,14.460174,28.895211,27.795259,3.226296,8.438004,5.548695,6.044652,13.201703,15.010161,19.20571,17.039996,18.401621,14.321301,18.76127,25.068604,23.502445,10.68042,0.0
ACVA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.787916,28.256582,0.0
TRP,1.973488,5.840225,5.171804,24.878843,28.903738,28.397471,32.70743,24.143747,24.73266,18.111797,8.441327,2.133232,13.172975,11.473292,15.849688,14.023199,18.216672,24.16478,51.588543,0.0
VRTX,8.013624,31.797662,7.561376,1.085975,8.569791,9.99545,7.01022,11.440721,13.915639,5.374071,5.861065,0.139335,6.616973,25.312459,14.515943,1.044804,8.244195,6.610504,0.0,0.0
VTR,0.0,0.0,3.601887,3.202949,4.292841,1.42662,1.760391,9.865475,23.884559,28.567191,25.346508,34.884566,28.779756,26.241391,24.608606,23.180766,21.341284,23.251231,79.003113,0.0


In [16]:
common_score_abs.describe()

Unnamed: 0,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,2024
count,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0,2370.0
mean,5.938967,5.375761,4.2571,4.939473,5.461566,5.30657,5.379809,5.382028,5.214452,5.364714,6.535165,8.270042,9.578978,10.419808,11.02318,12.411921,11.702094,11.660275,32.762033,2.558816
std,14.288658,13.267737,11.656376,12.087813,12.497105,12.02425,11.977317,12.0344,11.542797,11.932818,11.632066,11.599794,10.778178,10.813538,10.81075,11.458918,9.909229,9.710542,24.533464,11.549895
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.638298,2.125945,2.95728,4.484143,4.344596,4.46222,11.100448,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.110837,6.979703,7.828341,8.722066,9.857581,9.659819,9.616177,28.662918,0.0
75%,0.0,0.0,0.0,0.0,2.60547,3.311789,4.332245,4.393768,4.302041,4.691329,9.420167,12.408532,14.962984,15.361736,15.653885,16.521235,16.273066,16.592293,52.146376,0.0
max,83.167035,85.662887,86.415671,83.830697,86.157347,90.235591,86.876839,88.228327,85.536586,81.970875,91.059522,86.360487,83.964389,84.547873,83.991328,86.622869,70.42061,70.8549,94.777588,91.2791


In [18]:
summary = common_score_abs.values.flatten()
summary = summary[(summary != 0) & (~pd.isnull(summary))]  # 過濾掉 0 和 NaN

# 計算描述統計
result = pd.Series(summary).describe()
# print(pd.Series(summary).max())
print(result)

count    24097.000000
mean        16.674952
std         15.978122
min          0.001741
25%          5.675558
50%         12.025461
75%         21.639404
max         94.777588
dtype: float64
