# 列出所有的報告

In [2]:
import pandas as pd

In [3]:
hm_NASDAQ_divided = pd.read_csv('../output_dataset/handmade_features_NASDAQ_divided.csv')
hm_NYSE_divided = pd.read_csv('../output_dataset/handmade_features_NYSE_divided.csv')

In [4]:
report_all = pd.concat([hm_NASDAQ_divided, hm_NYSE_divided], axis=0)
report_all = report_all[["folder_name", "exchange", "ticker", "year1",	"year2"]]
report_all.sort_values(by=["folder_name", "exchange", "ticker", "year1", "year2"], inplace=True)

In [5]:
report_all.to_csv('../output_dataset/report_all.csv', index=False)
len(report_all)

6098

# 算出每一年的報告數量

In [8]:
import pandas as pd

In [9]:
report_all = pd.read_csv('../output_dataset/report_all.csv')

In [17]:
# Calculate the number of report in each year
report_all['year1'] = report_all['year1'].astype(str)
groupby_year = report_all.groupby(['year1']).size().reset_index(name='count')
groupby_year_wide = groupby_year.set_index('year1').T
groupby_year_wide.to_csv('../output_dataset/groupby_year.csv', index=False)

## Merge score and report

In [45]:
import pandas as pd
import numpy as np

In [46]:
report_all = pd.read_csv('../output_dataset/report_all.csv')
common_score_ESG_all = pd.read_csv('../CSR_score/common_score_ESG_all.csv')

In [47]:
# drop those rows that ticker in common_score_ESG_all not in report_all
common_ticker = set(report_all['ticker']) & set(common_score_ESG_all['ticker'])
common_score_ESG_with_report = common_score_ESG_all[common_score_ESG_all['ticker'].isin(common_ticker)]
report_with_score = report_all[report_all['ticker'].isin(common_ticker)]

In [48]:
# 確保 ticker 格式一致（去除空格 & 轉大寫）
report_all['ticker'] = report_all['ticker'].str.strip().str.upper()
common_score_ESG_all['ticker'] = common_score_ESG_all['ticker'].str.strip().str.upper()

# 把 year1 和 year2 如果不為空就轉成整數格式
report_all['year1'] = report_all['year1'].replace('', np.nan).astype(float).astype('Int64')
report_all['year2'] = report_all['year2'].replace('', np.nan).astype(float).astype('Int64')

# 找出 missing_tickers
missing_tickers = set(report_all['ticker'].unique()) - set(common_score_ESG_all['ticker'].unique())
print("\n🚨 無法在 common_score_ESG_all 中找到的 ticker:", missing_tickers)

# 移除 `missing_tickers`，只保留 common_score_ESG_all 有的 ticker
filtered_report_all = report_all[~report_all['ticker'].isin(missing_tickers)].copy()
filtered_report_all


🚨 無法在 common_score_ESG_all 中找到的 ticker: {'LOGI', 'SBLK', 'ARNC', 'AHS', 'AAR', 'KEM', 'PANL', 'WF', 'VIST', 'BF', 'PBR', 'DPH', 'MYL', 'GTI', 'SUPV', 'DBRG', 'GSH', 'BVN', 'KB', 'LAZ', 'TSLA', 'CLB', 'DOO', 'CSIQ', 'HASI', 'ALXN', 'MNR', 'SNE', 'NGLS', 'CIT', 'MIC', 'ACH', 'UTX', 'INFO', 'ZMH', 'CUB', 'AZ', 'HDB', 'TETY', 'AMRX', 'RCI', 'MEOH', 'ABEV', 'RDS', 'TVE', 'AU', 'ENIC', 'HON', 'CHT', 'KIM', 'WYND', 'LBTYA', 'LXK', 'PEAK', 'TSE', 'AZPN', 'SHEL', 'CTL', 'PETD', 'IBA', 'ZNH', 'CHL', 'VALE', 'SALT', 'PSA', 'GPS', 'TORM', 'CPIX', 'UUUU', 'DLGS', 'GAM', 'SQM', 'VLRS', 'NHS', 'GOOG', 'WNFM', 'SERV', 'LIN', 'GLOP', 'SGS', 'QVCC', 'CWEN', 'E', 'RELX', 'CINR', 'CRGY', 'LPL', 'KKR', 'Z', 'COLD', 'HPQ', 'JEC', 'FCAU', 'ERIC', 'CRD', 'DBD', 'SPXC', 'LND', 'FMX', 'KNOP', 'FS', 'AGII', 'NBL', 'DPS', 'NVS', 'HFC', 'CMN', 'DNR', 'SYT', 'RDY', 'CAMP', 'SPP', 'TAK', 'SAFM'}


Unnamed: 0,folder_name,exchange,ticker,year1,year2
0,NASDAQ_AAL_2007,NASDAQ,AAL,2007,
1,NASDAQ_AAL_2008,NASDAQ,AAL,2008,
2,NASDAQ_AAL_2009,NASDAQ,AAL,2009,
3,NASDAQ_AAL_2011,NASDAQ,AAL,2011,
4,NASDAQ_AAL_2012,NASDAQ,AAL,2012,
...,...,...,...,...,...
6070,NYSE_ZBH_2020,NYSE,ZBH,2020,
6071,NYSE_ZBH_2021,NYSE,ZBH,2021,
6072,NYSE_ZEN_2019,NYSE,ZEN,2019,
6073,NYSE_ZEN_2020,NYSE,ZEN,2020,


In [49]:
# 轉換 common_score 為 long format
common_score_long = common_score_ESG_all.melt(id_vars=['ticker'], var_name='year', value_name='GWscore')
common_score_long['year'] = common_score_long['year'].astype(int)  # 確保 year 是 int

# 合併 year1 的 GWscore
filtered_report_all = report_all.merge(
    common_score_long, left_on=['ticker', 'year1'], right_on=['ticker', 'year'], how='left'
)
filtered_report_all.rename(columns={'GWscore': 'GWscore_year1'}, inplace=True)
filtered_report_all.drop(columns=['year'], inplace=True)

# 合併 year2 的 GWscore
filtered_report_all = filtered_report_all.merge(
    common_score_long, left_on=['ticker', 'year2'], right_on=['ticker', 'year'], how='left'
)
filtered_report_all.rename(columns={'GWscore': 'GWscore_year2'}, inplace=True)
filtered_report_all.drop(columns=['year'], inplace=True)

# 若 year2 是空，則使用 year1 分數；否則取兩者平均
filtered_report_all['GWscore_final'] = filtered_report_all.apply(
    lambda row: row['GWscore_year1'] if pd.isna(row['year2']) or row['year2'] == '' 
    else (row['GWscore_year1'] + row['GWscore_year2']) / 2,
    axis=1
)

filtered_report_all.rename(columns={'GWscore_final': 'greenwash_score'}, inplace=True)
filtered_report_all.drop(columns=['GWscore_year1', 'GWscore_year2'], inplace=True)
filtered_report_all.dropna(subset=['greenwash_score'], inplace=True)

In [51]:
filtered_report_all.to_csv('../output_dataset/report_all_with_score.csv', index=False)

# 算出每一家公司的 GW score 統計量

In [None]:
import pandas as pd
import numpy as np

In [None]:
report_all = pd.read_csv('../output_dataset/report_all.csv')
common_score_ESG_all = pd.read_csv('../CSR_score/common_score_ESG_all.csv')

In [None]:
# drop those rows that ticker in common_score_ESG_all not in report_all
common_ticker = set(report_all['ticker']) & set(common_score_ESG_all['ticker'])
common_score_ESG_with_report = common_score_ESG_all[common_score_ESG_all['ticker'].isin(common_ticker)]
report_with_score = report_all[report_all['ticker'].isin(common_ticker)]

In [65]:
# find the first year of each ticker
ticker_start = report_with_score['ticker'].unique()
ticker_start = pd.DataFrame(ticker_start, columns=['ticker'])
ticker_start['year1'] = report_with_score.groupby('ticker')['year1'].min().values
ticker_start.to_csv('../output_dataset/ticker_start.csv', index=False)

In [61]:
common_score_ESG_with_report = common_score_ESG_with_report.replace(0, np.nan)
common_score_ESG_with_report = common_score_ESG_with_report.set_index('ticker')

ticker_start = ticker_start.set_index('ticker')
ticker_start['year1'] = ticker_start['year1'].astype(str)

In [64]:
std_result = {}

for ticker in ticker_start.index:
    start_year = ticker_start.loc[ticker, 'year1']

    all_years = common_score_ESG_with_report.columns
    valid_years = [y for y in all_years if y >= start_year]
    scores = common_score_ESG_with_report.loc[ticker, valid_years].dropna()
    std = scores.std()
    std_result[ticker] = std

std_result = pd.DataFrame.from_dict(std_result, orient='index', columns=['std']).reset_index()
std_result.columns = ['ticker', 'std']
std_result.to_csv('../output_dataset/std_result.csv', index=False)

有一些企業沒有std，是因為有報告的起始年份開始沒有分數

# Describe of GW score std 

In [66]:
import pandas as pd
import numpy as np

In [67]:
std_result = pd.read_csv('../output_dataset/std_result.csv')

In [68]:
std_result['std'].describe()

count    999.000000
mean       6.412693
std        4.310507
min        0.173531
25%        3.647040
50%        5.544497
75%        7.858265
max       38.497985
Name: std, dtype: float64