In [3]:
import pandas as pd
import numpy as np

# 讀取數據
data = pd.read_csv('../CSR_score/bloomberg_not_include_score.csv')

# 填充缺失的 ticker 欄位
data['ticker'] = data['ticker'].fillna(method='ffill')

# 移除包含 '#N/A Review' 和 '#N/A Invalid Security' 的行
data['2005'] = data['2005'].fillna('#')
sub = '#'
filter = data['2005'].str.contains(sub)
data = data[~filter]

# 計算每個ticker的出現次數，並移除只出現一次的ticker
ticker_counts = data['ticker'].value_counts()
tickers_once = ticker_counts[ticker_counts == 1].index.tolist()
data = data[~data['ticker'].isin(tickers_once)]

# 定義一個函數，將日期格式轉換為年份
def extract_year(value):
    if isinstance(value, str) and '/' in value:
        return value.split('/')[0]  # 提取年份
    return value  # 如果不是日期格式，保持原樣

# 對所有列中的元素應用該函數
df = data.applymap(extract_year)

# 定義完整的年份範圍
year_range = list(map(str, range(2005, 2025)))

# 定義一個空的 DataFrame，用來存放合併後的資料
merged_data = pd.DataFrame()

# 迭代每個 ticker 的配對行
for i in range(0, len(df), 2):
    ticker = df.iloc[i]['ticker']  # 取得 ticker
    years = df.iloc[i][1:]  # 取得年份
    scores = df.iloc[i+1][1:]  # 取得分數

    # 去除重複的年份
    unique_years = years.drop_duplicates()

    # 建立一個新的 DataFrame，將唯一年份與分數結合成一行
    merged_row = pd.DataFrame([scores.values[:len(unique_years)]], columns=unique_years.values)
    merged_row.insert(0, 'ticker', ticker)  # 插入 ticker

    # 確保所有年份都有，缺少的年份填上 -1
    merged_row = merged_row.reindex(columns=['ticker'] + year_range, fill_value=0)
    
    # 合併結果
    merged_data = pd.concat([merged_data, merged_row], ignore_index=True)

# 顯示結果
print(merged_data.head())

# 將合併後的數據保存到csv
merged_data.to_csv('../CSR_score/bloomberg_not_include_score_processed.csv', index=False)

  data['ticker'] = data['ticker'].fillna(method='ffill')
  df = data.applymap(extract_year)


  ticker 2005    2006     2007     2008     2009     2010     2011     2012  \
0   CEQP    0       0        0        0        0        0        0        0   
1   ZEAL    0  6.7096   11.689   11.689  20.3501  25.7922  29.1922  29.2828   
2   LOGI    0       0        0  29.4337  29.4337  32.1597  32.1597  30.2485   
3    PTC    0       0  26.9088  26.9088  26.9088  26.9088  26.9088  27.7437   
4   EGLE    0       0        0        0        0  26.9088  26.9088  26.9088   

      2013  ...     2015     2016     2017     2018     2019     2020  \
0        0  ...  18.4891  24.3436  32.3308  37.4912   51.866  45.8807   
1  29.2828  ...   24.072  30.3088  30.6508  35.2983  35.2983  34.7752   
2  33.4876  ...  43.9091  45.9813  47.8926  49.8541  58.4448   65.768   
3  30.7313  ...  31.8378  29.0313   32.029  44.1605  48.3754   46.434   
4  26.9088  ...  27.9348  27.9348  31.2645  31.2645  31.2645  36.2237   

      2021     2022     2023 2024  
0  56.4128  59.5011        0    0  
1  34.7752  34

In [7]:
nan_data = pd.read_csv('dataset/first_time_nan_tickers_processed.csv')
new_data = pd.read_csv('dataset/first_time_new_tickers_processed.csv')
# 合併兩個數據集
tickers_esg_disclosure_score_1009 = pd.concat([nan_data, new_data], ignore_index=True)
tickers_esg_disclosure_score_1009.to_csv('dataset/tickers_esg_disclosure_score_1009.csv', index=False)

In [10]:
import pandas as pd
ticker = pd.read_csv('dataset/ticker.csv')
ticker_all = ticker['ticker'].tolist()
tickers_esg_disclosure_score_1009 = pd.read_csv('dataset/tickers_esg_disclosure_score_1009.csv')
tickers_esg_disclosure_score_1009 = tickers_esg_disclosure_score_1009['ticker'].tolist()
tickers_notfound_1009 = list(set(ticker_all) - set(tickers_esg_disclosure_score_1009))
tickers_notfound_1009 = pd.DataFrame(tickers_notfound_1009, columns=['ticker'])
tickers_notfound_1009.to_csv('dataset/temp.csv', index=False)
# 將未找到的 ticker 每一個之間空一格 
# 把first_time_ticker裡面的ticker隔一行放一個
with open('dataset/temp.csv', 'r') as f:
    with open('dataset/tickers_notfound_1009.csv', 'w') as f1:
        for line in f:
            f1.write(line)
            f1.write('\n')