# Merge permno_cusip_ticker_permco and crsp_comp_bdx to get gvkey in it

In [38]:
import pandas as pd

In [39]:
# read the xlsx file
gvkey = pd.read_csv('/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/CRSP_COMP_BDX_GVKEY.csv')
permco = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/PERMNO_CUSIP_TICKER_PERMCO.csv")

In [40]:
print("gvkey shape: ",gvkey.shape)
print("permco shape: ",permco.shape)

gvkey shape:  (11843, 6)
permco shape:  (187338, 18)


In [41]:
print("gvkey columns: ",gvkey.columns)
print("permco columns: ",permco.columns)

gvkey columns:  Index(['COMPANYID', 'PERMCO', 'GVKEY', 'SCORE', 'PREFERRED', 'DUPLICATE'], dtype='object')
permco columns:  Index(['PERMNO', 'FSYM_ID', 'FSYM_ID_KIND', 'PROPER_NAME', 'FSYM_REGIONAL_ID',
       'FSYM_SECURITY_ID', 'FS_PERM_SEC_ID', 'FACTSET_ENTITY_ID',
       'ENTITY_PROPER_NAME', 'CUSIP_FS', 'TICKER_EXCHANGE', 'PERMCO', 'CUSIP',
       'NCUSIP', 'TICKER', 'COMNAM', 'LINK_BDATE', 'LINK_EDATE'],
      dtype='object')


In [42]:
# Step 0: 篩選欄位 & 移除重複列
permco_less = permco[[
    'PERMNO', 'PROPER_NAME', 'ENTITY_PROPER_NAME', 'CUSIP_FS',
    'TICKER_EXCHANGE', 'PERMCO', 'CUSIP', 'NCUSIP', 'TICKER',
    'COMNAM', 'LINK_BDATE', 'LINK_EDATE'
]].copy()

permco_less.drop_duplicates(inplace=True)

# Step 1: 確保 LINK_EDATE 是數值型別（避免 NaN 比較出錯）
permco_less['LINK_EDATE'] = pd.to_numeric(permco_less['LINK_EDATE'], errors='coerce')

# Step 2: 根據 PERMNO 保留 LINK_EDATE 最大的那一筆
latest_permco = permco_less.loc[
    permco_less.groupby('PERMNO')['LINK_EDATE'].idxmax()
].reset_index(drop=True)

In [44]:
# 確保是字串格式
latest_permco['TICKER_EXCHANGE'] = latest_permco['TICKER_EXCHANGE'].astype(str)

# 拆分欄位
latest_permco[['TICKER_SYMBOL', 'EXCHANGE']] = latest_permco['TICKER_EXCHANGE'].str.split('-', expand=True)
latest_permco = latest_permco[latest_permco['EXCHANGE'].isin(['NAS', 'NYS'])].reset_index(drop=True)
latest_permco.drop(columns=['TICKER_EXCHANGE', 'TICKER_SYMBOL', 'LINK_BDATE',	'LINK_EDATE'], inplace=True)
latest_permco

Unnamed: 0,PERMNO,PROPER_NAME,ENTITY_PROPER_NAME,CUSIP_FS,PERMCO,CUSIP,NCUSIP,TICKER,COMNAM,EXCHANGE
0,10002,BancTrust Financial Group Inc.,"BancTrust Financial Group, Inc.",05978R107,7954,05978R10,05978R10,BTFG,BANCTRUST FINANCIAL GROUP INC,NAS
1,10003,Great Ctry Bank Ansonia Ct,,390318103,7957,39031810,39031810,GCBK,GREAT COUNTRY BK ASONIA CT,NAS
2,10006,Acf Industries,,000800102,22156,00080010,00080010,ACF,A C F INDUSTRIES INC,NYS
3,10009,Iroquois Bancorp Inc.,"Iroquois Bancorp, Inc. (New York)",463347104,7965,46334710,46334710,IROQ,IROQUOIS BANCORP INC,NAS
4,10010,Cabot Medical Corporation,Cabot Medical Corp.,127095107,7967,12709510,12709510,CBOT,CABOT MEDICAL CORP,NAS
...,...,...,...,...,...,...,...,...,...,...
16293,93428,"BroadSoft, Inc.","BroadSoft, Inc.",11133B409,53446,11133B40,11133B40,BSFT,BROADSOFT INC,NAS
16294,93429,Cboe Global Markets Inc,"Cboe Global Markets, Inc.",12503M108,53447,12503M10,12503M10,CBOE,C B O E GLOBAL MARKETS INC,NAS
16295,93431,Pactera Technology International Ltd. Sponsore...,Pactera Technology International Ltd.,695255109,53449,69525510,69525510,PACT,PACTERA TECHNOLOGY INTL LTD,NAS
16296,93434,S&W Seed Company,S&W Seed Co.,785135104,53427,78513510,78513510,SANW,S & W SEED CO,NAS


In [45]:
permco_gvkey = pd.merge(latest_permco, gvkey[['COMPANYID', 'PERMCO', 'GVKEY']], how='left', on='PERMCO')
print("permco_gvkey shape: ",permco_gvkey.shape)

permco_gvkey shape:  (16722, 12)


In [46]:
permco_gvkey_without_na = permco_gvkey.dropna(subset=['GVKEY'])
print("permco_gvkey_without_na shape: ",permco_gvkey_without_na.shape)

permco_gvkey_without_na shape:  (7669, 12)


In [47]:
permco_gvkey_without_na.to_csv('/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/permno_cusip_ticker_permco_gvkey_without_na.csv', index=False)

## Merge green patent count and forward citation

In [None]:
import pandas as pd

In [None]:
forward = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/total_5yr_forward_citations_current.csv")
count = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/green_patents_count.csv")

In [None]:
print("forward.columns: ",forward.columns)
print("count.columns: ",count.columns)

forward.columns:  Index(['gvkey_numeric', 'filing_year', 'total_5yr_forward_citations',
       'within_5_years'],
      dtype='object')
count.columns:  Index(['gvkey_numeric', 'filing_year', 'patents_count'], dtype='object')


In [None]:
forward_count = pd.merge(forward, count, how='left', on=['gvkey_numeric', 'filing_year'])
forward_count.to_csv('/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/total_5yr_forward_citations_v2_green_current_count.csv', index=False)

## Merge to get ticker

In [None]:
import pandas as pd

In [None]:
forward_count = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count.csv")
mapping = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/permno_cusip_ticker_permco_gvkey_without_na.csv")

In [None]:
print(forward_count.columns)
print(mapping.columns)

Index(['gvkey_numeric', 'filing_year', 'total_5yr_forward_citations',
       'within_5_years', 'patents_count'],
      dtype='object')
Index(['PERMNO', 'FSYM_ID', 'FSYM_ID_KIND', 'PROPER_NAME', 'FSYM_REGIONAL_ID',
       'FSYM_SECURITY_ID', 'FS_PERM_SEC_ID', 'FACTSET_ENTITY_ID',
       'ENTITY_PROPER_NAME', 'CUSIP_FS', 'TICKER_EXCHANGE', 'PERMCO', 'CUSIP',
       'NCUSIP', 'TICKER', 'COMNAM', 'LINK_BDATE', 'LINK_EDATE', 'COMPANYID',
       'GVKEY'],
      dtype='object')


In [None]:
forward_count_ticker = pd.merge(forward_count, mapping[['GVKEY', 'PERMNO', 'CUSIP', 'TICKER']], how='left', left_on='gvkey_numeric', right_on='GVKEY')

In [None]:
forward_count_ticker_dropna = forward_count_ticker.dropna()
forward_count_ticker_dropna.to_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count_ticker.csv", index=False)

In [None]:
report_ticker = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/company_name/company_name_ticker.csv")

In [None]:
forward_count_ticker_dropna = forward_count_ticker_dropna[forward_count_ticker_dropna['TICKER'].isin(report_ticker['ticker'].values)]
forward_count_ticker_dropna.to_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count_ticker_within_report.csv", index=False)

# Merge the ticker in mapping to gp_values by permno

In [1]:
import pandas as pd

In [None]:
gp_value = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/green_patents_values_filing.csv")
mapping = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/original_dataset/PERMNO_CUSIP_TICKER_PERMCO.csv")
csr_ticker_name = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/csr_ticker_name.csv")

In [68]:
from difflib import SequenceMatcher
import pandas as pd

def merge_and_match_best_company(gp_value, mapping, csr_ticker_name):
    """
    將 gp_value 根據 mapping 對應出 TICKER 與 COMNAM，
    再根據 TICKER 對應到 csr_ticker_name 的 COMPANY，
    並保留每筆 (permno, year) 最相似的 COMPANY 一筆。

    Parameters:
        gp_value (DataFrame): 包含 permno, year 等欄位
        mapping (DataFrame): 包含 PERMNO, TICKER, COMNAM
        csr_ticker_name (DataFrame): 包含 TICKER, COMPANY

    Returns:
        DataFrame: 合併後只保留最相似 COMPANY 的資料
    """
    
    # Step 1: 合併 gp_value 與 mapping（PERMNO -> TICKER, COMNAM）
    merged = pd.merge(
        gp_value, 
        mapping[['PERMNO', 'TICKER', 'COMNAM']], 
        how='left', 
        left_on='permno', 
        right_on='PERMNO'
    )
    merged['year'] = merged['year'].astype('int')

    # Step 2: 合併 TICKER -> COMPANY
    merged = pd.merge(
        merged, 
        csr_ticker_name[['ticker', 'COMPANY']], 
        how='left', 
        left_on='TICKER',
        right_on='ticker'
    )

    # Step 3: 對每組 permno-year 找出最相似的 COMPANY（比 COMNAM）
    def get_best_match(group):
        if group.shape[0] == 1:
            return group
        comnam = str(group['COMNAM'].iloc[0])
        group['similarity'] = group['COMPANY'].apply(lambda x: SequenceMatcher(None, comnam, str(x)).ratio())
        return group.loc[[group['similarity'].idxmax()]]

    best_match = merged.groupby(['permno', 'year'], group_keys=False).apply(get_best_match)

    # Step 4: 移除 similarity 欄（如不需要）
    best_match = best_match.drop(columns=['similarity'], errors='ignore')

    return best_match


In [69]:
best_match = merge_and_match_best_company(gp_value, mapping, csr_ticker_name)

  best_match = merged.groupby(['permno', 'year'], group_keys=False).apply(get_best_match)


In [None]:
best_match.to_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/green_patent_values_best_match.csv", index=False)

In [16]:
import pandas as pd
best_match = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/green_patent_values_best_match.csv")
report_ticker = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/company_name/company_name_ticker.csv")

In [17]:
best_match = best_match[best_match['TICKER'].isin(report_ticker['ticker'].values)]
best_match.to_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/green_patent_values_best_match_within_report.csv", index=False)

# Delete report with year2 (overlapping)

In [48]:
import pandas as pd

In [49]:
report_all = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_all.csv")

In [50]:
report_all_without_overlap = report_all[report_all['year2'].isna()]
report_all_without_overlap.drop(columns=['year2'], inplace=True)
report_all_without_overlap.to_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_all_without_overlap.csv", index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_all_without_overlap.drop(columns=['year2'], inplace=True)


# Merge report and patent

## Merge report and forward, count

In [53]:
import pandas as pd
import numpy as np

In [54]:
report = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_company_without_overlap.csv")
forward_count_ticker = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count_ticker.csv")

In [55]:
report['ticker'] = report['ticker'].str.upper()
report

Unnamed: 0,folder_name,exchange_x,ticker,year,company_name,exchange_y
0,NASDAQ_AAL_2007,NASDAQ,AAL,2007,American Airlines Group,NAS
1,NASDAQ_AAL_2008,NASDAQ,AAL,2008,American Airlines Group,NAS
2,NASDAQ_AAL_2009,NASDAQ,AAL,2009,American Airlines Group,NAS
3,NASDAQ_AAL_2011,NASDAQ,AAL,2011,American Airlines Group,NAS
4,NASDAQ_AAL_2012,NASDAQ,AAL,2012,American Airlines Group,NAS
...,...,...,...,...,...,...
6053,NYSE_hpq_2017,NYSE,HPQ,2017,HP Inc,NYS
6054,NYSE_hpq_2018,NYSE,HPQ,2018,HP Inc,NYS
6055,NYSE_hpq_2019,NYSE,HPQ,2019,HP Inc,NYS
6056,NYSE_hpq_2020,NYSE,HPQ,2020,HP Inc,NYS


In [56]:
forward_count_ticker = forward_count_ticker.drop_duplicates()
forward_count_ticker['TICKER'] = forward_count_ticker['TICKER'].str.upper()
forward_count_ticker

Unnamed: 0,gvkey_numeric,filing_year,total_5yr_forward_citations,within_5_years,patents_count,GVKEY,PERMNO,CUSIP,TICKER
0,4839,2004,0,0.0,3,4839.0,25785.0,34537086,F
1,4839,2005,0,0.0,6,4839.0,25785.0,34537086,F
2,4839,2006,4,2.0,5,4839.0,25785.0,34537086,F
3,4839,2007,280,24.0,28,4839.0,25785.0,34537086,F
4,4839,2008,358,27.0,34,4839.0,25785.0,34537086,F
...,...,...,...,...,...,...,...,...,...
8760,23051,2014,3,1.0,1,23051.0,76393.0,60474510,MIRR
8761,65944,2011,0,0.0,1,65944.0,85686.0,04546C20,DPMD
8762,65944,2011,0,0.0,1,65944.0,85686.0,04546C20,DMI
8763,65944,2011,0,0.0,1,65944.0,85686.0,04546C20,DEPO


In [57]:
# 標準化 ticker 為大寫（確保一致）
report['ticker'] = report['ticker'].str.upper()
forward_count_ticker['TICKER'] = forward_count_ticker['TICKER'].str.upper()

# 取交集
common_tickers = set(report['ticker']).intersection(set(forward_count_ticker['TICKER']))

# 轉成 DataFrame 方便查看
common_ticker_df = pd.DataFrame(sorted(common_tickers), columns=['TICKER'])

# 顯示結果
print(f"✅ 共有 {len(common_ticker_df)} 個 ticker 同時出現在 report 和 forward_count_ticker 中")
print(common_ticker_df.head())


✅ 共有 392 個 ticker 同時出現在 report 和 forward_count_ticker 中
  TICKER
0      A
1     AA
2   AAPL
3   ABBV
4    ABT


In [58]:
# Step 1: 先合併（與原始邏輯一致）
report_forward_count = pd.merge(
    report,
    forward_count_ticker[['TICKER', 'filing_year', 'total_5yr_forward_citations', 'within_5_years', 'patents_count']],
    how='left',
    left_on=['ticker', 'year'],  # 根據你的欄位名稱應該是 year1
    right_on=['TICKER', 'filing_year']
)

# Step 2: 建立一個 ticker 的交集集合
common_tickers = set(report['ticker']).intersection(set(forward_count_ticker['TICKER']))

# Step 3: 建立條件：在交集內的 ticker，但數值為 NaN（表示年份沒對上）
mask = (
    report_forward_count['ticker'].isin(common_tickers) &
    report_forward_count['total_5yr_forward_citations'].isna()
)

# Step 4: 對這些條件補上 0
cols_to_fill = ['total_5yr_forward_citations', 'within_5_years', 'patents_count']
report_forward_count.loc[mask, cols_to_fill] = 0

# Step 5: 此時只補了你要補的，其他 NaN 不動，接著轉型時只針對非 NaN 行
for col in cols_to_fill:
    report_forward_count[col] = report_forward_count[col].apply(lambda x: int(x) if pd.notna(x) else np.nan)

print("✅ 已補上 forward 缺年份的 0，其他保持 NaN")

✅ 已補上 forward 缺年份的 0，其他保持 NaN


In [59]:
report_forward_count

Unnamed: 0,folder_name,exchange_x,ticker,year,company_name,exchange_y,TICKER,filing_year,total_5yr_forward_citations,within_5_years,patents_count
0,NASDAQ_AAL_2007,NASDAQ,AAL,2007,American Airlines Group,NAS,,,,,
1,NASDAQ_AAL_2008,NASDAQ,AAL,2008,American Airlines Group,NAS,,,,,
2,NASDAQ_AAL_2009,NASDAQ,AAL,2009,American Airlines Group,NAS,,,,,
3,NASDAQ_AAL_2011,NASDAQ,AAL,2011,American Airlines Group,NAS,,,,,
4,NASDAQ_AAL_2012,NASDAQ,AAL,2012,American Airlines Group,NAS,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
6105,NYSE_hpq_2017,NYSE,HPQ,2017,HP Inc,NYS,,,,,
6106,NYSE_hpq_2018,NYSE,HPQ,2018,HP Inc,NYS,,,,,
6107,NYSE_hpq_2019,NYSE,HPQ,2019,HP Inc,NYS,,,,,
6108,NYSE_hpq_2020,NYSE,HPQ,2020,HP Inc,NYS,,,,,


In [60]:
# Step 1: 合併 report 和 forward 資料
report_forward_count_status = pd.merge(
    report,
    forward_count_ticker[['TICKER', 'filing_year', 'total_5yr_forward_citations', 'within_5_years', 'patents_count']],
    how='left',
    left_on=['ticker', 'year'],
    right_on=['TICKER', 'filing_year']
)

# Step 2: 定義 ticker 的交集
common_tickers = set(report['ticker']).intersection(set(forward_count_ticker['TICKER']))

# Step 3: 建立 match_status 欄位
def classify_match(row):
    if pd.notna(row['total_5yr_forward_citations']):
        return 'full_match'
    elif row['ticker'] in common_tickers:
        return 'missing_year'
    else:
        return 'ticker_not_found'

report_forward_count_status['match_status'] = report_forward_count_status.apply(classify_match, axis=1)

# Step 4: 僅對 missing_year 的資料補 0
cols_to_fill = ['total_5yr_forward_citations', 'within_5_years', 'patents_count']
report_forward_count_status.loc[report_forward_count_status['match_status'] == 'missing_year', cols_to_fill] = 0

# Step 5: 將補過的值轉 int，其餘仍為 NaN
for col in cols_to_fill:
    report_forward_count_status[col] = report_forward_count_status[col].apply(lambda x: int(x) if pd.notna(x) else np.nan)

# Done!
print("✅ match_status 欄位建立完成，forward 資訊依狀況正確補值")
print(report_forward_count_status[['ticker', 'year', 'match_status']].value_counts())


✅ match_status 欄位建立完成，forward 資訊依狀況正確補值
ticker  year  match_status    
APTV    2019  missing_year        5
        2018  full_match          5
        2021  missing_year        5
        2022  missing_year        5
        2020  missing_year        5
                                 ..
EXR     2020  ticker_not_found    1
        2019  ticker_not_found    1
EXPD    2021  ticker_not_found    1
        2020  ticker_not_found    1
F       2018  full_match          1
Name: count, Length: 5909, dtype: int64


In [63]:
# print ticker_not_found
ticker_not_found = report_forward_count_status[report_forward_count_status['match_status'] == 'ticker_not_found']
ticker_not_found

Unnamed: 0,folder_name,exchange_x,ticker,year,company_name,exchange_y,TICKER,filing_year,total_5yr_forward_citations,within_5_years,patents_count,match_status
0,NASDAQ_AAL_2007,NASDAQ,AAL,2007,American Airlines Group,NAS,,,,,,ticker_not_found
1,NASDAQ_AAL_2008,NASDAQ,AAL,2008,American Airlines Group,NAS,,,,,,ticker_not_found
2,NASDAQ_AAL_2009,NASDAQ,AAL,2009,American Airlines Group,NAS,,,,,,ticker_not_found
3,NASDAQ_AAL_2011,NASDAQ,AAL,2011,American Airlines Group,NAS,,,,,,ticker_not_found
4,NASDAQ_AAL_2012,NASDAQ,AAL,2012,American Airlines Group,NAS,,,,,,ticker_not_found
...,...,...,...,...,...,...,...,...,...,...,...,...
6105,NYSE_hpq_2017,NYSE,HPQ,2017,HP Inc,NYS,,,,,,ticker_not_found
6106,NYSE_hpq_2018,NYSE,HPQ,2018,HP Inc,NYS,,,,,,ticker_not_found
6107,NYSE_hpq_2019,NYSE,HPQ,2019,HP Inc,NYS,,,,,,ticker_not_found
6108,NYSE_hpq_2020,NYSE,HPQ,2020,HP Inc,NYS,,,,,,ticker_not_found


In [64]:
report_forward_count_dropna = report_forward_count.dropna(subset=['total_5yr_forward_citations'])
report_forward_count_dropna.drop(columns=['filing_year', 'exchange_y', 'TICKER'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  report_forward_count_dropna.drop(columns=['filing_year', 'exchange_y', 'TICKER'], inplace=True)


In [66]:
report_forward_count_dropna.to_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_forward_count.csv", index=False)

## Count how many count company with report

In [67]:
import pandas as pd

In [68]:
forward_count_within = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count_ticker_within_report.csv")

In [69]:
len(forward_count_within['TICKER'].unique())

602

## forward_count_ticker 中有哪些 TICKER ，在 report 中沒有出現過

In [10]:
import pandas as pd

In [13]:
report_all = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_all.csv")
forward_count_ticker = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/total_5yr_forward_citations_current_count_ticker.csv")

In [15]:
forward_count_ticker['TICKER'].unique()

array(['F', 'QCOM', 'RHR', ..., 'DMI', 'DEPO', 'FITB'],
      shape=(2071,), dtype=object)

In [16]:
report_all['ticker'] = report_all['ticker'].str.upper()
report_all['ticker'].unique()

array(['AAL', 'AAON', 'AAPL', ..., 'ZMH', 'ZNH', 'HPQ'],
      shape=(1134,), dtype=object)

In [17]:
# 標準化 ticker 為大寫（確保一致）
report_all['ticker'] = report_all['ticker'].str.upper()
forward_count_ticker['TICKER'] = forward_count_ticker['TICKER'].str.upper()

# 取得各自唯一的 ticker 集合
report_all_tickers = set(report_all['ticker'].unique())
forward_tickers = set(forward_count_ticker['TICKER'].unique())

# 找出只存在於 forward_count_ticker 中的 ticker
missing_tickers = forward_tickers - report_all_tickers

# 轉成 DataFrame 顯示
missing_ticker_df = pd.DataFrame(sorted(missing_tickers), columns=['TICKER'])

# 顯示結果
print(f"✅ 有 {len(missing_ticker_df)} 個 ticker 僅出現在 forward_count_ticker 中")
print(missing_ticker_df.head())


✅ 有 1667 個 ticker 僅出現在 forward_count_ticker 中
  TICKER
0   AABA
1   AAII
2   AAOI
3    AAS
4   AATI


In [None]:
# # Step 1: 把 report 的 ticker 標準化成大寫
# report['ticker'] = report['ticker'].str.upper()

# # Step 2: 只取出報告中有的 (ticker, year)
# report_pairs = report[['ticker', 'year']].drop_duplicates().rename(
#     columns={'ticker': 'TICKER', 'year': 'filing_year'}
# )

# # Step 3: 用 outer merge 加上 indicator 來判斷來源
# merged = forward_count_ticker[['TICKER', 'filing_year']].drop_duplicates().merge(
#     report_pairs,
#     on=['TICKER', 'filing_year'],
#     how='left',
#     indicator=True
# )

# # Step 4: 篩選 _merge == 'left_only'，代表只有在 forward_count_ticker 中出現
# missing_in_report = merged[merged['_merge'] == 'left_only'][['TICKER', 'filing_year']].reset_index(drop=True)

# # 顯示結果
# print(f"✅ 有 {len(missing_in_report)} 筆 TICKER + year 組合在 forward_count_ticker 中但不在 report 中")
# print(missing_in_report.head())
# missing_in_report

✅ 有 7786 筆 TICKER + year 組合在 forward_count_ticker 中但不在 report 中
  TICKER  filing_year
0      F         2004
1      F         2005
2      F         2006
3      F         2007
4      F         2008


Unnamed: 0,TICKER,filing_year
0,F,2004
1,F,2005
2,F,2006
3,F,2007
4,F,2008
...,...,...
7781,MIRR,2014
7782,DPMD,2011
7783,DMI,2011
7784,DEPO,2011


In [70]:
import pandas as pd

In [71]:
report_count = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_forward_count.csv")

In [72]:
len(report_count['ticker'].unique())

392

In [73]:
report_count['year'].min()

np.int64(1998)

In [74]:
report_count['year'].max()

np.int64(2022)

In [75]:
report_count

Unnamed: 0,folder_name,exchange_x,ticker,year,company_name,total_5yr_forward_citations,within_5_years,patents_count
0,NASDAQ_AAPL_2014,NASDAQ,AAPL,2014,Apple Inc.,910.0,121.0,122.0
1,NASDAQ_AAPL_2015,NASDAQ,AAPL,2015,Apple Inc.,892.0,120.0,123.0
2,NASDAQ_AAPL_2016,NASDAQ,AAPL,2016,Apple Inc.,801.0,93.0,96.0
3,NASDAQ_AAPL_2017,NASDAQ,AAPL,2017,Apple Inc.,381.0,86.0,88.0
4,NASDAQ_AAPL_2018,NASDAQ,AAPL,2018,Apple Inc.,144.0,26.0,29.0
...,...,...,...,...,...,...,...,...
2546,NYSE_ZMH_2017,NYSE,ZMH,2017,Zimmer Holdings Inc.,0.0,0.0,0.0
2547,NYSE_ZMH_2018,NYSE,ZMH,2018,Zimmer Holdings Inc.,0.0,0.0,0.0
2548,NYSE_ZMH_2019,NYSE,ZMH,2019,Zimmer Holdings Inc.,0.0,0.0,0.0
2549,NYSE_ZMH_2020,NYSE,ZMH,2020,Zimmer Holdings Inc.,0.0,0.0,0.0


In [78]:
# 計算每家企業的資料筆數
company_counts = report_count['company_name'].value_counts()

# 計算平均
average_reports_per_company = company_counts.mean()

print(f"✅ 每家企業平均有 {average_reports_per_company:.2f} 筆資料")
print(company_counts.describe())

✅ 每家企業平均有 6.32 筆資料
count    396.000000
mean       6.315657
std        4.526586
min        1.000000
25%        3.000000
50%        5.000000
75%        8.000000
max       25.000000
Name: count, dtype: float64


In [79]:
# Step 1: 找出三個欄位全為 0 的公司名稱
all_zero_companies = report_count.groupby('company_name')[['total_5yr_forward_citations', 'within_5_years', 'patents_count']] \
    .apply(lambda g: ((g == 0) | (g.isna())).all().all())  # 檢查每一列、每個欄是否全為 0 或 NaN

# 篩出需移除的公司名稱
companies_to_remove = all_zero_companies[all_zero_companies].index

# Step 2: 將這些公司從原資料刪除
filtered_report_count = report_count[~report_count['company_name'].isin(companies_to_remove)]

# Step 3: 印出刪除前後差異
print(f"📊 原本共有 {len(report_count)} 筆資料")
print(f"❌ 移除掉 {len(report_count) - len(filtered_report_count)} 筆（來自 {len(companies_to_remove)} 家所有數值皆為 0 的公司）")
print(f"✅ 現在剩下 {len(filtered_report_count)} 筆資料")


📊 原本共有 2551 筆資料
❌ 移除掉 852 筆（來自 216 家所有數值皆為 0 的公司）
✅ 現在剩下 1699 筆資料


In [81]:
len(filtered_report_count['ticker'].unique())

181

In [90]:
company_counts_filtered = filtered_report_count['company_name'].value_counts()
company_counts_filtered.mean()

np.float64(9.161111111111111)

In [91]:
filtered_report_count

Unnamed: 0,folder_name,exchange_x,ticker,year,company_name,total_5yr_forward_citations,within_5_years,patents_count
0,NASDAQ_AAPL_2014,NASDAQ,AAPL,2014,Apple Inc.,910.0,121.0,122.0
1,NASDAQ_AAPL_2015,NASDAQ,AAPL,2015,Apple Inc.,892.0,120.0,123.0
2,NASDAQ_AAPL_2016,NASDAQ,AAPL,2016,Apple Inc.,801.0,93.0,96.0
3,NASDAQ_AAPL_2017,NASDAQ,AAPL,2017,Apple Inc.,381.0,86.0,88.0
4,NASDAQ_AAPL_2018,NASDAQ,AAPL,2018,Apple Inc.,144.0,26.0,29.0
...,...,...,...,...,...,...,...,...
2524,NYSE_XRX_2016,NYSE,XRX,2016,Xerox Corp,56.0,6.0,6.0
2525,NYSE_XRX_2017,NYSE,XRX,2017,Xerox Corp,0.0,0.0,0.0
2526,NYSE_XRX_2018,NYSE,XRX,2018,Xerox Corp,0.0,0.0,0.0
2527,NYSE_XRX_2019,NYSE,XRX,2019,Xerox Corp,0.0,0.0,0.0


# Merge embedding and patent

In [13]:
import pandas as pd

In [14]:
report_forward_count = pd.read_csv("/opt/hdd_1/research_hub/csr_project/CSR Reporting/Dataset/report_forward_count.csv")
csr_embeddings = pd.read_csv("/opt/hdd_1/research_hub/csr_project/output_dataset/csr_embeddings.csv")

In [19]:
csr_embeddings_forward_count = pd.merge(
    report_forward_count[['folder_name', 'ticker','total_5yr_forward_citations','within_5_years','patents_count']],
    csr_embeddings,
    how='left',
    left_on=['folder_name'],
    right_on=['file_name']
)
csr_embeddings_forward_count.drop(columns=['file_name'], inplace=True)

In [20]:
csr_embeddings_forward_count

Unnamed: 0,folder_name,ticker,total_5yr_forward_citations,within_5_years,patents_count,dim_0,dim_1,dim_2,dim_3,dim_4,...,dim_1014,dim_1015,dim_1016,dim_1017,dim_1018,dim_1019,dim_1020,dim_1021,dim_1022,dim_1023
0,NASDAQ_AAPL_2014,AAPL,910.0,121.0,122.0,-0.091830,0.293040,-0.179719,0.083607,-0.032453,...,-0.086017,-0.403820,0.282563,0.331965,0.290067,-0.223406,-0.325722,0.009091,-0.170071,0.163041
1,NASDAQ_AAPL_2015,AAPL,892.0,120.0,123.0,-0.085034,0.276596,-0.139275,0.068203,-0.015527,...,-0.090395,-0.406836,0.278167,0.320248,0.292318,-0.212523,-0.331629,-0.030166,-0.150414,0.180043
2,NASDAQ_AAPL_2016,AAPL,801.0,93.0,96.0,-0.067159,0.277524,-0.148442,0.092723,-0.000616,...,-0.109939,-0.416270,0.282748,0.334449,0.296767,-0.216122,-0.327691,-0.037364,-0.151454,0.178229
3,NASDAQ_AAPL_2017,AAPL,381.0,86.0,88.0,-0.053141,0.259755,-0.160064,0.098964,-0.006857,...,-0.110439,-0.411443,0.307153,0.298774,0.284501,-0.213012,-0.317366,-0.034005,-0.147262,0.175490
4,NASDAQ_AAPL_2018,AAPL,144.0,26.0,29.0,-0.070878,0.338528,-0.095608,0.093072,-0.027314,...,-0.099431,-0.410019,0.305583,0.368817,0.274357,-0.190008,-0.347579,-0.004540,-0.125971,0.124562
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2546,NYSE_ZMH_2017,ZMH,0.0,0.0,0.0,-0.169942,0.230924,-0.135710,-0.012833,-0.120696,...,-0.040701,-0.431815,0.291343,0.305337,0.356416,-0.140775,-0.306586,0.066598,-0.164039,0.153624
2547,NYSE_ZMH_2018,ZMH,0.0,0.0,0.0,-0.167569,0.257492,-0.142169,-0.026459,-0.135864,...,-0.072033,-0.459188,0.316238,0.344269,0.328685,-0.131830,-0.339339,0.074575,-0.160590,0.183089
2548,NYSE_ZMH_2019,ZMH,0.0,0.0,0.0,-0.154448,0.269216,-0.075729,-0.069002,-0.101010,...,0.015818,-0.427113,0.272172,0.297398,0.369410,-0.080663,-0.288597,-0.001568,-0.170422,0.153482
2549,NYSE_ZMH_2020,ZMH,0.0,0.0,0.0,-0.102546,0.298755,-0.095311,-0.073182,-0.098922,...,0.000124,-0.377405,0.339362,0.301602,0.366468,-0.075585,-0.339080,0.050823,-0.134400,0.196993


In [21]:
csr_embeddings_forward_count.to_csv("/opt/hdd_1/research_hub/csr_project/green_innovation_predcition/data/csr_embeddings_forward_count.csv", index=False)