# Corporate Resilience Project - Automated Matching (FIXED)
## Notebook 2: Match TRI Facilities to CRSP Companies


In [None]:
# Setup
# Install the fuzzy string-matching packages into the runtime (-q keeps pip quiet).
# NOTE(review): the versions are not pinned, so a later run may pick up different
# releases and produce different match scores — consider pinning.
!pip install fuzzywuzzy python-Levenshtein -q
# Mount Google Drive; every path used later in the notebook lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import warnings
# Suppresses ALL Python warnings for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning) that
# could point at real problems in the cells below.
warnings.filterwarnings('ignore')

# Confirmation banner plus a start timestamp for judging how long the run takes.
print("✓ Libraries loaded")
print(f"Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m46.5 MB/s[0m eta [36m0:00:00[0m
[?25hMounted at /content/drive
✓ Libraries loaded
Started: 2025-12-04 13:19:18


## 1. Load Fixed Data

In [None]:
# Project folders on the mounted Drive; the processed outputs of Notebook 1 are read from here.
BASE_PATH = Path('/content/drive/MyDrive/Paper1_Dataset')
PROCESSED_PATH = BASE_PATH.joinpath('processed')

print("Loading processed data...")
print("=" * 80)

# --- TRI facility panel -------------------------------------------------------
tri_data = pd.read_parquet(PROCESSED_PATH.joinpath('tri_facilities_panel.parquet'))
n_tri_records = len(tri_data)
n_with_parent = tri_data['PARENT_COMPANY'].notna().sum()
print(f"✓ TRI loaded: {n_tri_records:,} records")
print(f"  Unique facilities: {tri_data['TRIFD'].nunique():,}")
print(f"  Unique companies: {tri_data['COMPANY_NAME_STD'].nunique():,}")
print(f"  With parent company: {n_with_parent:,} ({n_with_parent/n_tri_records*100:.1f}%)")

# --- CRSP company list --------------------------------------------------------
crsp_data = pd.read_parquet(PROCESSED_PATH.joinpath('crsp_companies.parquet'))
print(f"\n✓ CRSP loaded: {len(crsp_data):,} records")
print(f"  Unique companies: {crsp_data['COMNAM_STD'].nunique():,}")

print(f"\n{'=' * 80}")
print("✓ Data ready for matching!")

# Show the first ten distinct standardized names from each source as a sanity check.
for source_label, std_names in (("TRI", tri_data['COMPANY_NAME_STD']),
                                ("CRSP", crsp_data['COMNAM_STD'])):
    print(f"\nSample {source_label} companies:")
    print(std_names.drop_duplicates().head(10).tolist())

Loading processed data...
✓ TRI loaded: 1,148,673 records
  Unique facilities: 29,176
  Unique companies: 15,834
  With parent company: 815,002 (71.0%)

✓ CRSP loaded: 38,872 records
  Unique companies: 32,675

✓ Data ready for matching!

Sample TRI companies:
['OHIO CASTINGS', 'SOUTHEASTERN EXTRUSION AND TOOL', 'POLYTEK DEVELOPMENT', 'ALLEGAN METAL FINISHING', 'BASF', 'TITAN AMERICA', 'CMC STEEL FABRICATORS DBA CMC CAPITOL CITY STEEL', 'SURTECO NA JEANNETTE PA PLANT', 'EXXON MOBIL', 'OSHKOSH']

Sample CRSP companies:
['OPTIMUM MANUFACTURING', 'GAS NATURAL', 'BANCTRUST FINANCIAL GROUP', 'GREAT COUNTRY BK ASONIA CT', 'CLOSE OUTS PLUS', 'WESTERN ENERGY RESOURCES', 'A C F INDUSTRIES', 'SHAREDATA', 'GARDENAMERICA', 'IROQUOIS BANCORP']


## 2. Verify County FIPS Codes (crosswalk built only as a fallback)

In [None]:
print(f"\n{'=' * 80}")
print("SECTION 2: VERIFY FIPS CODES")
print("=" * 80)

# Notebook 1 is expected to have attached a FIPS column already; only build one here if it is absent.
if 'FIPS' not in tri_data.columns:
    print("\n⚠️  FIPS NOT FOUND IN TRI DATA")
    print("   This should not happen if Notebook 1 ran successfully.")
    print("   Creating FIPS now as fallback...\n")

    # Not referenced below (pandas fetches the URL itself); kept so the cell's namespace is unchanged.
    import urllib.request

    url = "https://www2.census.gov/geo/docs/reference/codes/files/national_county.txt"

    # Improved standardization (same as Notebook 1)
    def standardize_county(name):
        """Reduce a county name to an upper-case, punctuation-free join key.

        Missing or empty input yields ''. Otherwise the name is upper-cased,
        the abbreviations ST./ST /MT./FT. are expanded, spaces and the
        characters . - ' , are removed, and a trailing COUNTY, PARISH or
        BOROUGH is dropped.
        """
        if pd.isna(name) or name == '':
            return ''
        key = str(name).upper().strip()
        # Expand abbreviations in this fixed order (plain substring replacement).
        for short, long_form in (('ST.', 'SAINT'), ('ST ', 'SAINT '),
                                 ('MT.', 'MOUNT'), ('FT.', 'FORT')):
            key = key.replace(short, long_form)
        # Drop spaces and punctuation in one pass.
        key = key.translate(str.maketrans('', '', " .-',"))
        # Suffixes are checked in sequence, each against the already-trimmed key.
        for ending in ('COUNTY', 'PARISH', 'BOROUGH'):
            if key.endswith(ending):
                key = key[:-len(ending)]
        return key.strip()

    try:
        county_data = pd.read_csv(
            url,
            encoding='latin-1',
            names=['STATE_ABBR', 'STATE_FIPS', 'COUNTY_FIPS', 'COUNTY_NAME', 'CLASS'],
        )

        # 5-character FIPS = zero-padded 2-digit state code + zero-padded 3-digit county code.
        state_part = county_data['STATE_FIPS'].astype(str).str.zfill(2)
        county_part = county_data['COUNTY_FIPS'].astype(str).str.zfill(3)
        county_data['FIPS'] = state_part + county_part

        tri_data['FACILITY_COUNTY_STD'] = tri_data['FACILITY_COUNTY'].apply(standardize_county)
        county_data['COUNTY_NAME_STD'] = county_data['COUNTY_NAME'].apply(standardize_county)

        # Attach FIPS by (state, standardized county name); unmatched facilities keep NaN.
        tri_data = tri_data.merge(
            county_data[['STATE_ABBR', 'COUNTY_NAME_STD', 'FIPS']],
            how='left',
            left_on=['STATE_ABBR', 'FACILITY_COUNTY_STD'],
            right_on=['STATE_ABBR', 'COUNTY_NAME_STD'],
        )

        matched = tri_data['FIPS'].notna().sum()
        print(f"✓ FIPS Created: {matched:,} / {len(tri_data):,} ({matched/len(tri_data)*100:.1f}%)")

    except Exception as e:
        # Best effort: report the failure and continue without FIPS codes.
        print(f"✗ Error creating FIPS: {e}")

else:
    n_with_fips = tri_data['FIPS'].notna().sum()
    pct_with_fips = n_with_fips / len(tri_data) * 100

    print("\n✓ FIPS ALREADY PRESENT FROM NOTEBOOK 1:")
    print(f"  Total records: {len(tri_data):,}")
    print(f"  With FIPS: {n_with_fips:,}")
    print(f"  Match rate: {pct_with_fips:.1f}%")

    # Grade the coverage: 70%+ excellent, 50%+ good, anything else flagged as low.
    if pct_with_fips >= 70:
        verdict = "  ✓ EXCELLENT - Using existing FIPS codes!"
    elif pct_with_fips >= 50:
        verdict = "  ✓ GOOD - Using existing FIPS codes"
    else:
        verdict = "  ⚠️  LOW match rate from Notebook 1"
    print(verdict)

    print("\n  Sample FIPS codes:")
    has_fips = tri_data['FIPS'].notna()
    fips_preview = (
        tri_data.loc[has_fips, ['STATE_ABBR', 'FACILITY_COUNTY', 'FIPS']]
        .drop_duplicates()
        .head(10)
    )
    print(fips_preview.to_string(index=False))

    print("\n  Skipping FIPS creation - using Notebook 1 output.")

print(f"\n{'=' * 80}")



SECTION 2: VERIFY FIPS CODES

✓ FIPS ALREADY PRESENT FROM NOTEBOOK 1:
  Total records: 1,148,673
  With FIPS: 1,136,098
  Match rate: 98.9%
  ✓ EXCELLENT - Using existing FIPS codes!

  Sample FIPS codes:
STATE_ABBR         FACILITY_COUNTY  FIPS
        OH                   STARK 39151
        AL              LAUDERDALE 01077
        PA             NORTHAMPTON 42095
        MI                 ALLEGAN 26005
        CA               RIVERSIDE 06065
        FL                 BREVARD 12009
        TX                    HAYS 48209
        PA            WESTMORELAND 42129
        LA EAST BATON ROUGE PARISH 22033
        IA                  HOWARD 19089

  Skipping FIPS creation - using Notebook 1 output.



## 3. Prepare Matching Datasets

In [None]:
print("\n" + "="*80)
print("PREPARING MATCHING DATASETS")
print("="*80)

# TRI side: one row per distinct (raw name, standardized name, state) combination.
# The same standardized name can therefore appear on several rows.
tri_companies = tri_data[['COMPANY_NAME', 'COMPANY_NAME_STD', 'STATE_ABBR']].drop_duplicates()
# .copy() makes this an independent frame, so the ID assignment below does not
# write into a slice of the de-duplicated frame (SettingWithCopyWarning).
tri_companies = tri_companies[
    tri_companies['COMPANY_NAME_STD'].notna() & (tri_companies['COMPANY_NAME_STD'] != '')
].copy()
# Running identifier used by every later stage to track which rows are matched.
tri_companies['TRI_COMPANY_ID'] = range(len(tri_companies))

print(f"\nTRI Companies:")
print(f"  Unique companies: {len(tri_companies):,}")
print(f"  With state info: {tri_companies['STATE_ABBR'].notna().sum():,}")

# CRSP side: keep the first record for each standardized name.
crsp_companies = crsp_data[['COMNAM', 'COMNAM_STD', 'PERMNO', 'TICKER', 'CUSIP9']].drop_duplicates(subset=['COMNAM_STD'])
# Reset to a clean 0..n-1 index. The fuzzy stages look rows up with .iloc via
# a name -> index mapping, which is only valid when labels equal positions;
# drop_duplicates and the filter would otherwise leave gaps in the index.
crsp_companies = crsp_companies[
    crsp_companies['COMNAM_STD'].notna() & (crsp_companies['COMNAM_STD'] != '')
].reset_index(drop=True)

print(f"\nCRSP Companies:")
print(f"  Unique companies: {len(crsp_companies):,}")
print(f"  With CUSIP: {crsp_companies['CUSIP9'].notna().sum():,}")

print("\n" + "="*80)


PREPARING MATCHING DATASETS

TRI Companies:
  Unique companies: 24,303
  With state info: 24,303

CRSP Companies:
  Unique companies: 32,675
  With CUSIP: 32,675



## 4. Stage 1: Exact Matching

In [None]:
print(f"\n{'=' * 80}")
print("STAGE 1: EXACT NAME MATCHING")
print("=" * 80)

# An exact match is an equality join on the standardized names; every hit is scored 100.
exact_matches = pd.merge(
    tri_companies,
    crsp_companies,
    how='inner',
    left_on='COMPANY_NAME_STD',
    right_on='COMNAM_STD',
).assign(MATCH_SCORE=100, MATCH_TYPE='EXACT')

n_exact = len(exact_matches)
print(f"\n✓ Exact matches: {n_exact:,}")
print(f"  % of TRI companies: {n_exact/len(tri_companies)*100:.1f}%")

print("\nSample exact matches:")
exact_preview = exact_matches[['COMPANY_NAME', 'COMNAM', 'MATCH_SCORE']].head(20)
print(exact_preview.to_string(index=False))

print("=" * 80)
print("EMERGENCY FIX: Resetting CRSP companies index")
print("=" * 80)

# Give the CRSP frame a contiguous 0..n-1 index so the labels stored in the
# lookup below can be used as row positions with .iloc (fixes the IndexError).
crsp_companies = crsp_companies.reset_index(drop=True)

# Candidate name list for the fuzzy matcher, plus name -> row index for retrieval.
crsp_names = list(crsp_companies['COMNAM_STD'])
crsp_lookup = {std_name: row_idx for row_idx, std_name in crsp_companies['COMNAM_STD'].items()}

first_idx, last_idx = crsp_companies.index.min(), crsp_companies.index.max()
print(f"✓ CRSP companies: {len(crsp_companies):,}")
print(f"✓ Index range: {first_idx} to {last_idx}")
print(f"✓ Lookup entries: {len(crsp_lookup):,}")
print("=" * 80)


STAGE 1: EXACT NAME MATCHING

✓ Exact matches: 3,289
  % of TRI companies: 13.5%

Sample exact matches:
                    COMPANY_NAME                           COMNAM  MATCH_SCORE
                EXXON MOBIL CORP                 EXXON MOBIL CORP          100
                    OSHKOSH CORP                     OSHKOSH CORP          100
                      CUBIC CORP                       CUBIC CORP          100
       ARCHER DANIELS MIDLAND CO        ARCHER DANIELS MIDLAND CO          100
                     PEPSICO INC                      PEPSICO INC          100
            PARKER HANNIFIN CORP             PARKER HANNIFIN CORP          100
         INTERTAPE POLYMER GROUP      INTERTAPE POLYMER GROUP INC          100
     HONEYWELL INTERNATIONAL INC      HONEYWELL INTERNATIONAL INC          100
                    LA-Z-BOY INC                     LA Z BOY INC          100
               ASCENT INDUSTRIES             ASCENT INDUSTRIES CO          100
                   AMPHENO

## 5. Stage 2: High-Confidence Fuzzy Matching (≥90%)

In [None]:
print("\n" + "="*80)
print("STAGE 2: HIGH-CONFIDENCE FUZZY MATCHING (≥90%)")
print("="*80)

# Get unmatched TRI companies (everything Stage 1 did not resolve)
if len(exact_matches) > 0:
    matched_tri_ids = set(exact_matches['TRI_COMPANY_ID'])
    unmatched_tri = tri_companies[~tri_companies['TRI_COMPANY_ID'].isin(matched_tri_ids)]
else:
    unmatched_tri = tri_companies.copy()

print(f"\nTRI companies to match: {len(unmatched_tri):,}")
print(f"CRSP companies available: {len(crsp_companies):,}")

# Create CRSP name list for fuzzy matching.
crsp_names = crsp_companies['COMNAM_STD'].tolist()
# Map each name to its row POSITION (not its index label): rows are fetched
# with .iloc below, which is positional, so this stays correct even when the
# CRSP frame does not carry a clean 0..n-1 index.
crsp_lookup = {name: pos for pos, name in enumerate(crsp_names)}

# Fuzzy match
print("\nRunning fuzzy matching (this may take several minutes)...")

fuzzy_matches = []
batch_size = 5000

# The same standardized name occurs on several TRI rows (different states or
# raw spellings). Scoring against the whole CRSP list is the expensive step,
# so it is done once per distinct name and reused.
best_match_by_name = {}

for i in range(0, len(unmatched_tri), batch_size):
    batch = unmatched_tri.iloc[i:i+batch_size]
    print(f"  Processing {i:,} / {len(unmatched_tri):,}...")

    for _, tri_row in batch.iterrows():
        tri_name = tri_row['COMPANY_NAME_STD']
        if not tri_name:
            continue

        # Get best match (cached per distinct name)
        if tri_name not in best_match_by_name:
            best_match_by_name[tri_name] = process.extractOne(
                tri_name, crsp_names, scorer=fuzz.token_sort_ratio
            )
        best_match = best_match_by_name[tri_name]

        # Only scores of 90+ count as high confidence; lower scores are left for Stage 3.
        if best_match and best_match[1] >= 90:
            crsp_name = best_match[0]
            score = best_match[1]
            crsp_row = crsp_companies.iloc[crsp_lookup[crsp_name]]

            fuzzy_matches.append({
                'TRI_COMPANY_ID': tri_row['TRI_COMPANY_ID'],
                'COMPANY_NAME': tri_row['COMPANY_NAME'],
                'COMPANY_NAME_STD': tri_name,
                'COMNAM': crsp_row['COMNAM'],
                'COMNAM_STD': crsp_name,
                'PERMNO': crsp_row['PERMNO'],
                'TICKER': crsp_row['TICKER'],
                'CUSIP9': crsp_row['CUSIP9'],
                'MATCH_SCORE': score,
                'MATCH_TYPE': 'FUZZY_HIGH'
            })

# Build the frame once from the collected records.
fuzzy_high_df = pd.DataFrame(fuzzy_matches)

print(f"\n✓ High-confidence fuzzy matches: {len(fuzzy_high_df):,}")
if len(fuzzy_high_df) > 0:
    print(f"  Average score: {fuzzy_high_df['MATCH_SCORE'].mean():.1f}")
    print("\nScore distribution:")
    print(f"  95-100: {(fuzzy_high_df['MATCH_SCORE']>=95).sum():,}")
    print(f"  90-94:  {((fuzzy_high_df['MATCH_SCORE']>=90) & (fuzzy_high_df['MATCH_SCORE']<95)).sum():,}")

    print("\nSample fuzzy matches:")
    print(fuzzy_high_df[['COMPANY_NAME_STD', 'COMNAM_STD', 'MATCH_SCORE']].head(20).to_string(index=False))


STAGE 2: HIGH-CONFIDENCE FUZZY MATCHING (≥90%)

TRI companies to match: 21,014
CRSP companies available: 32,675

Running fuzzy matching (this may take several minutes)...
  Processing 0 / 21,014...
  Processing 5,000 / 21,014...
  Processing 10,000 / 21,014...
  Processing 15,000 / 21,014...
  Processing 20,000 / 21,014...

✓ High-confidence fuzzy matches: 1,040
  Average score: 92.0

Score distribution:
  95-100: 125
  90-94:  915

Sample fuzzy matches:
                    COMPANY_NAME_STD                     COMNAM_STD  MATCH_SCORE
                  CUL MAC INDUSTRIES               C MAC INDUSTRIES           94
                      MCP INDUSTRIES                 P M INDUSTRIES           93
                ISOTEC INTERNATIONAL            ORTEC INTERNATIONAL           92
                   ESCO TECHNOLOGIES             SCOTT TECHNOLOGIES           91
                         XCEL ENERGY                   EXCEL ENERGY           96
                   REVERE INDUSTRIES             RESER

## 6. Combine High-Confidence Matches

In [None]:
print(f"\n{'=' * 80}")
print("COMBINING HIGH-CONFIDENCE MATCHES")
print("=" * 80)

# Stack the exact and fuzzy (>=90) matches into a single table with a fresh 0..n-1 index.
high_confidence_parts = [exact_matches, fuzzy_high_df]
all_high_confidence = pd.concat(high_confidence_parts, ignore_index=True)

n_high = len(all_high_confidence)
print(f"\nTotal high-confidence matches: {n_high:,}")
print(f"  Exact matches: {len(exact_matches):,}")
print(f"  Fuzzy matches: {len(fuzzy_high_df):,}")
print(f"\nMatch rate: {n_high/len(tri_companies)*100:.1f}% of TRI companies")

# Persist the combined table alongside the other processed files.
output_file = PROCESSED_PATH / 'matches_high_confidence.csv'
all_high_confidence.to_csv(output_file, index=False)
print(f"\n✓ Saved: {output_file}")


COMBINING HIGH-CONFIDENCE MATCHES

Total high-confidence matches: 4,329
  Exact matches: 3,289
  Fuzzy matches: 1,040

Match rate: 17.8% of TRI companies

✓ Saved: /content/drive/MyDrive/Paper1_Dataset/processed/matches_high_confidence.csv


## 7. Stage 3: Medium-Confidence Matches (70-89%)

In [None]:
print("\n" + "="*80)
print("STAGE 3: MEDIUM-CONFIDENCE MATCHING (70-89%)")
print("="*80)

# Get remaining unmatched (everything not resolved by the exact or >=90 fuzzy stages)
matched_tri_ids = set(all_high_confidence['TRI_COMPANY_ID'])
still_unmatched = tri_companies[~tri_companies['TRI_COMPANY_ID'].isin(matched_tri_ids)]

print(f"\nRemaining TRI companies: {len(still_unmatched):,}")
print("Searching for medium-confidence matches...")

# crsp_names and batch_size come from the Stage 2 cell. Build the
# name -> row POSITION map here from that same list, so the .iloc lookups
# below are correct regardless of what index labels the CRSP frame carries.
crsp_position_by_name = {name: pos for pos, name in enumerate(crsp_names)}

medium_matches = []

# The same standardized name occurs on several TRI rows; score each distinct
# name against the CRSP list only once and reuse the result.
best_match_by_name = {}

for i in range(0, len(still_unmatched), batch_size):
    batch = still_unmatched.iloc[i:i+batch_size]
    print(f"  Processing {i:,} / {len(still_unmatched):,}...")

    for _, tri_row in batch.iterrows():
        tri_name = tri_row['COMPANY_NAME_STD']
        if not tri_name:
            continue

        if tri_name not in best_match_by_name:
            best_match_by_name[tri_name] = process.extractOne(
                tri_name, crsp_names, scorer=fuzz.token_sort_ratio
            )
        best_match = best_match_by_name[tri_name]

        # Medium band: best score at least 70 but below the Stage 2 cut-off of 90.
        if best_match and 70 <= best_match[1] < 90:
            crsp_name = best_match[0]
            score = best_match[1]
            crsp_row = crsp_companies.iloc[crsp_position_by_name[crsp_name]]

            medium_matches.append({
                'TRI_COMPANY_ID': tri_row['TRI_COMPANY_ID'],
                'COMPANY_NAME': tri_row['COMPANY_NAME'],
                'COMPANY_NAME_STD': tri_name,
                'COMNAM': crsp_row['COMNAM'],
                'COMNAM_STD': crsp_name,
                'PERMNO': crsp_row['PERMNO'],
                'TICKER': crsp_row['TICKER'],
                'CUSIP9': crsp_row['CUSIP9'],
                'MATCH_SCORE': score,
                'MATCH_TYPE': 'FUZZY_MEDIUM'
            })

# Build the frame once from the collected records.
medium_df = pd.DataFrame(medium_matches)

print(f"\n✓ Medium-confidence matches: {len(medium_df):,}")
if len(medium_df) > 0:
    print(f"  Average score: {medium_df['MATCH_SCORE'].mean():.1f}")
    print("\nSample (REQUIRE MANUAL REVIEW):")
    print(medium_df[['COMPANY_NAME_STD', 'COMNAM_STD', 'MATCH_SCORE']].head(20).to_string(index=False))

    # Save (only when there is something to review)
    output_file = PROCESSED_PATH / 'matches_medium_confidence.csv'
    medium_df.to_csv(output_file, index=False)
    print(f"\n✓ Saved: {output_file}")


STAGE 3: MEDIUM-CONFIDENCE MATCHING (70-89%)

Remaining TRI companies: 19,974
Searching for medium-confidence matches...
  Processing 0 / 19,974...
  Processing 5,000 / 19,974...
  Processing 10,000 / 19,974...
  Processing 15,000 / 19,974...

✓ Medium-confidence matches: 12,054
  Average score: 78.0

Sample (REQUIRE MANUAL REVIEW):
           COMPANY_NAME_STD                        COMNAM_STD  MATCH_SCORE
              OHIO CASTINGS                 NATIONAL CASTINGS           73
        POLYTEK DEVELOPMENT                PLACER DEVELOPMENT           81
              TITAN AMERICA                     TITAN MEDICAL           85
                  3G MERMET                             MERET           71
                HF SINCLAIR                      H F SINCLAIR           87
          RIO TINTO AMERICA              BRITISH AMERICAN TOB           76
             SOLVAY HOLDING                   SOLUNA HOLDINGS           83
     WORTHINGTON INDUSTRIES                  ORION INDUSTRIES   

## 8. Identify Unmatched Companies

In [None]:
print(f"\n{'=' * 80}")
print("IDENTIFYING UNMATCHED COMPANIES")
print("=" * 80)

# IDs resolved by any stage. medium_df has no columns at all when Stage 3
# found nothing, so it is only consulted when it is non-empty.
all_matched_ids = set(all_high_confidence['TRI_COMPANY_ID'])
if len(medium_df) > 0:
    all_matched_ids = all_matched_ids | set(medium_df['TRI_COMPANY_ID'])

is_matched = tri_companies['TRI_COMPANY_ID'].isin(all_matched_ids)
unmatched = tri_companies[~is_matched]

n_unmatched = len(unmatched)
print(f"\nUnmatched TRI companies: {n_unmatched:,}")
print(f"  % of total: {n_unmatched/len(tri_companies)*100:.1f}%")

# Write the leftovers out for inspection.
output_file = PROCESSED_PATH / 'unmatched_facilities.csv'
unmatched.to_csv(output_file, index=False)
print(f"\n✓ Saved: {output_file}")

print("\nSample unmatched:")
unmatched_preview = unmatched[['COMPANY_NAME', 'STATE_ABBR']].head(20)
print(unmatched_preview.to_string(index=False))

print("\n⚠️  Note: Many unmatched are private companies not in CRSP")


IDENTIFYING UNMATCHED COMPANIES

Unmatched TRI companies: 7,920
  % of total: 32.6%

✓ Saved: /content/drive/MyDrive/Paper1_Dataset/processed/unmatched_facilities.csv

Sample unmatched:
                                         COMPANY_NAME STATE_ABBR
                    SOUTHEASTERN EXTRUSION & TOOL INC         AL
                           ALLEGAN METAL FINISHING CO         MI
                                            BASF CORP         CA
CMC STEEL FABRICATORS INC. DBA CMC CAPITOL CITY STEEL         TX
                 SURTECO N.A. INC. JEANNETTE PA PLANT         PA
                          HEIDELBERG MATERIALS US INC         AL
                           ROCHESTER PUBLIC UTILITIES         MN
                     AIR LIQUIDE ADVANCED SEPARATIONS         DE
                            CURTIS METAL FINISHING CO         MI
                                  HARTSVILLE OIL MILL         SC
                             CHEVRON PHOENIX TERMINAL         AZ
                  ASSOCIATED ELEC

## 9. Generate Summary Report

In [None]:
print("\n" + "="*80)
print("MATCHING SUMMARY REPORT")
print("="*80)

# All percentages are relative to the number of TRI company rows.
total_tri = len(tri_companies)

# The report is assembled as a list of lines and joined once at the end.
report = [
    "="*80,
    "COMPANY MATCHING SUMMARY REPORT",
    f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "="*80,
    "",
    "OVERALL STATISTICS",
    "-"*80,
    f"Total TRI companies: {total_tri:,}",
    f"Total CRSP companies: {len(crsp_companies):,}",
    "",
    "MATCH BREAKDOWN",
    "-"*80,
    f"High-confidence (≥90%): {len(all_high_confidence):,} ({len(all_high_confidence)/total_tri*100:.1f}%)",
    f"  - Exact: {len(exact_matches):,}",
    f"  - Fuzzy: {len(fuzzy_high_df):,}",
    "",
    f"Medium-confidence (70-89%): {len(medium_df):,} ({len(medium_df)/total_tri*100:.1f}%)",
    "  ⚠️  REQUIRES MANUAL REVIEW",
    "",
    f"Unmatched (<70%): {len(unmatched):,} ({len(unmatched)/total_tri*100:.1f}%)",
    "  Note: Many are private companies not in CRSP",
    "",
    "="*80,
    "NEXT STEPS",
    "-"*80,
    "1. Review medium-confidence matches",
    "2. Accept/reject each match",
    "3. Create final TRI-CRSP crosswalk",
    "="*80,
]

report_text = "\n".join(report)
print(report_text)

# Save. The text contains non-ASCII characters ("≥", "⚠️"), so the encoding is
# stated explicitly rather than left to the platform default, which cannot
# represent them on every system.
report_file = PROCESSED_PATH / 'matching_summary_report.txt'
with open(report_file, 'w', encoding='utf-8') as f:
    f.write(report_text)
print(f"\n✓ Saved: {report_file}")


MATCHING SUMMARY REPORT
COMPANY MATCHING SUMMARY REPORT
Generated: 2025-12-04 16:54:14

OVERALL STATISTICS
--------------------------------------------------------------------------------
Total TRI companies: 24,303
Total CRSP companies: 32,675

MATCH BREAKDOWN
--------------------------------------------------------------------------------
High-confidence (≥90%): 4,329 (17.8%)
  - Exact: 3,289
  - Fuzzy: 1,040

Medium-confidence (70-89%): 12,054 (49.6%)
  ⚠️  REQUIRES MANUAL REVIEW

Unmatched (<70%): 7,920 (32.6%)
  Note: Many are private companies not in CRSP

NEXT STEPS
--------------------------------------------------------------------------------
1. Review medium-confidence matches
2. Accept/reject each match
3. Create final TRI-CRSP crosswalk

✓ Saved: /content/drive/MyDrive/Paper1_Dataset/processed/matching_summary_report.txt


## 10. Create Final Output File

In [None]:
print("\n" + "="*80)
print("CREATING FINAL TRI DATA WITH MATCHES")
print("="*80)

# Combine all matches.
# NOTE: this includes the medium-confidence (70-89) fuzzy matches, which have
# not been manually reviewed; filter on MATCH_TYPE / MATCH_SCORE downstream.
all_matches = pd.concat([all_high_confidence, medium_df], ignore_index=True) if len(medium_df) > 0 else all_high_confidence

# TRI_COMPANY_ID numbers the distinct (COMPANY_NAME, COMPANY_NAME_STD,
# STATE_ABBR) rows of tri_companies. It is NOT the group number of
# COMPANY_NAME_STD within tri_data, so joining on groupby(...).ngroup() would
# attach matches to the wrong facilities. Translate each ID back to the
# columns that define it, then join the facility records on those columns.
tri_key_cols = ['COMPANY_NAME', 'COMPANY_NAME_STD', 'STATE_ABBR']
match_cols = ['TRI_COMPANY_ID', 'PERMNO', 'TICKER', 'CUSIP9', 'MATCH_SCORE', 'MATCH_TYPE']

company_matches = tri_companies[tri_key_cols + ['TRI_COMPANY_ID']].merge(
    all_matches[match_cols],
    on='TRI_COMPANY_ID',
    how='inner',
    validate='one_to_one'   # each TRI company row carries at most one match
)

# Merge back to TRI data. many_to_one guarantees the facility panel keeps its
# row count: many facility records per company row, never the reverse.
tri_with_matches = tri_data.merge(
    company_matches,
    on=tri_key_cols,
    how='left',
    validate='many_to_one'
)

print(f"\nFinal dataset:")
print(f"  Total records: {len(tri_with_matches):,}")
print(f"  Records with matches: {tri_with_matches['PERMNO'].notna().sum():,}")
print(f"  Match rate: {tri_with_matches['PERMNO'].notna().sum()/len(tri_with_matches)*100:.1f}%")

# Save
output_file = PROCESSED_PATH / 'tri_facilities_with_matches.parquet'
tri_with_matches.to_parquet(output_file, index=False)
print(f"\n✓ Saved: {output_file}")

print("\n" + "="*80)
print("✅ MATCHING COMPLETE!")
print("="*80)
# Only files this notebook actually writes are listed; it never writes a
# county FIPS crosswalk file, so that entry is not included.
print("\nFiles created:")
print("  1. matches_high_confidence.csv")
print("  2. matches_medium_confidence.csv")
print("  3. unmatched_facilities.csv")
print("  4. tri_facilities_with_matches.parquet")
print("  5. matching_summary_report.txt")


CREATING FINAL TRI DATA WITH MATCHES

Final dataset:
  Total records: 1,148,673
  Records with matches: 790,414
  Match rate: 68.8%

✓ Saved: /content/drive/MyDrive/Paper1_Dataset/processed/tri_facilities_with_matches.parquet

✅ MATCHING COMPLETE!

Files created:
  1. matches_high_confidence.csv
  2. matches_medium_confidence.csv
  3. unmatched_facilities.csv
  4. tri_facilities_with_matches.parquet
  5. matching_summary_report.txt
  6. county_fips_crosswalk.csv
