In [32]:
import pandas as pd
import re
from pathlib import Path

# 1. Load metadata and keep the first few codes whose combined
#    code + description length falls strictly inside the window below.
MIN_TOTAL_LENGTH = 1000  # exclusive lower bound, in characters
MAX_TOTAL_LENGTH = 1200  # exclusive upper bound, in characters
MAX_CODES = 10           # cap on how many qualifying codes are kept

metadata = pd.read_json('metadata.json')
# total_length is the character count of `code` plus `description`; rows where
# either field is missing yield NaN and therefore fail both comparisons below.
metadata['total_length'] = metadata['code'].str.len() + metadata['description'].str.len()
in_window = (metadata['total_length'] > MIN_TOTAL_LENGTH) & (metadata['total_length'] < MAX_TOTAL_LENGTH)
long_codes = metadata[in_window].head(MAX_CODES)['code'].tolist()

# Build the message from the bounds actually applied: it previously claimed
# "< 1500" while the filter used "< 1200".
print(f"Found {len(long_codes)} codes with length > {MIN_TOTAL_LENGTH} and < {MAX_TOTAL_LENGTH} characters")
print("Long codes: ", long_codes)
# 2. Read the ESGish2 workbook and pull the query strings out of it.
#    Queries live in the 'Esgish' column; every cell is converted with astype(str).
#    (To restrict to short queries, filter on `.str.len() < 150` before converting.)
try:
    queries_df = pd.read_excel('ESGish2.xlsx')
    esgish_column = queries_df['Esgish']
    queries = esgish_column.astype(str).tolist()
    print(f"Loaded {len(queries)} queries from ESGish2.xlsx")
except Exception as exc:
    # Surface a readable message, then let the original error propagate so
    # the cell still stops here.
    print(f"Error loading ESGish2.xlsx: {exc}")
    raise

# 3. Find queries containing our long codes
#    Codes are matched as whole words, case-insensitively. `matched_codes`
#    holds the text exactly as it appeared in the query, and `match_count`
#    counts every occurrence (repeats included).
code_alternatives = '|'.join(re.escape(code) for code in long_codes)
if code_alternatives:
    code_pattern = re.compile(r'\b(' + code_alternatives + r')\b', re.IGNORECASE)
else:
    # With no codes the alternation would be empty, and r'\b()\b' matches the
    # empty string at every word boundary -- flagging every query as a hit.
    # An empty negative lookahead never matches, so the loop finds nothing.
    code_pattern = re.compile(r'(?!)')

matched_queries = []

for i, query in enumerate(queries):
    matches = code_pattern.findall(query)
    if matches:
        matched_queries.append({
            'query_number': i+1,  # 1-based position in `queries`
            'query_text': query.strip(),
            'matched_codes': ', '.join(matches),
            'match_count': len(matches)
        })

# 4. Create comprehensive results
#    Columns are listed explicitly so that an empty `matched_queries` still
#    produces a frame (and a CSV header) with the expected schema; without
#    this, a run with no matches writes a header-less file that cannot be
#    read back with pd.read_csv.
result_columns = ['query_number', 'query_text', 'matched_codes', 'match_count']
results = pd.DataFrame(matched_queries, columns=result_columns)
# NOTE: the file name encodes the filter settings by hand (length window and
# code cap); update it if those settings change.
results.to_csv('res_len_from_1000_to_1200_len(10).csv', index=False)


Found 10 codes with length > 1000 and < 1500 characters
Long codes:  ['AnimalWelfareInvolvement', 'EUTaxAfforestationOverallAlign', 'EUTaxAfforestationOverallAlignA', 'EUTaxAfforestationOverallAlignM', 'EUTaxAirTransGrOpRevOverAlignM', 'EUTaxAluminiumOverallAlign', 'EUTaxAluminiumOverallAlignM', 'EUTaxAnaerDigeBioWasOverallAlign', 'EUTaxAnaerDigeBioWasOverallAlignM', 'EUTaxAnaerDigeSewageOverallAlign']
Loaded 17036 queries from ESGish2.xlsx
