In [97]:
import pandas as pd


cause_df = pd.read_csv('causes_with_jel.txt', header=None, names=['cause'], sep='\t')
effect_df = pd.read_csv('effects_with_jel.txt', header=None, names=['effect'], sep='\t')

#Make the first 11 chracters a seperate column
cause_df['jel_cause'] = cause_df['cause'].str[:11]
effect_df['jel_effect'] = effect_df['effect'].str[:11]

#Remove the first 5 characters (JEL: )
cause_df['jel_cause'] = cause_df['jel_cause'].str[5:]
effect_df['jel_effect'] = effect_df['jel_effect'].str[5:]

#Remove the first 11 characters (the JEL code and a space)
cause_df['cause'] = cause_df['cause'].str[11:]
effect_df['effect'] = effect_df['effect'].str[11:]

# Lowercase the text and remove special characters (other than parentheses)
cause_df['cause'] = cause_df['cause'].str.lower().str.replace(r'[^a-z0-9\s\(\)]', '', regex=True)
effect_df['effect'] = effect_df['effect'].str.lower().str.replace(r'[^a-z0-9\s\(\)]', '', regex=True)



print("--- Causes DataFrame ---")
print(cause_df.head())

print("\n--- Effects DataFrame ---")
print(effect_df.head())

--- Causes DataFrame ---
                                               cause jel_cause
0  job mobility (labor and demographic economics ...    J62 - 
1  specific training (business administration and...    M53 - 
2  job duration (mathematical and quantitative me...    C41 - 
3  income level (microeconomics general distribut...    D31 - 
4  higher income (microeconomics general distribu...    D31 - 

--- Effects DataFrame ---
                                              effect jel_effect
0  earnings growth (economic development innovati...     O49 - 
1  earnings growth (for less educated individuals...     I26 - 
2  job mobility and earnings growth (labor and de...     J62 - 
3  larger total earnings growth (labor and demogr...     J39 - 
4  propensity to live alone (urban rural regional...     R21 - 


In [98]:
# check to see if there are any apostrophe anomalies, such as the letter s as an entire word
print("\n--- Checking for apostrophe anomalies in Causes ---")
print(cause_df[cause_df['cause'].str.contains(r"\b's\b", regex=True)])
print("\n--- Checking for apostrophe anomalies in Effects ---")
print(effect_df[effect_df['effect'].str.contains(r"\b's\b", regex=True)])


--- Checking for apostrophe anomalies in Causes ---
Empty DataFrame
Columns: [cause, jel_cause]
Index: []

--- Checking for apostrophe anomalies in Effects ---
Empty DataFrame
Columns: [effect, jel_effect]
Index: []


In [99]:
#Tokenize the text using NLTK
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
cause_df['cause_tokens'] = cause_df['cause'].apply(word_tokenize)
effect_df['effect_tokens'] = effect_df['effect'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /home/etienne/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [100]:
#Use SentenceTransformer to generate embeddings for each the cause and effect.
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('BAAI/bge-large-en-v1.5')
cause_df['cause_embedding'] = cause_df['cause'].apply(lambda x: model.encode(x).tolist())
effect_df['effect_embedding'] = effect_df['effect'].apply(lambda x: model.encode(x).tolist())
print("\n--- Causes with Embeddings ---")
print(cause_df.head())
print("\n--- Effects with Embeddings ---")
print(effect_df.head())


--- Causes with Embeddings ---
                                               cause jel_cause  \
0  job mobility (labor and demographic economics ...    J62 -    
1  specific training (business administration and...    M53 -    
2  job duration (mathematical and quantitative me...    C41 -    
3  income level (microeconomics general distribut...    D31 -    
4  higher income (microeconomics general distribu...    D31 -    

                                        cause_tokens  \
0  [job, mobility, (, labor, and, demographic, ec...   
1  [specific, training, (, business, administrati...   
2  [job, duration, (, mathematical, and, quantita...   
3  [income, level, (, microeconomics, general, di...   
4  [higher, income, (, microeconomics, general, d...   

                                     cause_embedding  
0  [-0.01640280708670616, 0.007269580382853746, -...  
1  [0.012148850597441196, 0.023789776489138603, -...  
2  [0.009822586551308632, 0.02287955768406391, -0...  
3  [-0.0207846

In [101]:
# Save the embeddings to a csv
cause_df.to_csv('cause_embeddings.csv', index=False)
effect_df.to_csv('effect_embeddings.csv', index=False)

In [132]:
# Take a sample sentence and generate its embedding
sample_sentence = "Labour disruptions in Canada"
sample_embedding = model.encode(sample_sentence).tolist()
print(f"\nSample Sentence: {sample_sentence}")
print(f"Sample Embedding: {sample_embedding}")


Sample Sentence: Labour disruptions in Canada
Sample Embedding: [-0.019970569759607315, -0.0023622591979801655, -0.04118318855762482, 0.021841993555426598, 0.00020209577633067966, -0.03272314369678497, -0.004295450169593096, -0.011540118604898453, 0.021282963454723358, 0.07546985894441605, 0.011179343797266483, 0.03844108060002327, -0.012067539617419243, -0.03874942287802696, -0.0026552630588412285, -0.013095649890601635, -0.04041828587651253, -0.033384472131729126, -0.012299680151045322, 0.03140386566519737, 0.014100807718932629, -0.04606326296925545, -0.021025866270065308, -0.06851112842559814, 0.004398063290864229, 0.025733141228556633, -0.014115634374320507, -0.059316717088222504, 0.048685114830732346, 0.06282421946525574, -0.04376908019185066, -0.027806874364614487, -0.012487949803471565, -0.045125119388103485, -0.013649088330566883, -0.039080869406461716, 0.0017843031091615558, -0.0364767424762249, -0.023214440792798996, -0.020994000136852264, -0.022532083094120026, -0.042779181

In [133]:
# Perform a kNN search to find the most similar causes to the sample sentence
from sklearn.neighbors import NearestNeighbors
import numpy as np
cause_embeddings = np.array(cause_df['cause_embedding'].tolist())
nbrs = NearestNeighbors(n_neighbors=10, algorithm='auto').fit(cause_embeddings)
distances, indices = nbrs.kneighbors([sample_embedding])
print("\n--- Nearest Neighbors to Sample Sentence ---")
for i, index in enumerate(indices[0]):
    print(f"Rank {i+1}: {cause_df.iloc[index]['cause']} (Distance: {distances[0][i]})")

# Save the results to a variable
neighbors = [
    {"cause": cause_df.iloc[index]['cause'], "distance": distances[0][i]}
    for i, index in enumerate(indices[0])
]


--- Nearest Neighbors to Sample Sentence ---
Rank 1: closing of the frontier (economic history general regional and urban history general regional and urban history us canada pre1913) (Distance: 0.8041261660865661)
Rank 2: job destruction shocks (labor and demographic economics general mobility unemployment vacancies and immigrant workers general labor turnover vacancies layoffs guideline covers studies about issues related to employment status and conditions from both workers and employers perspectives including separations hires redundancy job tenure job security and displacement keywords dismissal displaced firing hiring job turnover labor turnover layoffs plant closing quit rates quitting termination vacancies) (Distance: 0.8849342980724509)
Rank 3: heterogeneous workers (labor and demographic economics general demand and supply of labor general time allocation work behavior and employment determination other keywords labor market workplace) (Distance: 0.89389335221586)
Rank 4: ne

In [134]:

# Load the Original Dataset and Prepare for Lookup 
df_causal = pd.read_csv('causal_claims_processed.csv')

# print("--- Applying Robust Cleaning to JEL Codes ---")
# # For the unique causes lookup table
# cause_df['jel_cause'] = cause_df['jel_cause'].str.replace(' -', '', regex=False).str.strip()

# # For the original, full dataset
# df_causal['Cause_JEL'] = df_causal['Cause_JEL'].str.replace(' -', '', regex=False).str.strip()
# df_causal['Effect_JEL'] = df_causal['Effect_JEL'].str.replace(' -', '', regex=False).str.strip()


# # --- (Optional but Recommended) Diagnostic Check ---
# # Let's look at the first few values from each column to confirm they are clean.
# print("\nCleaned JEL codes from cause_df:\n", cause_df['jel_cause'].head())
# print("\nCleaned JEL codes from df_causal:\n", df_causal['Cause_JEL'].head())



# print("\n\n--- Finding Causal Links for Top kNN Results ---")

# Loop through each neighbor found by the kNN search
for i, neighbor in enumerate(neighbors):
    long_cause_text = neighbor['cause']
    distance = neighbor['distance']

    print("--------------------------------------------------")
    print(f"Matching Rank {i+1} Cause (Distance: {distance:.4f})")
    print(f"-> Text: \"{long_cause_text[:80]}...\"") # Truncate for readability

    # Look up the long cause text in cause_df to find its JEL code
    cause_row = cause_df[cause_df['cause'] == long_cause_text]

    if cause_row.empty:
        print("  -> Could not find this cause text in the `cause_df`. Skipping.")
        continue

    # Get the (now clean) JEL code
    jel_to_find = cause_row['jel_cause'].iloc[0]
    print(f"-> Found corresponding JEL Code: '{jel_to_find}'")

    # Use the clean JEL code to find links in the (now clean) original dataset
    linked_effects_df = df_causal[df_causal['Cause_JEL'] == jel_to_find]

    # Display the results
    if linked_effects_df.empty:
        print("  -> No direct causal links found for this JEL code in the dataset.")
    else:
        # Get unique effects to avoid printing the same link multiple times
        unique_effects = linked_effects_df[['Effect_JEL', 'Effect_Text', 'Claim', 'Paper_ID']].drop_duplicates()
        
        print(f"  -> Found {len(unique_effects)} unique linked effect(s):")
        for _, effect_row in unique_effects.iterrows():
            print(f"    - Effect: \"{effect_row['Effect_Text']}\" (JEL: {effect_row['Effect_JEL']})")
            print(f"      Claim: '{effect_row['Claim']}' (Source Paper: {effect_row['Paper_ID']})")

print("--------------------------------------------------")

--------------------------------------------------
Matching Rank 1 Cause (Distance: 0.8041)
-> Text: "closing of the frontier (economic history general regional and urban history gen..."
-> Found corresponding JEL Code: 'N91'
  -> Found 1 unique linked effect(s):
    - Effect: "reduced mobility" (JEL: J62)
      Claim: 'closing of the frontier -> reduced mobility' (Source Paper: w11324)
--------------------------------------------------
Matching Rank 2 Cause (Distance: 0.8849)
-> Text: "job destruction shocks (labor and demographic economics general mobility unemplo..."
-> Found corresponding JEL Code: 'J63'
  -> Found 3 unique linked effect(s):
    - Effect: "unemployment dynamics" (JEL: J64)
      Claim: 'job destruction shocks -> unemployment dynamics' (Source Paper: w11692)
    - Effect: "shorter job tenure" (JEL: J63)
      Claim: 'higher incidence of job changes -> shorter job tenure' (Source Paper: w11808)
    - Effect: "union voting behavior" (JEL: J51)
      Claim: 'perceived 

In [None]:
# USE LLM AS A DICTIONARY.

# Explain the different situations and give a conclusion.

# Try using something different than textual similarity.

# Create new links between cause and effects (not listed in the data already).

# Add chain of thoughts into the question. Link from the question to the concept. (When there is a policy change, what kind of aspects do we want to learn. Is there an impact on the incomes of people? etc.)