In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Load both inputs: the goods-by-company table (CSV) and the child labor analysis (Excel)
goods_data = pd.read_csv('Clean_Goods_by_Company.csv')
child_labor_data = pd.read_excel('filtered_merged_child_labor_analysis.xlsx')

# Show each table's column index under its heading to confirm the data structure
for _heading, _frame in (
    ("Columns in Clean_Goods_by_Company:", goods_data),
    ("Columns in filtered_merged_child_labor_analysis:", child_labor_data),
):
    print(_heading)
    print(_frame.columns)

# Inner-join the two tables on 'company_name' (only names found in both are kept)
merged_data = goods_data.merge(child_labor_data, on='company_name', how='inner')

# Map each binary indicator to the country column it is derived from.
# An indicator is 1 where the source column holds a value and 0 where it is missing.
indicator_sources = {
    'producing_countries': 'manufacturing_country',
    'purchasing_countries': 'sourced_from_country',
    'importing_countries': 'importer_country',
    'exporting_countries': 'exporter_country',
}
for indicator, source_column in indicator_sources.items():
    merged_data[indicator] = merged_data[source_column].notna().astype(int)

# Features (X): the four indicators, in the order listed above
X = merged_data[list(indicator_sources)]
# Target (y): a KeyError here means the score column is absent from the merged table
y = merged_data['child_labor_score_sum']

# Split the raw features into training and testing sets BEFORE any scaling,
# so that the scaler's statistics are never computed from held-out rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Normalize the feature data: learn min/max from the training rows only,
# then apply that same transformation to the held-out rows.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Scaled view of the full feature matrix, kept available under this name
# for any later code that reads it; it is not used for fitting.
X_scaled = scaler.transform(X)

# Initialize and train the Random Forest regressor on the training partition.
# NOTE(review): X_test / y_test are produced but never scored in this cell —
# consider checking model.score(X_test, y_test) before trusting the weights.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Read the per-feature importances from the fitted model
importances = model.feature_importances_

# Pair each country-type feature with the importance at its position.
# The order here must match the column order used to build X.
feature_order = (
    'producing_countries',
    'purchasing_countries',
    'importing_countries',
    'exporting_countries',
)
weights = {
    feature: importances[position]
    for position, feature in enumerate(feature_order)
}

# Report one weight per line, rounded to four decimal places
print("Calculated Weights for Countries:")
for country_type, weight in weights.items():
    print(f"{country_type}: {weight:.4f}")


Columns in Clean_Goods_by_Company:
Index(['Unnamed: 0', 'company_name', 'exporter_country', 'importer_country',
       'manufacturing_country', 'mentioned_country', 'operates_in_country',
       'sourced_from_country', 'goods_mentioned'],
      dtype='object')
Columns in filtered_merged_child_labor_analysis:
Index(['company_id', 'company_name', 'doc_score', 'sectors', 'sub_industries',
       'extracted_text', 'country_child_labor_flags', 'child_labor_score_sum'],
      dtype='object')
Calculated Weights for Countries:
producing_countries: 0.1073
purchasing_countries: 0.0517
importing_countries: 0.7651
exporting_countries: 0.0759
