In [23]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pycountry import countries
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans


In [28]:
# Location of the raw natural-resource table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run elsewhere if this constant is pointed at a local copy of the file.
NR_CSV_PATH = r"C:\Users\ayaan\OneDrive - London School of Economics\Academics\MPA_DSPP\Moody's - Capstone Project\Capstone-Ayaan-ASUS\MASTER\NaturalResource.csv"

# The first CSV column is used as the row index rather than as data.
nr = pd.read_csv(NR_CSV_PATH, index_col=0)

# Optional sensitivity check: uncomment to exclude China from the analysis.
#nr = nr[nr["Country"] != "China"]

In [29]:
# Reshape long -> wide: one row per (Country, Country Code, Year, Population)
# key and one column per resource, filled with Production_TotalValue.
# pivot_table aggregates with its default (mean) if a key/resource pair
# appears more than once.
# NOTE(review): Population is part of the row key, so rows where it is missing
# are presumably dropped by the pivot — confirm that is intended.
df_pivot = (
    nr.pivot_table(
        values='Production_TotalValue',                               # cell values
        index=['Country', 'Country Code', 'Year', 'Population'],      # row keys
        columns='Resource',                                           # one column per resource
    )
    .reset_index()
    .drop(columns="Population")   # only needed for the (disabled) per-capita step below
    .fillna(0)                    # a missing key/resource pair is treated as zero production
)

# Disabled per-capita normalisation, kept for reference (it would have to run
# before Population is dropped):
#resource_cols = df_pivot.columns.difference(['Country', 'Year', 'Population'])
#df_pivot[resource_cols] = df_pivot[resource_cols].div(df_pivot['Population'], axis=0)

df_pivot

Resource,Country,Country Code,Year,Aluminium,Bauxite,Cadmium,Coal,Cobalt,Copper,Gold,...,Manganese,Natural Gas,Natural Graphite,Nickel,Oil,Rare Earth,Silver,Tin,Vanadium,Zinc
0,Albania,ALB,1995.0,0.0,38854.8,0.0,4.810329e+06,0.0,1.189900e+07,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Albania,ALB,1996.0,0.0,94302.0,0.0,3.739706e+06,0.0,6.575000e+06,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Albania,ALB,1997.0,0.0,109989.1,0.0,1.469604e+06,0.0,7.776000e+05,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Albania,ALB,1998.0,0.0,92467.2,0.0,1.423717e+06,0.0,3.979000e+06,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Albania,ALB,1999.0,0.0,95254.4,0.0,8.098382e+05,0.0,1.425160e+06,0.000000e+00,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3314,Zimbabwe,ZWE,2017.0,0.0,0.0,0.0,1.274859e+08,0.0,3.694702e+07,6.460830e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,540200.0,0.0,0.0,0.0
3315,Zimbabwe,ZWE,2018.0,0.0,0.0,0.0,1.705706e+08,0.0,3.884956e+07,9.289310e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,505776.0,0.0,0.0,0.0
3316,Zimbabwe,ZWE,2019.0,0.0,0.0,0.0,9.490413e+07,0.0,3.419100e+07,8.416694e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,486718.0,0.0,0.0,0.0
3317,Zimbabwe,ZWE,2020.0,0.0,0.0,0.0,7.231872e+07,0.0,3.157334e+07,7.493407e+08,...,0.0,0.0,0.0,0.0,0.0,0.0,633152.0,0.0,0.0,0.0


In [30]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# =============================================================================
# STEP 1: Prepare data (one snapshot row per country)
# =============================================================================

# Rows are ordered by Year ascending and the first row of each country group is
# kept, so this snapshot is the EARLIEST year on record for each country.
# Switch to ascending=False to take the most recent year instead.
# The `df_latest` name is kept because later cells refer to it.
df_latest = (
    df_pivot
    .sort_values('Year', ascending=True)
    .groupby(['Country', 'Country Code'])
    .first()
    .reset_index()
)

# Every column that is not an identifier is treated as a resource feature.
feature_cols = df_latest.columns.drop(['Country', 'Country Code', 'Year']).tolist()

# =============================================================================
# STEP 2: Log transform to reduce outlier influence
# =============================================================================

X = df_latest.loc[:, feature_cols].fillna(0)
# log(1 + x) keeps zero production at exactly zero while compressing large values.
X_log = np.log1p(X)

# =============================================================================
# STEP 3: Scale the log-transformed data
# =============================================================================
# Standardise every resource column so no single resource dominates the PCA.
scaler = StandardScaler().fit(X_log)
X_scaled = scaler.transform(X_log)

# =============================================================================
# STEP 4: PCA - Dimensionality Reduction
# =============================================================================

pca = PCA(n_components=3)
pca_components = pca.fit_transform(X_scaled)

# Report per-component and running explained variance.
variance_share = pca.explained_variance_ratio_
running_share = np.cumsum(variance_share)

print("\nPCA Explained Variance:")
for pc_number, (share, running) in enumerate(zip(variance_share, running_share), start=1):
    print(f"  PC{pc_number}: {share*100:.1f}% (cumulative: {running*100:.1f}%)")

# Leading components handed to the clustering step (here: all fitted ones).
n_components_for_clustering = 3
X_pca = pca_components[:, :n_components_for_clustering]

print(f"\nUsing {n_components_for_clustering} PCs for clustering")
print(f"  Explains {variance_share[:n_components_for_clustering].sum()*100:.1f}% of variance")

# =============================================================================
# STEP 5: K-Means Clustering on PCA Components
# =============================================================================

n_clusters = 6
# K-Means starts from random centroids. Without a fixed seed the cluster ids
# (and therefore the colours, legend and printed cluster membership in later
# cells) change on every run. Seeding makes Restart & Run All reproducible.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
# One integer label per row of X_pca, i.e. per country in df_latest.
clusters = kmeans.fit_predict(X_pca)

# =============================================================================
# STEP 6: Create results dataframe
# =============================================================================

# One row per country: identifiers, the three PCA scores, and the cluster label.
# The label is stored as a string so plotly treats it as a discrete category.
pca_df = pd.DataFrame({
    **{id_col: df_latest[id_col] for id_col in ('Country', 'Country Code', 'Year')},
    **{f'PC{k + 1}': pca_components[:, k] for k in range(3)},
    'Cluster': clusters.astype(str),
})

# =============================================================================
# STEP 7: Calculate PCA loadings for biplot arrows
# =============================================================================

# Loadings = component directions scaled by sqrt(explained variance);
# rows are features, columns are components.
loadings = np.sqrt(pca.explained_variance_) * pca.components_.T

# Only the first two components are drawn on the biplot.
loadings_df = pd.DataFrame(
    {'PC1': loadings[:, 0], 'PC2': loadings[:, 1]},
    index=feature_cols,
)

# Stretch the arrows so they are readable next to the score scatter.
scale_factor = 2.5
loadings_df_scaled = loadings_df.mul(scale_factor)

# Rank features by |PC1 loading| + |PC2 loading| and keep the strongest ones.
top_n = 15
loading_importance = loadings_df.abs().sum(axis=1)
top_features = loading_importance.nlargest(top_n).index

print(f"\nTop {top_n} most influential resources:")
for feat, weight in loading_importance[top_features].items():
    print(f"  {feat}: {weight:.3f}")

# =============================================================================
# STEP 8: Create the biplot
# =============================================================================

# Country scores on PC1/PC2, coloured by K-Means cluster.
fig = px.scatter(
    pca_df,
    x='PC1',
    y='PC2',
    color='Cluster',
    hover_data=['Country', 'Country Code', 'Year'],
    color_discrete_sequence=px.colors.qualitative.Bold,
    title=f'PCA Biplot of K-Means Clusters (k={n_clusters})<br><sup>Log-transformed data | Clustering on PC1-PC{n_components_for_clustering}</sup>',
)

# Overlay one loading arrow (origin -> scaled loading) plus a text label per
# top feature. Both are annotations positioned in data coordinates.
arrow_style = dict(
    showarrow=True,
    arrowhead=2,
    arrowsize=1,
    arrowwidth=2,
    arrowcolor='black',
)
data_coords = dict(xref='x', yref='y')

for feature in top_features:
    tip_x = loadings_df_scaled.loc[feature, 'PC1']
    tip_y = loadings_df_scaled.loc[feature, 'PC2']

    # Arrow: tail at the origin (ax, ay), head at the scaled loading.
    fig.add_annotation(
        x=tip_x, y=tip_y,
        ax=0, ay=0,
        axref='x', ayref='y',
        **data_coords,
        **arrow_style,
    )

    # Label: pushed 15% beyond the arrow head so it does not overlap it.
    fig.add_annotation(
        x=tip_x * 1.15,
        y=tip_y * 1.15,
        text=feature,
        showarrow=False,
        font=dict(size=10, color='black'),
        **data_coords,
    )

# Axis titles carry the share of variance explained by each component.
pc1_share, pc2_share = pca.explained_variance_ratio_[:2]
fig.update_layout(
    width=1000,
    height=700,
    xaxis_title=f"PC1 ({pc1_share*100:.1f}%)",
    yaxis_title=f"PC2 ({pc2_share*100:.1f}%)",
)

fig.show()


PCA Explained Variance:
  PC1: 35.3% (cumulative: 35.3%)
  PC2: 9.8% (cumulative: 45.1%)
  PC3: 8.8% (cumulative: 53.8%)

Using 3 PCs for clustering
  Explains 53.8% of variance

Top 15 most influential resources:
  Silver: 1.234
  Zinc: 1.208
  Lead: 1.187
  Copper: 1.155
  Rare Earth: 1.148
  Vanadium: 0.948
  Nickel: 0.871
  Magnesium compounds: 0.865
  Gold: 0.857
  Coal: 0.824
  Bauxite: 0.807
  Natural Graphite: 0.790
  Manganese: 0.773
  Natural Gas: 0.732
  Aluminium: 0.719


In [31]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from pycountry import countries

# =============================================================================
# STEP 1: Create hover text with top resources
# =============================================================================

def _format_usd(value):
    """Format a dollar amount compactly, e.g. 2.5e9 -> '$2.5B', 750 -> '$750'."""
    # Inclusive thresholds: exactly 1e9 must render as "$1.0B", not "$1000.0M"
    # (and likewise for the M and K boundaries).
    for threshold, suffix in ((1e9, "B"), (1e6, "M"), (1e3, "K")):
        if value >= threshold:
            return f"${value/threshold:.1f}{suffix}"
    return f"${value:.0f}"


def create_hover_text(row, df_latest, feature_cols):
    """Create informative hover text for each country.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'Country'`` and ``'Cluster'``.
    df_latest : pandas.DataFrame
        Table with a ``'Country'`` column and the columns in ``feature_cols``.
        The first row whose ``'Country'`` equals ``row['Country']`` is used.
    feature_cols : list
        Names of the numeric resource columns to rank.

    Returns
    -------
    str
        HTML snippet (lines joined with ``<br>``): country name in bold, the
        cluster label, and up to three resources with the largest strictly
        positive values, or a placeholder line when there are none.

    Raises
    ------
    IndexError
        If no row of ``df_latest`` matches ``row['Country']``.
    """
    country = row['Country']
    cluster = row['Cluster']

    # Resource values for this country (first matching row).
    country_data = df_latest[df_latest['Country'] == country][feature_cols].iloc[0]

    # Three largest values, then discard zeros so non-producers list nothing.
    top_resources = country_data.nlargest(3)
    top_resources = top_resources[top_resources > 0]

    lines = [
        f"<b>{country}</b>",
        f"Cluster: {cluster}",
        "",
        "Top Resources:"
    ]

    for resource, value in top_resources.items():
        lines.append(f"  {resource}: {_format_usd(value)}")

    if len(top_resources) == 0:
        lines.append("  (No significant production)")

    return "<br>".join(lines)

# Build the hover string for every country row. The extra positional arguments
# are forwarded to create_hover_text after the row itself.
pca_df['hover_text'] = pca_df.apply(
    create_hover_text,
    axis=1,
    args=(df_latest, feature_cols),
)

# =============================================================================
# STEP 2: Create the choropleth map
# =============================================================================

# Keep only rows that have a country code to place on the map.
df_map = pca_df[pca_df['Country Code'].notna()].copy()

# Cluster labels are stored as strings; convert to int for sorting and for
# indexing into the colour palette.
df_map['Cluster_int'] = df_map['Cluster'].astype(int)

fig = go.Figure()

# One trace per cluster, so each cluster gets its own legend entry and a single
# flat colour.
colors = px.colors.qualitative.Bold
n_clusters = df_map['Cluster_int'].nunique()

for cluster_id in sorted(df_map['Cluster_int'].unique()):
    subset = df_map[df_map['Cluster_int'] == cluster_id]
    # Wrap around the palette so a cluster id beyond its length cannot raise
    # IndexError (colours repeat in that case).
    cluster_color = colors[cluster_id % len(colors)]

    fig.add_trace(go.Choropleth(
        locations=subset['Country Code'],
        z=[cluster_id] * len(subset),
        # Same colour at both ends of the scale -> uniform fill for the trace.
        colorscale=[[0, cluster_color], [1, cluster_color]],
        showscale=False,
        # Choropleth traces are left out of the legend unless this is set, so
        # without it the legend configured below would never be shown.
        showlegend=True,
        customdata=subset['hover_text'].values,
        hovertemplate="%{customdata}<extra></extra>",
        name=f"Cluster {cluster_id} ({len(subset)})",
        marker=dict(line=dict(color='white', width=0.5))
    ))

fig.update_geos(
    projection_type="natural earth",
    showcountries=True,
    showcoastlines=True,
    countrycolor="lightgray",
    coastlinecolor="lightgray",
    showland=True,
    landcolor="whitesmoke"
)

fig.update_layout(
    title=f"Resource Production Clusters (k={n_clusters})<br><sup>Log-transformed production values | PCA + K-Means</sup>",
    width=1200,
    height=700,
    legend=dict(
        x=0.02,
        y=0.5,
        bgcolor="rgba(255,255,255,0.8)",
        title="Clusters"
    ),
    geo=dict(
        showframe=False,
        showocean=True,
        oceancolor="lightblue"
    )
)

fig.show()

# =============================================================================
# STEP 3: Print cluster summary with geography
# =============================================================================

rule = "=" * 70
print("\n" + rule)
print("CLUSTER GEOGRAPHIC SUMMARY")
print(rule)

# groupby iterates the cluster ids in ascending order; list each cluster's
# member countries alphabetically.
for cluster_id, members in df_map.groupby('Cluster_int')['Country']:
    member_names = sorted(members.tolist())
    print(f"\nCluster {cluster_id} ({len(member_names)} countries):")
    print(f"  {', '.join(member_names)}")


CLUSTER GEOGRAPHIC SUMMARY

Cluster 0 (55 countries):
  Albania, Angola, Armenia, Austria, Bangladesh, Bosnia and Herzegovina, Botswana, Burkina Faso, Cameroon, Chad, Congo, Rep., Costa Rica, Cote d'Ivoire, Croatia, Cyprus, Czechia, Dominican Republic, Ecuador, Equatorial Guinea, Eswatini, Ethiopia, Gabon, Ghana, Guatemala, Guinea, Hungary, Israel, Jamaica, Kenya, Kyrgyz Republic, Lao PDR, Liberia, Madagascar, Malawi, Mali, Mauritania, Mozambique, Nepal, Nicaragua, Niger, Panama, Rwanda, Senegal, Serbia, Slovak Republic, Slovenia, Sri Lanka, Sudan, Switzerland, Tajikistan, Tanzania, Togo, Uganda, Uruguay, Yemen, Rep.

Cluster 1 (4 countries):
  Australia, Brazil, China, Russian Federation

Cluster 2 (26 countries):
  Bolivia, Bulgaria, Chile, Colombia, Congo, Dem. Rep., Finland, France, Georgia, Greece, Honduras, Ireland, Korea, Rep., Mongolia, Morocco, Myanmar, Namibia, New Zealand, North Macedonia, Papua New Guinea, Philippines, Portugal, Spain, Sweden, Tunisia, Zambia, Zimbabwe

Cl