In [24]:
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import math
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [25]:
# Load the cleaned dataset (one row per LOCATION and TIME).
file_path = 'Cleaned_data_with_population.csv'

df = pd.read_csv(file_path)

# Sort by LOCATION and TIME so that pct_change compares each row with the
# preceding TIME row of the same LOCATION.
df = df.sort_values(['LOCATION', 'TIME'])

# Growth rate of each selected column, computed within each LOCATION.
growth_columns = ['PC_HEALTHXP', 'PC_GDP', 'USD_CAP', 'TOTAL_SPEND', 'POPULATION']
growth_rate_columns = [f'{col}_GROWTH' for col in growth_columns]
for col, growth_col in zip(growth_columns, growth_rate_columns):
    df[growth_col] = df.groupby('LOCATION')[col].pct_change()

growth_rates = df[growth_rate_columns]

# The first row of every LOCATION has no predecessor, so all of its growth
# values are NaN. Calling idxmax on such all-NaN rows is deprecated in pandas
# (and will raise ValueError in a future version), so the winning metric is
# only looked up for rows that have at least one growth value. Rows left out
# here are filled with NaN by index alignment on assignment, which matches
# the previous result.
has_growth = growth_rates.notna().any(axis=1)

# Metric with the highest growth in each row, with the "_GROWTH" suffix
# stripped (literal replacement, not a regular expression).
df['HIGHEST_GROWTH_METRIC'] = (
    growth_rates[has_growth]
    .idxmax(axis=1)
    .str.replace('_GROWTH', '', regex=False)
)
# max() skips NaN, so all-NaN rows simply yield NaN here.
df['HIGHEST_GROWTH_VALUE'] = growth_rates.max(axis=1)

# Save the result to csv
df.to_csv('Cleaned_data_with_population_growth.csv', index=False)

df



The behavior of DataFrame.idxmax with all-NA values, or any-NA and skipna=False, is deprecated. In a future version this will raise ValueError



Unnamed: 0,LOCATION,TIME,PC_HEALTHXP,PC_GDP,USD_CAP,TOTAL_SPEND,POPULATION,PC_HEALTHXP_GROWTH,PC_GDP_GROWTH,USD_CAP_GROWTH,TOTAL_SPEND_GROWTH,POPULATION_GROWTH,HIGHEST_GROWTH_METRIC,HIGHEST_GROWTH_VALUE
0,AUS,2011,15.311,1.307,583.222,13029.19,22340024.0,,,,,,,
1,AUS,2012,15.308,1.328,589.993,13412.59,22733465.0,-0.000196,0.016067,0.011610,0.029426,0.017611,TOTAL_SPEND,0.029426
2,AUS,2013,14.345,1.255,586.402,13562.38,23128129.0,-0.062908,-0.054970,-0.006087,0.011168,0.017360,POPULATION,0.017360
3,AUS,2014,12.498,1.229,569.455,13368.35,23475686.0,-0.128756,-0.020717,-0.028900,-0.014306,0.015027,POPULATION,0.015027
4,AUS,2015,12.442,1.267,594.233,14152.25,23815995.0,-0.004481,0.030919,0.043512,0.058639,0.014496,TOTAL_SPEND,0.058639
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
372,USA,2016,12.263,2.060,1192.301,385198.78,323071755.0,-0.034941,-0.016706,0.002408,0.009698,0.007273,TOTAL_SPEND,0.009698
373,USA,2017,11.954,2.004,1200.769,390396.57,325122128.0,-0.025198,-0.027184,0.007102,0.013494,0.006346,TOTAL_SPEND,0.013494
374,USA,2018,11.766,1.957,1229.266,401771.09,326838199.0,-0.015727,-0.023453,0.023732,0.029136,0.005278,TOTAL_SPEND,0.029136
375,USA,2019,11.767,1.961,1277.032,419287.86,328329953.0,0.000085,0.002044,0.038857,0.043599,0.004564,TOTAL_SPEND,0.043599


In [26]:
# Quantile cut-offs used to flag countries of interest: average USD_CAP growth
# in the top quarter and average total spend in the bottom quarter.
GROWTH_QUANTILE = 0.75
SPEND_QUANTILE = 0.25

# Average USD_CAP growth and average total spend per location. Named
# aggregation produces the final column names directly.
summary = (
    df.groupby('LOCATION')
    .agg(
        AVG_USD_CAP_GROWTH=('USD_CAP_GROWTH', 'mean'),
        AVG_TOTAL_SPEND=('TOTAL_SPEND', 'mean'),
    )
    .reset_index()
)

# Thresholds are taken across the per-location averages.
growth_threshold = summary['AVG_USD_CAP_GROWTH'].quantile(GROWTH_QUANTILE)
spend_threshold = summary['AVG_TOTAL_SPEND'].quantile(SPEND_QUANTILE)

# Label countries of interest with a vectorised mask instead of a row-wise
# apply. Comparisons against NaN are False, so a location with a missing
# average is labelled "Other".
is_potential = (
    (summary['AVG_USD_CAP_GROWTH'] >= growth_threshold)
    & (summary['AVG_TOTAL_SPEND'] <= spend_threshold)
)
summary['TARGET'] = np.where(is_potential, 'Potential', 'Other')

# Interactive Plotly scatter plot: one point per location, coloured by label.
fig = px.scatter(
    summary,
    x="AVG_TOTAL_SPEND",
    y="AVG_USD_CAP_GROWTH",
    color="TARGET",
    text="LOCATION",
    hover_data=["LOCATION", "AVG_USD_CAP_GROWTH", "AVG_TOTAL_SPEND"],
    color_discrete_map={"Potential": "red", "Other": "gray"},
    labels={
        "AVG_TOTAL_SPEND": "Avg Total Spend",
        "AVG_USD_CAP_GROWTH": "Avg USD_CAP Growth Rate",
        "TARGET": "Country Type"
    },
    title="Countries with High USD_CAP Growth and Low Total Spend"
)

# Improve layout: place the location labels above their markers.
fig.update_traces(textposition='top center')
# NOTE(review): this sets the legend title to "Target Country", while the
# `labels` mapping above names the same field "Country Type" - confirm which
# wording is intended.
fig.update_layout(legend=dict(title="Target Country"))
fig.show()