We need to create a simple pipeline to use for standardization of the dataset (that will be used later for standardisation of user inputs)

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


In [4]:
df = pd.read_csv("../raw_data/merged_country_level/merged_dataset_with_knn.csv").set_index("Unnamed: 0")
df.index.name = "country"

print(df.shape)
df.head(20)


(157, 5)


Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,960.545,18.1,3.88,24.9,24.24
albania,518.916429,22.2,81.41,55.3,48.21
algeria,356.0455,22.8,16.54,47.4,54.43
angola,740.635,27.1,22.91,33.7,36.58
argentina,503.73125,15.1,93.38,36.6,68.0
armenia,1347.52,18.4,60.72,77.9,58.07
australia,1870.925,12.4,80.49,52.7,73.35
austria,1517.2375,14.6,100.45,70.5,77.83
azerbaijan,410.863333,19.0,69.32,68.2,48.31
bahamas,2080.87,25.3,69.93,43.1,39.52


In [3]:
df.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,157.0,157.0,157.0,157.0,157.0
mean,1058.652989,21.042251,100.97138,54.108705,59.137834
std,811.244804,6.818878,79.347168,15.27638,13.453259
min,227.355,1.2,3.23,19.3,23.86
25%,582.27,15.0,38.45,43.533333,49.83
50%,806.3,22.3,83.13,53.1,60.66
75%,1276.304091,26.9,148.37,66.9,68.82
max,6790.835,32.1,345.33,84.7,86.5


In [4]:
#pipe with column transformer and minmaxscaler
from sklearn.compose import ColumnTransformer

# Define the column transformer with MinMaxScaler for all columns
column_transformer = ColumnTransformer(
    transformers=[
        ("minmax", MinMaxScaler(), df.columns)
    ]
)

# Create a new pipeline with the column transformer
pipeline = Pipeline(steps=[
    ("column_transformer", column_transformer)
])

# Fit the pipeline to the data
df_scaled = pipeline.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

df_scaled.head(20)

Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,0.111708,0.546926,0.0019,0.085627,0.006066
albania,0.044422,0.679612,0.22853,0.550459,0.388729
algeria,0.019607,0.699029,0.038907,0.429664,0.488027
angola,0.078202,0.838188,0.057527,0.220183,0.203065
argentina,0.042108,0.449838,0.263519,0.264526,0.704662
armenia,0.170666,0.556634,0.16805,0.896024,0.546137
australia,0.250411,0.36246,0.22584,0.510703,0.79007
austria,0.196524,0.433657,0.284186,0.782875,0.86159
azerbaijan,0.027959,0.576052,0.193189,0.747706,0.390326
bahamas,0.282398,0.779935,0.194972,0.363914,0.25


In [5]:
df_scaled.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,157.0,157.0,157.0,157.0,157.0
mean,0.126655,0.642144,0.28571,0.532243,0.563184
std,0.1236,0.220676,0.231941,0.233584,0.214771
min,0.0,0.0,0.0,0.0,0.0
25%,0.054074,0.446602,0.102952,0.37054,0.414591
50%,0.088207,0.682848,0.233557,0.51682,0.587484
75%,0.159816,0.831715,0.424262,0.727829,0.717752
max,1.0,1.0,1.0,1.0,1.0


In [6]:
df_scaled.to_csv("../raw_data/merged_country_level/scaled_merged_data_after_imputation.csv")

In [7]:
import pickle
with open('../models/scaling_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

### extra section: figure out how to scale and extra column

In [8]:
print(pipeline.named_steps.keys())  # This will show you the steps in your pipeline

dict_keys(['column_transformer'])


In [9]:
user_input_dict = {}
user_input_dict["max_monthly_budget"] = 1257.4
normalized_inputs = {}

In [10]:
# Get the column transformer
column_transformer = pipeline.named_steps['column_transformer']
max_monthly_budget = user_input_dict["max_monthly_budget"]
        
# Transform the max monthly budget directly (it will apply the appropriate MinMaxScaler)
budget_transformed = column_transformer.transform([[max_monthly_budget, 0, 0, 0, 0]])[0][0]
normalized_inputs["max_monthly_budget"] = budget_transformed
normalized_inputs

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:

cost_scale = cost_scaler.scale_[0]  # The scale (std dev) for the first column (monthly cost)
cost_mean = cost_scaler.mean_[0]    # The mean for the first column (monthly cost)


# Apply the same scaling to max_monthly_budget
normalized_max_monthly_budget = (user_input_dict["max_monthly_budget"] - cost_mean) / cost_scale
normalized_inputs["max_monthly_budget"] = normalized_max_monthly_budget

In [5]:
countries_list = df.index.tolist()
print(countries_list)


['afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'bolivia', 'bosnia & herzegovina', 'botswana', 'brazil', 'brunei', 'bulgaria', 'cambodia', 'cameroon', 'canada', 'chile', 'china', 'colombia', 'congo', 'costa rica', 'croatia', 'cuba', 'cyprus', 'czech republic', 'denmark', 'dominican republic', 'ecuador', 'egypt', 'estonia', 'ethiopia', 'finland', 'france', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guyana', 'honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'kuwait', 'latvia', 'lebanon', 'libya', 'liechtenstein', 'lithuania', 'luxembourg', 'malaysia', 'maldives', 'malta', 'mauritius', 'mexico', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'namibia', 'nepal', 'netherlands', 'new zealand', 'nicaragua', 'nigeria', 'norway', 'oman'