We need to create a simple pipeline to use for standardization of the dataset (that will be used later for standardisation of user inputs)

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


In [2]:
df = pd.read_csv("../raw_data/merged_country_level/final_merged_dataset_with_knn.csv").set_index("Unnamed: 0")
df.index.name = "country"

print(df.shape)
df.head()


(152, 5)


Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,960.545,18.1,3.88,24.9,24.24
albania,518.916429,22.2,81.41,55.3,48.21
algeria,356.0455,22.8,16.54,47.4,54.43
andorra,1257.46,11.1,113.413333,84.7,75.56
angola,740.635,27.1,22.91,33.7,36.58


In [3]:
df.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,152.0,152.0,152.0,152.0,152.0
mean,1041.129431,21.048465,100.274408,54.020833,58.7
std,794.654387,6.84672,79.310999,15.223158,13.368145
min,227.355,1.2,3.23,19.3,23.86
25%,576.533609,14.991667,38.78,43.425,48.575
50%,803.927917,22.45,82.9,53.15,59.34
75%,1262.171023,26.925,148.0175,66.45,68.58
max,6790.835,32.1,345.33,84.7,86.5


In [4]:
df.loc['russia']

average_monthly_cost_$        774.039286
average_yearly_temperature     11.800000
internet_speed_mbps            90.400000
safety_index                   61.300000
Healthcare Index               61.410000
Name: russia, dtype: float64

In [5]:
#pipe with column transformer and minmaxscaler
from sklearn.compose import ColumnTransformer

# Define the column transformer with MinMaxScaler for all columns
column_transformer = ColumnTransformer(
    transformers=[
        ("minmax", MinMaxScaler(), df.columns)
    ]
)

# Create a new pipeline with the column transformer
pipeline = Pipeline(steps=[
    ("column_transformer", column_transformer)
])

# Fit the pipeline to the data
df_scaled = pipeline.fit_transform(df)

df_scaled = pd.DataFrame(df_scaled, columns=df.columns, index=df.index)

df_scaled.head()

Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,0.111708,0.546926,0.0019,0.085627,0.006066
albania,0.044422,0.679612,0.22853,0.550459,0.388729
algeria,0.019607,0.699029,0.038907,0.429664,0.488027
andorra,0.156945,0.320388,0.322079,1.0,0.825351
angola,0.078202,0.838188,0.057527,0.220183,0.203065


In [6]:
df_scaled.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,152.0,152.0,152.0,152.0,152.0
mean,0.123985,0.642345,0.283673,0.5309,0.556194
std,0.121072,0.221577,0.231836,0.23277,0.213412
min,0.0,0.0,0.0,0.0,0.0
25%,0.0532,0.446332,0.103917,0.368884,0.394556
50%,0.087846,0.687702,0.232885,0.517584,0.566411
75%,0.157663,0.832524,0.423232,0.720948,0.713921
max,1.0,1.0,1.0,1.0,1.0


In [7]:
df_scaled.to_csv("../raw_data/merged_country_level/scaled_merged_data_after_imputation.csv")

In [8]:
import pickle
with open('../models/scaling_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

### extra section: figure out how to scale and extra column

In [9]:
print(pipeline.named_steps.keys())  # This will show you the steps in your pipeline

dict_keys(['column_transformer'])


In [10]:
user_input_dict = {}
user_input_dict["max_monthly_budget"] = 1257.4
normalized_inputs = {}

In [11]:
# Get the column transformer
column_transformer = pipeline.named_steps['column_transformer']
max_monthly_budget = user_input_dict["max_monthly_budget"]
        
# Transform the max monthly budget directly (it will apply the appropriate MinMaxScaler)
budget_transformed = column_transformer.transform([[max_monthly_budget, 0, 0, 0, 0]])[0][0]
normalized_inputs["max_monthly_budget"] = budget_transformed
normalized_inputs

ValueError: Specifying the columns using strings is only supported for dataframes.

In [None]:

cost_scale = cost_scaler.scale_[0]  # The scale (std dev) for the first column (monthly cost)
cost_mean = cost_scaler.mean_[0]    # The mean for the first column (monthly cost)


# Apply the same scaling to max_monthly_budget
normalized_max_monthly_budget = (user_input_dict["max_monthly_budget"] - cost_mean) / cost_scale
normalized_inputs["max_monthly_budget"] = normalized_max_monthly_budget