We need to create a simple pipeline that scales the dataset (min-max scaling of every column to the [0, 1] range). The same fitted pipeline will be reused later to scale user inputs in exactly the same way.

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
import pandas as pd


In [3]:
# Load the merged country-level dataset. The "Unnamed: 0" column holds the
# country names (see the displayed head), so it becomes the index and is
# relabelled "country".
raw_csv_path = "../raw_data/merged_country_level/merged_dataset_with_knn.csv"
df = (
    pd.read_csv(raw_csv_path)
    .set_index("Unnamed: 0")
    .rename_axis("country")
)

# Quick sanity check: (rows, columns), followed by the first few rows.
print(df.shape)
df.head()


(155, 5)


Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,960.545,18.1,3.88,24.9,24.24
albania,518.916429,22.2,81.41,55.3,48.21
algeria,356.0455,22.8,16.54,47.4,54.43
andorra,1257.46,11.1,113.413333,84.7,75.56
angola,740.635,27.1,22.91,33.7,36.58


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# of the unscaled dataset, to see the original value ranges before scaling.
df.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,155.0,155.0,155.0,155.0,155.0
mean,1085.577345,21.283656,99.077806,53.684301,58.613742
std,848.482198,6.835107,78.998202,15.264927,13.547936
min,227.355,1.2,3.23,19.3,23.86
25%,579.861406,15.35,38.45,41.966667,48.55
50%,807.235,23.0,81.78,53.0,59.38
75%,1327.921731,27.0,147.25,66.05,68.55
max,6790.835,32.1,345.33,84.7,86.5


In [10]:
# Scaling pipeline: a ColumnTransformer that applies MinMaxScaler to every column.
from sklearn.compose import ColumnTransformer

# One transformer entry, named "minmax", covering all columns of df.
minmax_step = ("minmax", MinMaxScaler(), df.columns)
column_transformer = ColumnTransformer(transformers=[minmax_step])

# Wrap the column transformer in a single-step pipeline so it can be
# fitted once here and reused as one object.
pipeline = Pipeline(steps=[("column_transformer", column_transformer)])

# Fit on df and transform it in the same call, then put the transformed
# values back into a DataFrame carrying the original column and index labels.
df_scaled = pd.DataFrame(
    pipeline.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

df_scaled.head()

Unnamed: 0_level_0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
afghanistan,0.111708,0.546926,0.0019,0.085627,0.006066
albania,0.044422,0.679612,0.22853,0.550459,0.388729
algeria,0.019607,0.699029,0.038907,0.429664,0.488027
andorra,0.156945,0.320388,0.322079,1.0,0.825351
angola,0.078202,0.838188,0.057527,0.220183,0.203065


In [11]:
# Summary statistics of the scaled data. After min-max scaling every column
# should report min == 0 and max == 1, which serves as a visual sanity check.
df_scaled.describe()

Unnamed: 0,average_monthly_cost_$,average_yearly_temperature,internet_speed_mbps,safety_index,Healthcare Index
count,155.0,155.0,155.0,155.0,155.0
mean,0.130757,0.649957,0.280175,0.525754,0.554817
std,0.129273,0.221201,0.230921,0.233409,0.216283
min,0.0,0.0,0.0,0.0,0.0
25%,0.053707,0.457929,0.102952,0.346585,0.394157
50%,0.088349,0.705502,0.229611,0.515291,0.56705
75%,0.16768,0.834951,0.420988,0.714832,0.713442
max,1.0,1.0,1.0,1.0,1.0


In [12]:
# Persist the scaled dataset (index included) as CSV next to the raw merged data.
scaled_csv_path = "../raw_data/merged_country_level/scaled_merged_data_after_imputation.csv"
df_scaled.to_csv(scaled_csv_path)

In [13]:
import pickle

# Serialise the fitted scaling pipeline to disk so the same scaling can be
# applied later without refitting.
with open('../models/scaling_pipeline.pkl', mode='wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)