In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


# Load the raw dataset from the working directory.
df = pd.read_csv("Financial_inclusion_dataset.csv")

# Remove the 'year' and 'uniqueid' columns; every remaining column except the
# target is used as a feature. Reassigning (rather than inplace=True) keeps
# the operation explicit and chainable.
df = df.drop(columns=['year', 'uniqueid'])

# Encode the target as 1/0. Series.map turns any value other than the exact
# strings 'Yes'/'No' into NaN, so fail here with a clear message instead of
# letting the NaN surface later inside model fitting.
target = df['bank_account'].map({'Yes': 1, 'No': 0})
if target.isna().any():
    unexpected = df.loc[target.isna(), 'bank_account'].unique().tolist()
    raise ValueError(
        f"Unexpected or missing values in 'bank_account': {unexpected}"
    )
df['bank_account'] = target

# Split into feature matrix and target vector.
X = df.drop(columns='bank_account')
y = df['bank_account']

# Object-dtype columns are the ones that get one-hot encoded.
categorical_cols = X.select_dtypes(include='object').columns.tolist()
# Every other column reaches the model unchanged through the transformer's
# remainder='passthrough', in original column order. Listing all of them
# (not only int64) keeps this list in sync with what the model actually sees
# when float, int32 or bool columns are present.
numerical_cols = [col for col in X.columns if col not in categorical_cols]


# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the categorical columns as a dense array; categories not
# seen during fitting are ignored at transform time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Columns not listed in categorical_cols are forwarded untouched.
preprocessor = ColumnTransformer(
    transformers=[('cat', encoder, categorical_cols)],
    remainder='passthrough',
)

# Preprocessing and the classifier are bundled so they are fitted and
# applied together.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', RandomForestClassifier(random_state=42)),
    ]
)

pipeline.fit(X_train, y_train)


# Persist the complete fitted pipeline (preprocessing + classifier).
joblib.dump(pipeline, 'financial_inclusion_model.pkl')

# ColumnTransformer fits *clones* of the transformers it is given, so the
# `encoder` object created earlier is still unfitted after pipeline.fit().
# Pull the fitted encoder out of the pipeline so that encoder.pkl is usable
# on load and matches the categories the saved model was trained on.
encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
joblib.dump(encoder, 'encoder.pkl')
joblib.dump(categorical_cols, 'categorical_columns.pkl')

# Column layout the model expects: one-hot columns first, followed by the
# passthrough columns. The names come from the encoder fitted on the training
# split, not from a refit on the full data, so they cannot drift from the
# model's real inputs.
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)
expected_columns = list(encoded_feature_names) + numerical_cols
joblib.dump(expected_columns, 'expected_columns.pkl')


['expected_columns.pkl']