In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import pickle

In [2]:
datatset = pd.read_csv("../data/merged_data.csv")
datatset


Unnamed: 0,ID,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,NAME_INCOME_TYPE,NAME_EDUCATION_TYPE,NAME_FAMILY_STATUS,NAME_HOUSING_TYPE,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,FLAG_WORK_PHONE,FLAG_PHONE,FLAG_EMAIL,OCCUPATION_TYPE,CNT_FAM_MEMBERS,APPROVED
0,5008804,M,Y,Y,0,427500.0,Working,Higher education,Civil marriage,Rented apartment,-12005,-4542,1,1,0,0,Other,2.0,1
1,5008806,M,Y,Y,0,112500.0,Working,Secondary / secondary special,Married,House / apartment,-21474,-1134,1,0,0,0,Security staff,2.0,1
2,5008808,F,N,Y,0,270000.0,Commercial associate,Secondary / secondary special,Single / not married,House / apartment,-19110,-3051,1,0,1,1,Sales staff,1.0,1
3,5008815,M,Y,Y,0,270000.0,Working,Higher education,Married,House / apartment,-16872,-769,1,1,1,1,Accountants,2.0,0
4,5008825,F,Y,N,0,130500.0,Working,Incomplete higher,Married,House / apartment,-10669,-1103,1,0,0,0,Accountants,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5918,5148694,F,N,N,0,180000.0,Pensioner,Secondary / secondary special,Civil marriage,Municipal apartment,-20600,-198,1,0,0,0,Laborers,2.0,0
5919,5149055,F,N,Y,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15837,-2694,1,1,1,0,Other,2.0,1
5920,5149729,M,Y,Y,0,90000.0,Working,Secondary / secondary special,Married,House / apartment,-19101,-1721,1,0,0,0,Other,2.0,0
5921,5149838,F,N,Y,0,157500.0,Pensioner,Higher education,Married,House / apartment,-12387,-1325,1,0,1,1,Medicine staff,2.0,1


In [3]:
datatset = datatset.drop("ID", axis=1)


In [4]:
train_set, test_set = train_test_split(
    datatset, test_size=0.25, random_state=42
)


In [5]:
scaler = StandardScaler()
ordinal_encoder = OrdinalEncoder()

In [6]:
onehot_columns = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
    "OCCUPATION_TYPE",
    "FLAG_MOBIL",
    "FLAG_WORK_PHONE",
    "FLAG_PHONE",
    "FLAG_EMAIL",
]

In [7]:
scaler_columns = [
    "AMT_INCOME_TOTAL",
    "DAYS_EMPLOYED",
    "DAYS_BIRTH",
    "CNT_FAM_MEMBERS",
    "CNT_CHILDREN",
]

In [13]:
onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
onehot_encoder.fit(train_set[onehot_columns])



In [14]:
train_set_encoded_first = onehot_encoder.transform(train_set[onehot_columns])
test_set_encoded_first = onehot_encoder.transform(test_set[onehot_columns])

In [15]:
# Get the feature names
feature_names = onehot_encoder.get_feature_names_out()

In [16]:
train_set_encoded_first

array([[1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 1., 1., 0.]])

In [17]:
train_encoded_df = pd.DataFrame(train_set_encoded_first, columns=feature_names)
test_encoded_df = pd.DataFrame(test_set_encoded_first, columns=feature_names)

In [18]:
train_encoded = pd.concat([train_encoded_df, train_set[scaler_columns]], axis=1)
test_encoded = pd.concat([test_encoded_df, test_set[scaler_columns]], axis=1)

In [19]:
train_encoded[scaler_columns] = scaler.fit_transform(
    train_encoded[scaler_columns]
)

In [20]:
test_encoded[scaler_columns] = scaler.fit_transform(
    test_encoded[scaler_columns]
)

In [21]:
train_encoded

Unnamed: 0,CODE_GENDER_F,CODE_GENDER_M,FLAG_OWN_CAR_N,FLAG_OWN_CAR_Y,FLAG_OWN_REALTY_N,FLAG_OWN_REALTY_Y,NAME_INCOME_TYPE_Commercial associate,NAME_INCOME_TYPE_Pensioner,NAME_INCOME_TYPE_State servant,NAME_INCOME_TYPE_Student,...,FLAG_WORK_PHONE_1,FLAG_PHONE_0,FLAG_PHONE_1,FLAG_EMAIL_0,FLAG_EMAIL_1,AMT_INCOME_TOTAL,DAYS_EMPLOYED,DAYS_BIRTH,CNT_FAM_MEMBERS,CNT_CHILDREN
0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,1.0,1.0,0.0,2.349393,-0.911660,0.782358,-0.266453,-0.592063
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,-0.743880,0.584542,-1.886160,-0.266453,-0.592063
2,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.802756,-0.257072,-1.219946,-1.261899,-0.592063
3,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.802756,0.744786,-0.589241,-0.266453,-0.592063
4,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,-0.567121,0.598151,1.158865,-0.266453,-0.592063
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5334,,,,,,,,,,,...,,,,,,1.465601,0.633713,0.085145,-0.266453,-0.592063
5734,,,,,,,,,,,...,,,,,,-0.301984,-1.195710,-0.017155,1.724438,1.756100
5191,,,,,,,,,,,...,,,,,,-0.522932,0.983177,0.665968,1.724438,1.756100
5226,,,,,,,,,,,...,,,,,,-0.743880,0.473907,-0.601078,-1.261899,-0.592063


In [None]:
classifier = RandomForestClassifier(criterion="entropy")
classifier.fit(
    train_encoded.drop(columns=["APPROVED"], axis=1),
    train_encoded[["APPROVED"]],
)

KeyError: "['APPROVED'] not found in axis"

In [None]:
accuracy_score(
    test_encoded[["APPROVED"]],
    classifier.predict(test_encoded[feature_names]),
)

KeyError: "['NAME_INCOME_TYPE_Student', 'NAME_EDUCATION_TYPE_Academic degree'] not in index"

In [None]:
# filename = "../output_model/model.pkl"
# pickle.dump(classifier, open(filename, "wb"))
