In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

In [18]:
# Load the raw competition files. The training frame drops its `id` row
# identifier so it is never used as a feature; `drop` returns a new frame
# instead of mutating in place. `test_data` keeps `id` because the
# submission cell needs it.
train_data = pd.read_csv('train.csv').drop(columns=['id'])
test_data = pd.read_csv('test.csv')

# Split features and target by column NAME rather than by position, so the
# split stays correct even if the column order of train.csv changes.
# `y` is the label column (the last column in the preview below).
X = train_data.drop(columns=['y'])
y = train_data['y']

train_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,42,technician,married,secondary,no,7,no,no,cellular,25,aug,117,3,-1,0,unknown,0
1,38,blue-collar,married,secondary,no,514,no,no,unknown,18,jun,185,1,-1,0,unknown,0
2,36,blue-collar,married,secondary,no,602,yes,no,unknown,14,may,111,2,-1,0,unknown,0
3,27,student,single,secondary,no,34,yes,no,unknown,28,may,10,2,-1,0,unknown,0
4,26,technician,married,secondary,no,889,yes,no,cellular,3,feb,902,1,-1,0,unknown,1


In [19]:
# Partition the feature columns by dtype: 64-bit integer/float columns go to
# the numeric branch of the preprocessor, object-dtype columns to the
# categorical branch. Both results are pandas Index objects of column names.
num_cols, cat_cols = (
    X.select_dtypes(include=dtype_names).columns
    for dtype_names in (["int64", "float64"], ["object"])
)

In [23]:
# Numeric branch: replace missing values with the column median, then
# standardise each column.
num_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: replace missing values with the most frequent level,
# then one-hot encode. `handle_unknown="ignore"` lets transform accept
# categories that were not present when the encoder was fitted.
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each group of columns to its branch. No `remainder` is given, so
# the ColumnTransformer default applies to any column not listed in
# `num_cols` or `cat_cols`.
preprocessor = ColumnTransformer([
    ("num", num_transformer, num_cols),
    ("cat", cat_transformer, cat_cols),
])

In [24]:
# End-to-end estimator: preprocessing and the gradient-boosted classifier
# live in one Pipeline, so the imputers/scaler/encoder are fitted only on
# whatever data is passed to `model.fit`. XGBClassifier uses its library
# default hyperparameters.
model = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier()),
])

In [35]:
# Hold out 20% of the labelled rows for validation. `stratify=y` keeps the
# class ratio the same in both parts, and a fixed `random_state` makes the
# split (and therefore the reported score) reproducible across
# Restart & Run All.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Fit preprocessing + classifier on the training part only.
model.fit(X_train, y_train)

# `score` on a classifier pipeline reports mean accuracy on the held-out rows.
print("Validation score:", model.score(X_val, y_val))

Validation score: 0.9344933333333333


In [31]:
# Feed the model exactly the feature columns it was trained on, in the same
# order, so the `id` column of the test file is never passed to the pipeline.
test_features = test_data[X.columns]
pred = model.predict(test_features)

# Pair each test-row id with its predicted class label and write the file.
# NOTE(review): the training label column is named `y` but the submission
# column here is `target`, and hard labels (not probabilities) are written.
# Confirm both against the competition's sample submission.
submission = pd.DataFrame({"id": test_data["id"], "target": pred})
submission.to_csv("submission.csv", index=False)