In [66]:
import pandas as pd
from sklearn.preprocessing import TargetEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE

In [67]:
# Read the two project CSV files (training data first, then test data) into
# DataFrames. Paths are relative to the notebook's working directory.
df_train, df_test = (
    pd.read_csv(csv_name)
    for csv_name in ("final_proj_data.csv", "final_proj_test.csv")
)

In [68]:
# Count missing values per column and drop every column whose count exceeds
# 9999; the remaining columns form the working training frame.
X_train = df_train.drop(
    columns=df_train.isna().sum().loc[lambda nan_count: nan_count > 9999].index
)

In [69]:
# Separate the target column 'y' from the feature frame.
#
# `X_train.pop('y')` mutates X_train in place, so running this cell a second
# time raises KeyError (the column is already gone). Reading the target from
# the untouched `df_train` and dropping the column with errors='ignore' gives
# the same y_train / X_train but keeps the cell safe to re-run.
y_train = df_train['y']
X_train = X_train.drop(columns='y', errors='ignore')

In [70]:
# Restrict the test frame to exactly the feature columns kept in X_train,
# in the same order. A column missing from df_test raises KeyError.
X_test = df_test.loc[:, X_train.columns]

In [71]:
# Split the features into a categorical group (object / category dtypes) and a
# numeric group (float / int dtypes). The grouping is decided once, from the
# dtypes of the training frame.
train_cat_feats = X_train.select_dtypes(include=['object', 'category'])
train_num_feats = X_train.select_dtypes(include=['float', 'int'])

# Apply the *training* column lists to the test frame instead of re-inferring
# dtypes there. If a column is parsed with a different dtype in the test CSV
# (e.g. a text column that is entirely missing is read as float), dtype-based
# selection would put it in the other group and the train/test matrices built
# later would no longer line up column-for-column.
test_cat_feats = X_test[train_cat_feats.columns]
test_num_feats = X_test[train_num_feats.columns]

In [72]:
# Fill missing values: most frequent value for categorical columns, column
# mean for numeric columns.
cat_imp = SimpleImputer(strategy='most_frequent')
num_imp = SimpleImputer(strategy='mean')

# Learn the fill values from the training data only.
train_cat_imputed = cat_imp.fit_transform(train_cat_feats)
train_num_imputed = num_imp.fit_transform(train_num_feats)

# Apply the fill values learned on the training data to the test data.
# Calling fit_transform here would re-fit both imputers on the test set, so the
# test rows would be filled with test-set statistics (inconsistent with how the
# model's training rows were prepared) and the fitted state from the training
# data would be overwritten.
test_cat_imputed = cat_imp.transform(test_cat_feats)
test_num_imputed = num_imp.transform(test_num_feats)

In [73]:
# Target-encode the imputed categorical columns.
enc = TargetEncoder(random_state=42)

# Training matrix: fitted and transformed in a single fit_transform call with
# the target. Per the scikit-learn documentation, TargetEncoder.fit_transform
# uses an internal cross-fitting scheme, so it is deliberately NOT written as
# fit(...).transform(...).
X_train_cat_encoded = enc.fit_transform(X=train_cat_imputed, y=y_train)

# Test matrix: only transformed with the already-fitted encoder.
X_test_cat_encoded = enc.transform(X=test_cat_imputed)

In [74]:
# Build the final train / test feature frames by placing the encoded
# categorical block and the imputed numeric block side by side (categorical
# first). Each block is wrapped in its own DataFrame before concatenation.
# NOTE(review): assuming both blocks are plain arrays, each gets default
# positional column labels, so labels repeat across the two blocks.
X_train_final, X_test_final = (
    pd.concat([pd.DataFrame(block) for block in blocks], axis="columns")
    for blocks in (
        (X_train_cat_encoded, train_num_imputed),
        (X_test_cat_encoded, test_num_imputed),
    )
)

In [75]:
# Oversample the training set with SMOTE (seeded for reproducibility).
# Both the feature frame and the target are replaced by their resampled
# versions; the test set is not touched.
sm = SMOTE(random_state=42)
X_train_final, y_train = sm.fit_resample(X=X_train_final, y=y_train)

In [76]:
# Convert every column label of both feature frames to a string.
X_train_final.columns = [str(label) for label in X_train_final.columns]
X_test_final.columns = [str(label) for label in X_test_final.columns]

In [77]:
# Standardise the features. The scaler is configured to return pandas
# DataFrames from its transform methods.
scaler = StandardScaler()
scaler.set_output(transform='pandas')

# Scaling parameters are learned from the training frame only and then
# applied unchanged to the test frame.
X_train_scaled = scaler.fit_transform(X=X_train_final)
X_test_scaled = scaler.transform(X=X_test_final)

In [78]:
# Gradient-boosting classifier: 91 trees of depth 3, seeded for
# reproducibility.
gb = GradientBoostingClassifier(
    n_estimators=91,
    max_depth=3,
    random_state=42,
)

# Fit on the scaled training data, then predict labels for the scaled test set.
gb.fit(X=X_train_scaled, y=y_train)
y_pred = gb.predict(X=X_test_scaled)

In [79]:
# Collect the predictions into a two-column frame: a running row number
# ('index', starting at 0) and the predicted label ('y').
res = pd.DataFrame(
    dict(
        index=range(len(y_pred)),
        y=y_pred,
    )
)

In [80]:
# Write the predictions to results.csv in the working directory, without the
# DataFrame's own index (the frame already carries an explicit 'index' column).
res.to_csv(path_or_buf='results.csv', index=False)