In [69]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# load data
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [70]:
# NOTE: you can use other techniques to impoute the missing data
train_data = train_data.fillna(0)
test_data = test_data.fillna(0)

In [71]:
# drop columns
train_data_drop = train_data.drop(columns=['Name','Sector'])
test_data_drop = test_data.drop(columns=['Name','Sector'])

In [72]:
# data count
train_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,2498
1,961


In [73]:
# data balance
class_zero = train_data_drop.query('Class == 0').sample(n=961)
class_one = train_data_drop.query('Class == 1')

downbalanced_train = pd.concat([class_zero,class_one], axis=0)

In [74]:
# training data
y = downbalanced_train['Class']
X = downbalanced_train.drop(columns = ['Class'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

In [84]:
# NOTE: please try different approaches. Here is just an example: decision tree.
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

xgb_model = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42,
    max_depth=10,
    min_child_weight=2,  # min_samples_leaf equivalent in XGBoost
    gamma=0,  # Regularization term to avoid overfitting; adjust if needed
    subsample=1.0,  # Use the entire dataset per tree
    colsample_bytree=1.0,  # Use all features
    n_estimators=200,
    learning_rate=0.1,  # Default; adjust if necessary
)

rf_model = RandomForestClassifier(random_state=42)

stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', xgb_model),
        ('rf', rf_model)
    ],
    final_estimator=LogisticRegression(),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
)


In [85]:
# train the stacking model
stacking_clf.fit(X_train, y_train)
y_pred = stacking_clf.predict(X_test)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [86]:
print(f"F1 Score: {metrics.f1_score(y_test, y_pred)}")

F1 Score: 0.6361185983827493


In [88]:
# generate predictions for tesing data
y_pred = stacking_clf.predict(test_data_drop)
y_pred = pd.DataFrame({'Class': y_pred})
name = test_data['Name']
output = pd.concat([name,y_pred],axis=1)
output.to_csv('sample_submission.csv',index=False)