In [82]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [83]:
from sklearn.model_selection import GridSearchCV


# Load the pre-split feature tables and label tables from the working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# Optional SMOTE oversampling of the training set. It is switched OFF here, so
# the search below is fitted on the original training data; flip USE_SMOTE to
# True to resample instead of editing commented-out code.
USE_SMOTE = False
smote = SMOTE()
if USE_SMOTE:
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
else:
    X_train_smote, y_train_smote = X_train, y_train

# Names of the categorical feature columns.
# NOTE(review): this list is defined but never handed to CatBoost in this cell,
# so the model uses every column exactly as stored in the CSV -- confirm that
# this is intended before relying on it.
cat_features = ['workclass','education','marital-status','occupation','relationship','race','sex','native-country']

# Base CatBoost estimator. n_estimators is overridden by the grid below;
# verbose=100 logs every 100th boosting iteration.
model = CatBoostClassifier(n_estimators=1000, verbose=100)

# Number of folds used for cross-validation inside the grid search.
k = 5

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidate configurations.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 5, 7]
}

# Exhaustive search scored by accuracy, using all available cores.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=k, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

# Winning configuration, refitted on the full training data by GridSearchCV.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)
print("Best Accuracy:", grid_search.best_score_)

# Accuracy of the refitted best model on the held-out test set.
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)

# Per-fold accuracies of the winning configuration. The grid search already
# evaluated it on these k folds, so the scores are read from cv_results_
# instead of refitting the model k more times with cross_val_score.
best_row = grid_search.best_index_
scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][best_row]
    for fold in range(k)
])

# Average accuracy across all folds (equals grid_search.best_score_).
print("Average Accuracy:", np.mean(scores))

# Closing summary: reuse the test accuracy computed above rather than
# scoring the same model on the same data a second time.
print("Accuracy:", accuracy)


0:	learn: 0.6184752	total: 8.61ms	remaining: 4.3s
100:	learn: 0.3020846	total: 637ms	remaining: 2.52s
200:	learn: 0.2842442	total: 1.16s	remaining: 1.72s
300:	learn: 0.2755400	total: 1.69s	remaining: 1.12s
400:	learn: 0.2678057	total: 2.28s	remaining: 562ms
499:	learn: 0.2617539	total: 2.85s	remaining: 0us
Best Parameters: {'depth': 3, 'learning_rate': 0.1, 'n_estimators': 500}
Best Accuracy: 0.862854609929078
Accuracy: 0.8649431230610134
Average Accuracy: 0.862854609929078
Accuracy: 0.8649431230610134


In [84]:
from sklearn.metrics import classification_report

# Predict a label for every row of the held-out test features with the tuned model.
y_pred = best_model.predict(X_test)

# Build the text report (per-class precision / recall / F1 / support) and show it.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.89      0.94      0.91      3717
           1       0.76      0.61      0.68      1118

    accuracy                           0.86      4835
   macro avg       0.82      0.78      0.80      4835
weighted avg       0.86      0.86      0.86      4835



In [85]:
# Importance score reported by the tuned model for each input feature.
feature_weights = best_model.feature_importances_

# Column names of the training features.
# NOTE(review): pairing these with the importances assumes the model was
# fitted on X_train's columns in this order -- confirm against the training cell.
feature_names = X_train.columns

# One row per feature, ordered from most to least important.
weights_df = (
    pd.DataFrame({'Feature': feature_names, 'Weight': feature_weights})
    .sort_values(by='Weight', ascending=False)
)

# Display the sorted feature weights
print(weights_df)


            Feature     Weight
5    marital-status  17.700289
14  capital-netGain  16.085229
0               age  11.946190
7      relationship  11.577715
12   hours-per-week   7.492484
10     capital-gain   7.364648
6        occupation   6.308861
4     education-num   5.815287
3         education   5.058467
2            fnlwgt   3.456635
11     capital-loss   3.185696
1         workclass   1.466416
9               sex   1.219854
8              race   0.761758
13   native-country   0.560470


In [86]:
# Row-wise check: flag the training rows in which "capital-gain" and
# "capital-loss" are both exactly zero.
no_gain = X_train['capital-gain'].eq(0)
no_loss = X_train['capital-loss'].eq(0)
all_zero = no_gain & no_loss

# Count how many rows are flagged True versus False.
print(all_zero.value_counts())


True     9911
False    1369
Name: count, dtype: int64
