In [87]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

In [88]:
from sklearn.model_selection import GridSearchCV


# Load the pre-split feature matrices and label files from the working directory.
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

# SMOTE oversampling is currently DISABLED: the sampler is instantiated but
# never applied, so the "*_smote" names below are plain aliases of the
# original training data. To enable it, replace the alias assignment with:
#     X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
smote = SMOTE()
X_train_smote, y_train_smote = X_train, y_train

# Names of the categorical columns.
# NOTE(review): this list is not passed to CatBoost anywhere in this cell, so
# the model does not receive it -- presumably these columns are already
# numerically encoded in the CSVs; confirm before wiring it in via
# grid_search.fit(..., cat_features=cat_features).
cat_features = ['workclass', 'education', 'marital-status', 'occupation',
                'relationship', 'race', 'sex', 'native-country']

# Base estimator. n_estimators here is only a starting value: the grid below
# overrides it for every candidate. verbose=100 logs every 100th iteration.
model = CatBoostClassifier(n_estimators=1000, verbose=100)

# Number of cross-validation folds used by the grid search.
k = 5

# Hyper-parameter grid: 3 x 3 x 3 = 27 candidates, each fitted k times.
param_grid = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 1],
    'depth': [3, 5, 7],
}

# Exhaustive search scored by accuracy; with the default refit=True the best
# configuration is refitted on the full training set after the search.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=k,
                           scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_smote, y_train_smote)

# Best refitted model and the parameters that produced it.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)
print("Best Accuracy:", grid_search.best_score_)

# Evaluate the best model on the held-out test data (done once; the original
# cell repeated this identical evaluation and print a second time).
accuracy = best_model.score(X_test, y_test)
print("Accuracy:", accuracy)

# Per-fold accuracies of the winning configuration. The search already
# computed these, so read them from cv_results_ instead of refitting the model
# k more times with cross_val_score on the same folds (which only reproduced
# best_score_).
best_idx = grid_search.best_index_
scores = np.array([
    grid_search.cv_results_[f'split{fold}_test_score'][best_idx]
    for fold in range(k)
])

# Print the average accuracy across all folds
print("Average Accuracy:", np.mean(scores))


0:	learn: 0.6173099	total: 6.87ms	remaining: 3.43s
100:	learn: 0.2803075	total: 636ms	remaining: 2.51s
200:	learn: 0.2549536	total: 1.26s	remaining: 1.87s
300:	learn: 0.2380366	total: 1.92s	remaining: 1.27s
400:	learn: 0.2251270	total: 2.59s	remaining: 640ms
499:	learn: 0.2138871	total: 3.26s	remaining: 0us
Best Parameters: {'depth': 5, 'learning_rate': 0.1, 'n_estimators': 500}
Best Accuracy: 0.8636524822695035
Accuracy: 0.8591520165460186
Average Accuracy: 0.8636524822695035
Accuracy: 0.8591520165460186


In [89]:
from sklearn.metrics import classification_report

# Predict labels for the held-out split with the tuned model, then print the
# per-class precision / recall / F1 / support summary.
y_pred = best_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.88      0.94      0.91      3659
           1       0.76      0.62      0.68      1176

    accuracy                           0.86      4835
   macro avg       0.82      0.78      0.80      4835
weighted avg       0.85      0.86      0.85      4835



In [90]:
# Importance score the tuned model assigned to each input feature
# (the original cell performed this identical assignment twice).
feature_weights = best_model.feature_importances_

# Column names to label the scores with. The model was fitted on
# X_train_smote, which is an alias of X_train, so the column order matches.
feature_names = X_train.columns

# Pair every feature with its weight and rank from most to least important.
weights_df = (
    pd.DataFrame({'Feature': feature_names, 'Weight': feature_weights})
    .sort_values(by='Weight', ascending=False)
)

# Display the sorted feature weights
print(weights_df)


            Feature     Weight
7      relationship  15.402209
0               age  15.168480
12   hours-per-week  10.748609
14  capital-netGain   9.360187
6        occupation   7.720032
5    marital-status   7.590900
2            fnlwgt   6.908964
10     capital-gain   5.869671
3         education   5.501945
4     education-num   4.640261
1         workclass   3.516277
11     capital-loss   2.812472
9               sex   1.792474
13   native-country   1.409531
8              race   1.190043
15          is-gain   0.367946
16          is-loss   0.000000


In [91]:
# Row-wise flag: True for records where "capital-gain" and "capital-loss"
# are both exactly 0, False where at least one of them is non-zero.
all_zero = X_train[['capital-gain', 'capital-loss']].eq(0).all(axis=1)

# Tally how many training rows fall on each side of the flag.
print(all_zero.value_counts())


True     9949
False    1331
Name: count, dtype: int64
