<a id="section-zero"></a>
# Adult Census Income

The Adult Census Income data was extracted from the 1994 Census Bureau database (https://www.kaggle.com/datasets/uciml/adult-census-income). The prediction task is to determine whether a person makes over $50K a year.


Table of Contents:

* [Libraries and Initialization](#section-two)
* [Exploratory Data Analysis](#section-three)
* [Imputing, Scaling and Feature Engineering](#section-four)
* [Modeling and Hyperparameter Tuning](#section-five)
* [Conclusion](#section-six)


<a id="section-two"></a>


# Libraries and Initialization

In [None]:
# Numerical / tabular libraries.
import numpy as np 
import pandas as pd 
import os
# Print the path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# Plotting libraries.
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter  
import matplotlib.patches
# scikit-learn preprocessing, model selection, metrics and estimators.
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support 
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from operator import itemgetter
# Gradient-boosting libraries.
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Load the dataset; `data` is the raw frame used by every later cell.
data = pd.read_csv('/kaggle/input/adult-census-income/adult.csv')
# Widen pandas' display limits so wide frames print without truncation.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
import warnings
from warnings import filterwarnings
# Silence all warnings for the rest of the notebook.
# NOTE(review): this also hides deprecation and convergence warnings from
# pandas / scikit-learn, which can mask real problems.
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

<a href="#section-zero">Start of page</a>

<a id="section-three"></a>
# Exploratory Data Analysis

In [None]:
data.head(20)

In [None]:
data.describe()

## Data Distributions

In [None]:
# Percentage of rows at each education.num level.
# The percentage Series is renamed explicitly because the name pandas gives a
# value_counts() result differs between versions, so looking the values up
# under the source column name ('education.num') is not reliable.
data_list11 = (data['education.num'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data11 = data_list11.to_frame()
data11['Educationnum'] = data11.index
sns.set(rc={'figure.figsize':(14,10)})
ax11 = sns.barplot(data = data11, x = 'Educationnum', y = 'percentage', palette = 'magma')
# The percentages are on the vertical axis in this chart, so label y (not x).
ax11.set(xlabel = 'education.num', ylabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax11.containers:
    ax11.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows in each workclass category (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list10 = (data['workclass'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data10 = data_list10.to_frame()
data10['Workclass'] = data10.index
sns.set(rc={'figure.figsize':(6,10)})
ax10 = sns.barplot(data = data10, x = 'percentage', y = 'Workclass', palette = 'magma')
ax10.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax10.containers:
    ax10.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows in each education category (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list9 = (data['education'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data9 = data_list9.to_frame()
data9['Education'] = data9.index
sns.set(rc={'figure.figsize':(6,10)})
ax9 = sns.barplot(data = data9, x = 'percentage', y = 'Education', palette = 'magma')
ax9.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax9.containers:
    ax9.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows in each occupation category (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list8 = (data['occupation'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data8 = data_list8.to_frame()
data8['Occupation'] = data8.index
sns.set(rc={'figure.figsize':(6,10)})
ax8 = sns.barplot(data = data8, x = 'percentage', y = 'Occupation', palette = 'magma')
ax8.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax8.containers:
    ax8.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows in each marital.status category (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list7 = (data['marital.status'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data7 = data_list7.to_frame()
data7['MaritalStatus'] = data7.index
sns.set(rc={'figure.figsize':(6,6)})
ax7 = sns.barplot(data = data7, x = 'percentage', y = 'MaritalStatus', palette = 'magma')
ax7.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax7.containers:
    ax7.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows in each relationship category (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list6 = (data['relationship'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data6 = data_list6.to_frame()
data6['Relationship'] = data6.index
sns.set(rc={'figure.figsize':(6,6)})
ax6 = sns.barplot(data = data6, x = 'percentage', y = 'Relationship', palette = 'magma')
ax6.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax6.containers:
    ax6.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows for each sex (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list5 = (data['sex'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data5 = data_list5.to_frame()
data5['Sex'] = data5.index
sns.set(rc={'figure.figsize':(4,4)})
ax5 = sns.barplot(data = data5, x = 'percentage', y = 'Sex', palette = 'magma')
ax5.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax5.containers:
    ax5.bar_label(container, fmt='%.2f%%')

In [None]:
# Percentage of rows for each native.country value (horizontal bars).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list4 = (data['native.country'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data4 = data_list4.to_frame()
data4['Country'] = data4.index
sns.set(rc={'figure.figsize':(11.7,8.27)})
ax4 = sns.barplot(data = data4, x = 'percentage', y = 'Country', palette = 'magma')
ax4.set(xlabel = 'percentage')
# Three decimals here: many countries account for a tiny share of the rows.
for container in ax4.containers:
    ax4.bar_label(container, fmt='%.3f%%')

In [None]:
# Percentage of rows for each race (horizontal bars, single colour).
# The Series is renamed to 'percentage' so the column name does not depend on
# how the installed pandas version names value_counts() results.
data_list1 = (data['race'].value_counts() / len(data) * 100).sort_values(ascending=False).rename('percentage')
data2 = data_list1.to_frame()
data2['Race'] = data2.index
sns.set(rc={'figure.figsize':(5,5)})
ax3 = sns.barplot(data = data2, x = 'percentage', y = 'Race', color = 'b')
ax3.set(xlabel = 'percentage')
# Label every bar container rather than only the first one.
for container in ax3.containers:
    ax3.bar_label(container, fmt='%.2f%%')

In [None]:
# Age histogram (10 bins) with a kernel-density overlay.
sns.set(rc={'figure.figsize': (6, 6)})
age_ax = sns.histplot(data['age'], kde=True, bins=10)
age_ax.set(title='Age Distribution')
plt.show()

In [None]:
# Capital-loss histogram, 20 bins, no density overlay.
sns.set(rc={'figure.figsize': (5, 8)})
capital_loss_ax = sns.histplot(data['capital.loss'], kde=False, bins=20)
capital_loss_ax.set(title='Capital Loss')
plt.show()

In [None]:
# Capital-gain histogram, 20 bins, no density overlay.
sns.set(rc={'figure.figsize': (5, 8)})
capital_gain_ax = sns.histplot(data['capital.gain'], kde=False, bins=20)
capital_gain_ax.set(title='Capital Gain')
plt.show()

In [None]:
# Weekly working hours histogram, 10 bins, no density overlay.
sns.set(rc={'figure.figsize': (5, 10)})
hours_ax = sns.histplot(data['hours.per.week'], kde=False, bins=10)
hours_ax.set(title='Hours Per Week')
plt.show()

## Income Distributions

In [None]:
# Share of each income class within every race (bars normalised to 1).
sns.set(rc={'figure.figsize': (10, 3)})
race_income_ax = sns.histplot(
    data=data, x="race", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
race_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class within every sex (bars normalised to 1).
sns.set(rc={'figure.figsize': (4, 3)})
sex_income_ax = sns.histplot(
    data=data, x="sex", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
sex_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class within every relationship category.
sns.set(rc={'figure.figsize': (10, 3)})
relationship_income_ax = sns.histplot(
    data=data, x="relationship", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
relationship_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class within every marital-status category.
sns.set(rc={'figure.figsize': (15, 3)})
marital_income_ax = sns.histplot(
    data=data, x="marital.status", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
marital_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class at every education.num level.
sns.set(rc={'figure.figsize': (15, 5)})
education_num_income_ax = sns.histplot(
    data=data, x="education.num", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
education_num_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class within every education category.
sns.set(rc={'figure.figsize': (20, 5)})
education_income_ax = sns.histplot(
    data=data, x="education", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
education_income_ax.set(title='Income Distribution');

In [None]:
# Share of each income class within every workclass category.
sns.set(rc={'figure.figsize': (15, 5)})
workclass_income_ax = sns.histplot(
    data=data, x="workclass", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
workclass_income_ax.set(title='Income Distribution');

In [None]:
# Age histogram split by income class, with density overlays.
sns.set(rc={'figure.figsize': (10, 8)})
age_income_ax = sns.histplot(data=data, x='age', hue='income', kde=True, bins=10)
age_income_ax.set(title='Income Distribution')
plt.show();

In [None]:
# Capital-gain histogram split by income class.
sns.set(rc={'figure.figsize': (10, 8)})
gain_income_ax = sns.histplot(data=data, x='capital.gain', hue='income', kde=False, bins=10)
gain_income_ax.set(title='Income Distribution')
plt.show();

In [None]:
# Share of each income class per native country; countries run down the
# y axis because there are too many to fit horizontally.
sns.set(rc={'figure.figsize': (15, 25)})
country_income_ax = sns.histplot(
    data=data, y="native.country", hue="income",
    multiple="fill", stat="proportion", discrete=True, shrink=.4,
)
country_income_ax.set(title='Income Distribution');

In [None]:
# Capital-loss histogram split by income class.
sns.set(rc={'figure.figsize': (10, 8)})
loss_income_ax = sns.histplot(data=data, x='capital.loss', hue='income', kde=False, bins=10)
loss_income_ax.set(title='Income Distribution')
plt.show();

In [None]:
# Weekly-hours histogram split by income class, with density overlays.
sns.set(rc={'figure.figsize': (10, 10)})
hours_income_ax = sns.histplot(data=data, x='hours.per.week', hue='income', kde=True, bins=10)
hours_income_ax.set(title='Income Distribution')
plt.show();

In [None]:
# Percentage of the whole dataset falling in each (race, sex, income) group.
data12_df = (data[['race', 'sex', 'income']].value_counts() / len(data) * 100).to_frame()
# The single value column is renamed by position, so its original name is irrelevant.
data12_df.rename(columns={data12_df.columns[0]: "percentage"}, inplace=True)
data12_df = data12_df.reset_index()
data12_df = data12_df.sort_values(by=['race', 'sex'])
print(data12_df)
sns.set(rc={'figure.figsize':(6,10)})
# One facet per race with the income classes drawn side by side.
# catplot is used instead of FacetGrid(hue=...).map(sns.barplot): mapping
# barplot once per hue level draws both income bars at the same x position,
# so one bar hides the other.
g = sns.catplot(data=data12_df, kind="bar", x="sex", y="percentage", hue="income",
                col="race", order=["Male", "Female"], height=15, aspect=0.2)
for ax in g.axes.ravel():
    for p in ax.patches:
        bar_height = p.get_height()
        # Skip empty/placeholder patches so no stray "0.00%" labels appear.
        if not bar_height > 0:
            continue
        ax.annotate("%.2f%%" % bar_height, (p.get_x() + p.get_width() / 2., bar_height),
                    ha='center', va='center', fontsize=11, color='black', xytext=(0, 5),
                    textcoords='offset points')

<a href="#section-zero">Start of page</a>

<a id="section-four"></a>
# Imputing, Scaling and Feature Engineering

Cells containing '?' are replaced with NaN so that missing values can be flagged during one-hot encoding.

In [None]:
# Working copy of the raw data without the columns that are not used for
# modelling, with the '?' placeholder converted to NaN.
data_transformed = (
    data
    .drop(columns=['workclass', 'fnlwgt', 'education', 'occupation'])
    .replace('?', np.nan)
)

In [None]:
data_transformed['native.country'].value_counts(dropna = False)

A `native.country_nan` indicator column is created for the missing values.

In [None]:
# One-hot encode the categorical columns; dummy_na=True adds a *_nan
# indicator column per encoded feature.
# dtype=float is requested explicitly: the dummy columns later receive NaN
# and are min-max scaled, neither of which works on boolean dummies.
data_encoded = pd.get_dummies(
    data_transformed,
    columns=['race', 'native.country', 'relationship', 'marital.status'],
    dummy_na=True,
    dtype=float,
)

In [None]:
data_encoded.head(3)

In [None]:
# Only native.country keeps its missing-value indicator; drop the other three.
unused_nan_indicators = ['marital.status_nan', 'relationship_nan', 'race_nan']
data_encoded = data_encoded.drop(columns=unused_nan_indicators)

In [None]:
data_encoded.head(3)

In [None]:
data_encoded['native.country_nan'].value_counts()

In rows where the country is missing, all `native.country` one-hot columns are set to NaN so that the KNN imputer can fill them:

In [None]:
# Rows whose country was missing carry a 1 in the native.country_nan indicator.
missing_country_rows = data_encoded['native.country_nan'] == 1
# Every country dummy column except the indicator itself.
country_dummy_cols = [
    name for name in data_encoded.columns
    if name.startswith('native.country') and name != 'native.country_nan'
]
# Make sure the dummy columns are float before writing NaN into them; bool or
# integer dummies cannot hold NaN without a dtype change.
data_encoded[country_dummy_cols] = data_encoded[country_dummy_cols].astype(float)
# Blank out the (all-zero) country dummies of those rows for later imputation.
data_encoded.loc[missing_country_rows, country_dummy_cols] = np.nan

In [None]:
# Binary sex flag: 1 for 'Female', 0 for anything else.
data_encoded['Sex'] = (data_encoded['sex'] == 'Female').astype('int64')

In [None]:
# The text 'sex' column is replaced by 'Sex'; the missing-country indicator
# is removed as well.
data_encoded = data_encoded.drop(columns=['sex', 'native.country_nan'])

Data is scaled: [0, 1]

In [None]:
# Binary target: 1 for '>50K', 0 for anything else.
data_encoded['income'] = (data_encoded['income'] == '>50K').astype('int64')

In [None]:
# Min-max scale every column to [0, 1].
# Cast to float first: boolean dummy columns cannot be subtracted.
numeric_encoded = data_encoded.astype(float)
column_min = numeric_encoded.min()
# A constant column has zero range; dividing by 1 instead maps it to 0.0
# rather than filling it with NaN.
column_range = (numeric_encoded.max() - column_min).replace(0, 1)
data_normalized = (numeric_encoded - column_min) / column_range

In [None]:
data_normalized.head(3)

sklearn's KNNImputer

In [None]:
# Distance-weighted 5-nearest-neighbour imputer, fitted on the scaled frame.
# NOTE(review): data_normalized still contains the 'income' target and the
# fit happens before any train/test split, so imputed values can depend on
# the target and on rows later used for testing - confirm this is intended.
imputer = KNNImputer(weights='distance', n_neighbors=5)
imputer.fit(X=data_normalized)

In [None]:
# Fill the NaN country dummies. transform() returns a new array, so the
# previous throw-away copy of data_normalized was redundant.
data_imputed = imputer.transform(data_normalized)

In [None]:
# Wrap the imputed values in a DataFrame again, restoring the column names
# and row index of the frame that was imputed. The earlier throw-away copy
# of data_imputed was overwritten immediately and has been removed.
data_transformed2 = pd.DataFrame(
    data_imputed,
    columns=data_normalized.columns,
    index=data_normalized.index,
)

In [None]:
data_transformed2.tail(3)

In [None]:
# Names of all one-hot columns derived from native.country.
is_country_col = data_transformed2.columns.str.startswith('native.country')
country_cols = data_transformed2.columns[is_country_col]

In [None]:
print(data_transformed2['native.country_United-States'].value_counts())

Some values lie between 0 and 1

To round these values, index for maximum for each row is stored:

In [None]:
# For every row, the name of the country column holding the largest value.
argmax_cols = data_transformed2.loc[:, country_cols].idxmax(axis='columns')

The maximum value for each row is filled with 1, the rest are filled with 0.

In [None]:
# Turn the imputed country columns back into a strict one-hot encoding:
# the column named in argmax_cols gets 1, every other country column gets 0.
data_transformed3 = data_transformed2.copy()
winning_country = np.asarray(argmax_cols)   # shape (n_rows,)
country_names = np.asarray(country_cols)    # shape (n_country_cols,)
# Broadcasting compares each row's winner against every column name at once,
# replacing the per-cell Python loop over rows x country columns.
is_winner = winning_country[:, None] == country_names[None, :]
data_transformed3[country_cols] = is_winner.astype(float)

In [None]:
data_transformed3.head(3)

In [None]:
data_transformed3['native.country_United-States'].value_counts()

In [None]:
# All columns that are not native.country dummies, in their original order.
all_columns = data_transformed3.columns
rest_cols = all_columns[~all_columns.isin(country_cols)].tolist()

In [None]:
sns.set(font_scale  = 0.8)

Heatmap without country columns:

In [None]:
data_transformed3[rest_cols]

In [None]:
# Pairwise correlation of the non-country columns, drawn as an annotated heatmap.
corr_data = data_transformed3.loc[:, rest_cols].corr()
f21, ax21 = plt.subplots(figsize=(18, 18))
sns.heatmap(data=corr_data, ax=ax21, annot=True, fmt='.3f', linewidths=.5)
plt.show()

In [None]:
sns.set(font_scale = 0.5)

In [None]:
# Country dummies plus the target, taken as an independent copy in a single
# selection. This avoids assigning a new column onto a slice of
# data_transformed3 and drops the initial copy that was overwritten at once.
data_transformed4 = data_transformed3[[*country_cols, 'income']].copy()
corr_data2 = data_transformed4.corr()
f22,ax22 = plt.subplots(figsize=(18, 18))
sns.heatmap(corr_data2, annot=True, linewidths=.5, fmt= '.2f',ax=ax22)
plt.show()

In [None]:
# Target vector: the 0/1 income column.
y = data_transformed3.loc[:, 'income']

In [None]:
# Feature matrix: every column except the target.
X = data_transformed3.drop(columns='income')

In [None]:
# Feature names, in the column order of X.
features = X.columns

Mutual information index (MI) is calculated for each feature:

In [None]:
# Mutual information between every feature and the target.
# A single call scores all columns. random_state is fixed because the
# estimator adds random noise to continuous features, so unseeded scores
# change from run to run while the rest of the notebook is seeded with 42.
mi_values = mutual_info_classif(X, y, random_state=42)
mi_scores = []
for feature, mi_score in zip(features, mi_values):
    print(feature, mi_score)
    mi_scores.append(mi_score)

In [None]:
# One-column table of MI scores indexed by feature name.
features_df = pd.DataFrame({'mi-index': np.ravel(mi_scores)}, index=features)
features_df.head(40)

In [None]:
# Horizontal bar chart of the MI scores, highest first.
features_sorted = features_df.sort_values(by='mi-index', ascending=False)
sns.set(rc={'figure.figsize': (12, 18)})
ax22 = sns.barplot(data=features_sorted, x='mi-index', y=features_sorted.index)
for bar_group in ax22.containers:
    ax22.bar_label(bar_group)
ax22.set(title='Mutual Information Index')

<a href="#section-zero">Start of page</a>

<a id="section-five"></a>
# Modeling and Hyperparameter Tuning

Train 70%, validation 15%, test 15% split

In [None]:
# First hold out 15% of the rows for testing, then hold out 0.176470588 of
# the remaining 85% (about 15% of all rows) for validation.
X_train_valid, X_test, y_train_valid, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_valid, y_train_valid, test_size=0.176470588, random_state=42
)

In [None]:
# k-nearest-neighbours classifier (k=270), scored on the validation split.
model1 = KNeighborsClassifier(n_neighbors=270)
model1.fit(X=X_train, y=y_train)
y_pred_valid1 = model1.predict(X_valid)
model1_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid1), 6)
print('Accuracy Score (k-nearest neighbors):', model1_accuracy)

In [None]:
# Logistic regression with default settings, scored on the validation split.
model2 = LogisticRegression(random_state=42)
model2.fit(X=X_train, y=y_train)
y_pred_valid2 = model2.predict(X_valid)
model2_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid2), 6)
print('Accuracy Score (Logistic Regression):', model2_accuracy)

In [None]:
# Support vector classifier with a polynomial kernel, scored on validation.
model3 = SVC(kernel='poly', random_state=42)
model3.fit(X=X_train, y=y_train)
y_pred_valid3 = model3.predict(X_valid)
model3_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid3), 6)
print('Accuracy Score (Support Vector Machine):', model3_accuracy)

In [None]:
# Random forest with 2200 trees, scored on the validation split.
model4 = RandomForestClassifier(n_estimators=2200, random_state=42)
model4.fit(X=X_train, y=y_train)
y_pred_valid4 = model4.predict(X_valid)
model4_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid4), 6)
print('Accuracy Score (Random Forest):', model4_accuracy)

In [None]:
# AdaBoost with 25000 estimators and learning rate 0.11, scored on validation.
model5 = AdaBoostClassifier(n_estimators=25000, learning_rate=0.11, random_state=42)
model5.fit(X=X_train, y=y_train)
y_pred_valid5 = model5.predict(X_valid)
model5_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid5), 6)
print('Accuracy Score (Ada Boost):', model5_accuracy)

Parameters obtained by several runs of randomized search:

In [None]:
# Raw search-space coordinates; each one is mapped to an actual XGBoost
# hyperparameter by the expressions below.
i1 = -2.2835
i2 = 10.616
i3 = 1.1289
i4 = 9.292
xgb_params = dict(
    learning_rate=10 ** (i1 / 2),
    max_depth=int(np.round(i2)),
    reg_lambda=10 ** i3,
    n_estimators=int(10 ** (i4 / 4)),
    seed=42,
)
model6 = XGBClassifier(**xgb_params)
model6.fit(X_train, y_train)
y_pred_valid6 = model6.predict(X_valid)
model6_accuracy = np.round(accuracy_score(y_true=y_valid, y_pred=y_pred_valid6), 6)
print('Accuracy Score (XGBoost):', model6_accuracy)

In [None]:
# Raw search-space coordinates; each one is mapped to an actual LightGBM
# hyperparameter by the expressions below.
i1 = -11.250666
i2 = 5.635880
i3 = 2.072460
i4 = 4.009552  # no longer used, see the note on n_estimators below
i5 = 8.380405
i6 = 7.670860
i7 = 8.802374
# LightGBM documents `n_estimators` as an alias of `num_iterations`, so
# passing both with different values (10 ** (i2 / 2) and 10 ** i4) leaves one
# of them silently ignored. Only one boosting-round setting is passed now,
# under the scikit-learn name, using the former `num_iterations` value.
# NOTE(review): this assumes `num_iterations` was the value that took effect;
# confirm against the LightGBM version used for the reported scores.
model7 = LGBMClassifier(learning_rate = 10 ** (i1 / 10),
                          n_estimators = np.int64(10 ** (i2 / 2)),
                          reg_lambda = np.int64(10 ** (i3 / 2)),
                          max_depth = int(np.round(i5)), num_leaves = np.int64(10 ** (i6 / 4)),
                          min_child_samples = np.int64(10 ** (i7 / 4)), verbose = -1, random_state = 42)
model7.fit(X_train, y_train)
y_pred_valid7 = model7.predict(X_valid)
model7_accuracy = np.round(accuracy_score(y_valid, y_pred_valid7), 6)
print('Accuracy Score (LightGBM):', model7_accuracy)

<a href="#section-zero">Start of page</a>

<a id="section-six"></a>
# Conclusion

In [None]:
# Empty results table; each evaluation cell below appends one row to it.
report_df = pd.DataFrame(columns = ['Model', 'Test Accuracy'])

In [None]:
# Test-set accuracy of the k-NN model, appended to the report table.
y_pred_test1 = model1.predict(X_test)
model1_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test1), 5)
print('Accuracy Score (k-nearest neighbors):', model1_test_accuracy)
row1 = ['k-nearest Neighbors', model1_test_accuracy]
report_df.loc[report_df.shape[0]] = row1

In [None]:
# Test-set accuracy of the logistic regression model, appended to the report table.
y_pred_test2 = model2.predict(X_test)
model2_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test2), 5)
print('Accuracy Score (Logistic Regression):', model2_test_accuracy)
row2 = ['Logistic Regression', model2_test_accuracy]
report_df.loc[report_df.shape[0]] = row2

In [None]:
# Test-set accuracy of the support vector machine, appended to the report table.
y_pred_test3 = model3.predict(X_test)
model3_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test3), 5)
print('Accuracy Score (Support Vector Machine):', model3_test_accuracy)
row3 = ['Support Vector Machine', model3_test_accuracy]
report_df.loc[report_df.shape[0]] = row3

In [None]:
# Test-set accuracy of the random forest, appended to the report table.
y_pred_test4 = model4.predict(X_test)
model4_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test4), 5)
print('Accuracy Score (Random Forest):', model4_test_accuracy)
row4 = ['Random Forest', model4_test_accuracy]
report_df.loc[report_df.shape[0]] = row4

In [None]:
# Test-set accuracy of the AdaBoost model, appended to the report table.
y_pred_test5 = model5.predict(X_test)
model5_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test5), 5)
print('Accuracy Score (Ada Boost):', model5_test_accuracy)
row5 = ['Ada Boost', model5_test_accuracy]
report_df.loc[report_df.shape[0]] = row5

In [None]:
# Test-set accuracy of the XGBoost model, appended to the report table.
y_pred_test6 = model6.predict(X_test)
model6_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test6), 5)
print('Accuracy Score (XGBoost):', model6_test_accuracy)
row6 = ['XGBoost', model6_test_accuracy]
report_df.loc[report_df.shape[0]] = row6

In [None]:
# Test-set accuracy of the LightGBM model, appended to the report table.
y_pred_test7 = model7.predict(X_test)
model7_test_accuracy = np.round(accuracy_score(y_true=y_test, y_pred=y_pred_test7), 5)
print('Accuracy Score (LightGBM):', model7_test_accuracy)
row7 = ['Light GBM', model7_test_accuracy]
report_df.loc[report_df.shape[0]] = row7

In [None]:
# Render the accuracy report as a standalone matplotlib table.
sns.set(rc={'figure.figsize':(3,4)})
table_df2 = report_df.copy()
fig2, ax2 = plt.subplots()
# Hide the figure background and the axes so only the table is visible.
fig2.patch.set_visible(False)
ax2.axis('off')
ax2.axis('tight')
colcolours = ['cyan', 'blue']
table2 = ax2.table(cellText = table_df2.values, colLabels = ['Model', 'Accuracy'], loc='center', colColours = colcolours)
table2.set_fontsize(25)
table2.scale(2, 2)
# The former bare `table2.set` expression only looked up an attribute and did
# nothing, so it has been removed together with the commented-out call.
plt.show()

Boosting algorithms have the best test accuracy among all algorithms.

<a href="#section-zero">Start of page</a>

### License

This Notebook has been released under the 
<a href="https://www.apache.org/licenses/LICENSE-2.0">Apache 2.0</a>
open source license.