In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.model_selection import LeaveOneOut

In [2]:
# install category_encoders
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.1-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m81.9/81.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.1


In [10]:
#import category_encoders package to handle encoding of categorical variables as it provides different approaches of transforming categorical features into numerical ones.
import category_encoders as ce

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
#Loading the dataset
df = pd.read_csv('/content/drive/MyDrive/ene_anyogo/CensusDB.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,Female,0,3900,40,United-States,<=50K


In [4]:
#Get more information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   sex             32561 non-null  object
 9   capital-gain    32561 non-null  int64 
 10  capital-loss    32561 non-null  int64 
 11  hours-per-week  32561 non-null  int64 
 12  native-country  32561 non-null  object
 13  income          32561 non-null  object
dtypes: int64(6), object(8)
memory usage: 3.5+ MB


In [5]:
#Check for missing value
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

Result shows that there are no missing values. However when we explored with df.head, the result shows that there are special symbols used to indicate that the information is still missings or not available. The question marks are depicted as missing values and they would now be converted to the appropriate missing values (NaN).

In [13]:
#replacing "?" with NaN
df.replace('?', np.nan, inplace=True)

In [14]:
#Check again to identify missing values
df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64

In [15]:
#checking to identify missing values
columns_with_missing_values = df.columns[df.isnull().any()].tolist()

In [16]:
if not columns_with_missing_values:
    print("No columns with missing values found.")
else:
    # Perform mean imputation for numerical columns
    imputer = SimpleImputer(strategy='most_frequent')
    df[columns_with_missing_values] = imputer.fit_transform(df[columns_with_missing_values])

In [17]:
#Confirm if missing values have been filled
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64

In [18]:
#Preview the dataframe again
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,Female,0,4356,18,United-States,<=50K
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,Female,0,3900,40,United-States,<=50K


Note that the income varaible is categorical and will not be suitable for training the model. We will need to convert the labels to ordinal numbers.

In [87]:
#determine unique labels of income
len(df['income'].unique())

2

In [88]:
#Get each label count in the 'income' feature
df.income.value_counts().to_dict()

{'<=50K': 24720, '>50K': 7841}

In [19]:
#Engineer income feature by ordinal number replacement
income_map = {'<=50K': 1,
              '>50K': 2
}
df['income_ordinal'] = df.income.map(income_map)
df.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,income,income_ordinal
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,Female,0,4356,40,United-States,<=50K,1
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,Female,0,4356,18,United-States,<=50K,1
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Female,0,4356,40,United-States,<=50K,1
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,Female,0,3900,40,United-States,<=50K,1
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,Female,0,3900,40,United-States,<=50K,1
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,Female,0,3770,45,United-States,<=50K,1
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,Male,0,3770,40,United-States,<=50K,1
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,Female,0,3683,20,United-States,>50K,2
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,Female,0,3683,40,United-States,<=50K,1
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,Male,0,3004,60,United-States,>50K,2


In [21]:
#Drop income feature
df.drop(['income'], axis = 1, inplace=True)

KeyError: ignored

In [22]:
#View dataframe to see that the income feature has been dropped
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,sex,capital-gain,capital-loss,hours-per-week,native-country,income_ordinal
0,90,Private,77053,HS-grad,9,Widowed,Prof-specialty,Not-in-family,Female,0,4356,40,United-States,1
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,Female,0,4356,18,United-States,1
2,66,Private,186061,Some-college,10,Widowed,Prof-specialty,Unmarried,Female,0,4356,40,United-States,1
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,Female,0,3900,40,United-States,1
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,Female,0,3900,40,United-States,1


In [23]:
# Determine unique labels in each categorical features
def explore_unique_labels(df, columns):
    for column in columns:
        unique_labels = df[column].unique()
        unique_labels_num = len(unique_labels)
        print(f"Unique labels in '{column}': {unique_labels} (Total: {unique_labels_num})")

# Call function to explore unique labels
features_categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'sex', 'native-country']
explore_unique_labels(df, features_categorical)

Unique labels in 'workclass': ['Private' 'State-gov' 'Federal-gov' 'Self-emp-not-inc' 'Self-emp-inc'
 'Local-gov' 'Without-pay' 'Never-worked'] (Total: 8)
Unique labels in 'education': ['HS-grad' 'Some-college' '7th-8th' '10th' 'Doctorate' 'Prof-school'
 'Bachelors' 'Masters' '11th' 'Assoc-acdm' 'Assoc-voc' '1st-4th' '5th-6th'
 '12th' '9th' 'Preschool'] (Total: 16)
Unique labels in 'marital-status': ['Widowed' 'Divorced' 'Separated' 'Never-married' 'Married-civ-spouse'
 'Married-spouse-absent' 'Married-AF-spouse'] (Total: 7)
Unique labels in 'occupation': ['Prof-specialty' 'Exec-managerial' 'Machine-op-inspct' 'Other-service'
 'Adm-clerical' 'Craft-repair' 'Transport-moving' 'Handlers-cleaners'
 'Sales' 'Farming-fishing' 'Tech-support' 'Protective-serv' 'Armed-Forces'
 'Priv-house-serv'] (Total: 14)
Unique labels in 'relationship': ['Not-in-family' 'Unmarried' 'Own-child' 'Other-relative' 'Husband' 'Wife'] (Total: 6)
Unique labels in 'sex': ['Female' 'Male'] (Total: 2)
Unique labels in

In [25]:
# Since we have categorical features with large number of unique labels, we will combine a couple of encoding methods in other to balance performance and reduce dimensionality

def features_categorical_encoded(DataFrame):
    # Columns that require One-Hot encoding
    features_ohe = ['sex', 'relationship']

    # Column that require Binary encoding
    features_binary = ['native-country']

    # Frequency Encoding
    features_frequency = ['workclass', 'education', 'marital-status', 'occupation']

    # Perform One-Hot Encoding
    df_encoded = pd.get_dummies(DataFrame, columns=features_ohe)

    # Frequency Encoding
    for feature in features_frequency:
        frequency_encoding = df[feature].value_counts(normalize=True).to_dict()
        df_encoded[feature] = df_encoded[feature].map(frequency_encoding)

    # Perform Binary Encoding
    encoder = ce.BinaryEncoder(cols=features_binary)
    df_encoded = encoder.fit_transform(df_encoded)

    return df_encoded

# Call function to encode the desired categorical features
df_encoded = features_categorical_encoded(df)
df_encoded.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,capital-gain,capital-loss,hours-per-week,...,native-country_5,income_ordinal,sex_Female,sex_Male,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
0,90,0.753417,77053,0.322502,9,0.030497,0.183747,0,4356,40,...,1,1,1,0,0,1,0,0,0,0
1,82,0.753417,132870,0.322502,9,0.030497,0.124873,0,4356,18,...,1,1,1,0,0,1,0,0,0,0
2,66,0.753417,186061,0.223918,10,0.030497,0.183747,0,4356,40,...,1,1,1,0,0,0,0,0,1,0
3,54,0.753417,140359,0.01984,4,0.136452,0.061485,0,3900,40,...,1,1,1,0,0,0,0,0,1,0
4,41,0.753417,264663,0.223918,10,0.031479,0.183747,0,3900,40,...,1,1,1,0,0,0,0,1,0,0


In [26]:
df_encoded.isnull().sum()

age                            0
workclass                      0
fnlwgt                         0
education                      0
education-num                  0
marital-status                 0
occupation                     0
capital-gain                   0
capital-loss                   0
hours-per-week                 0
native-country_0               0
native-country_1               0
native-country_2               0
native-country_3               0
native-country_4               0
native-country_5               0
income_ordinal                 0
sex_Female                     0
sex_Male                       0
relationship_Husband           0
relationship_Not-in-family     0
relationship_Other-relative    0
relationship_Own-child         0
relationship_Unmarried         0
relationship_Wife              0
dtype: int64

**(c) Investigate and train at least 5 ML models including Classification (to predict if an individual going to earn more $50,000 annually or not), Clustering and Neural Networks. You are free to choose any ML algorithms.**

In [27]:
# Let's calculate the correlation coefficients
correlation_matrix = df_encoded.corr()
correlation_matrix

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,capital-gain,capital-loss,hours-per-week,...,native-country_5,income_ordinal,sex_Female,sex_Male,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife
age,1.0,-0.187817,-0.076646,-0.050673,0.036527,-0.017063,0.076155,0.077674,0.057775,0.068756,...,0.021781,0.234037,-0.088832,0.088832,0.316875,-0.010106,-0.070002,-0.432918,0.043188,0.020854
workclass,-0.187817,1.0,0.043933,0.056537,-0.169937,-0.065083,-0.043853,-0.050078,-0.037901,-0.112159,...,-0.048655,-0.127538,0.071115,-0.071115,-0.126785,0.030274,0.040993,0.114964,0.012676,-0.016524
fnlwgt,-0.076646,0.043933,1.0,-0.011552,-0.043195,-0.007156,-0.019137,0.000432,-0.010252,-0.018768,...,-0.089858,-0.009463,-0.026858,0.026858,-0.02114,0.007503,0.025518,0.013818,0.0044,-0.016897
education,-0.050673,0.056537,-0.011552,1.0,-0.110061,-0.034691,-0.155822,-0.062959,-0.032917,-0.003266,...,0.101528,-0.11526,0.020067,-0.020067,-0.027139,-0.010125,0.008116,0.023704,0.027738,-0.003628
education-num,0.036527,-0.169937,-0.043195,-0.110061,1.0,0.09772,0.33748,0.12263,0.079923,0.148123,...,0.146158,0.335154,-0.01228,0.01228,0.078848,0.05343,-0.088631,-0.099213,-0.058637,0.033427
marital-status,-0.017063,-0.065083,-0.007156,-0.034691,0.09772,1.0,-0.005031,0.060146,0.056699,0.134336,...,-0.007807,0.328731,-0.429173,0.429173,0.704249,-0.457098,-0.073394,-0.095679,-0.45266,0.186796
occupation,0.076155,-0.043853,-0.019137,-0.155822,0.33748,-0.005031,1.0,0.071873,0.040438,-0.039176,...,0.047632,0.150602,0.112916,-0.112916,-0.01964,0.032406,-0.039924,-0.034895,0.002309,0.066703
capital-gain,0.077674,-0.050078,0.000432,-0.062959,0.12263,0.060146,0.071873,1.0,-0.031615,0.078409,...,0.014713,0.223329,-0.04848,0.04848,0.080175,-0.026489,-0.019046,-0.053601,-0.029004,0.01773
capital-loss,0.057775,-0.037901,-0.010252,-0.032917,0.079923,0.056699,0.040438,-0.031615,1.0,0.054256,...,0.016154,0.150526,-0.045567,0.045567,0.075491,-0.017299,-0.015734,-0.050922,-0.039136,0.017138
hours-per-week,0.068756,-0.112159,-0.018768,-0.003266,0.148123,0.134336,-0.039176,0.078409,0.054256,1.0,...,0.008889,0.229689,-0.229309,0.229309,0.246164,0.007498,-0.04898,-0.249254,-0.037182,-0.06514


##Lets train a Classification model, the Logistics Regression##

In [28]:
#Import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [31]:
# Let's define the feature matrix X and zthe target vector y
X = df_encoded.drop(columns=['income_ordinal'])
y = df_encoded['income_ordinal']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Let's train the model
LR = LogisticRegression()
LR.fit(X_train, y_train)

# Let's make predictions on the test set
y_pred_LR = LR.predict(X_test)

# Let's evaluate the model
accuracy_LR = accuracy_score(y_test, y_pred_LR)
print("Logistic Regression Accuracy:", accuracy_LR)

Logistic Regression Accuracy: 0.7985567326884692


##Lets train a Classification model, the Random Forest##

---



In [33]:
# Import dependencies
from sklearn.ensemble import RandomForestClassifier

# Let's train the model
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train, y_train)

# Make predictions on the test set
y_pred_RF = RF.predict(X_test)

# Evaluate the model
accuracy_RF = accuracy_score(y_test, y_pred_RF)
print("Random Forest Accuracy:", accuracy_RF)


Random Forest Accuracy: 0.852141870105942


##Lets train a Classification model, the K-Nearest Neighbors##

In [37]:
# Import dependencies
from sklearn.neighbors import KNeighborsClassifier

# Let's train the model
KNN = KNeighborsClassifier(n_neighbors=3)  # You can choose any value of 'k'
KNN.fit(X_train, y_train)

# Let's make predictions on the test set
y_pred_KNN = KNN.predict(X_test)

# Let's evaluate the model
accuracy_KNN = accuracy_score(y_test, y_pred_KNN)
print("K-Nearest Neighbors Accuracy:", accuracy_KNN)


K-Nearest Neighbors Accuracy: 0.7623215108245048


##Let's train a Classification model, the Neural Network (Multi-Layer Perceptron)##

In [41]:
#Import dependencies
import keras
from keras.models import Sequential
from keras.layers import Dense

# Number of features
n_features = X.shape[1]

Epoch = 20

# Let's define the neural network model
NN_model = Sequential()
NN_model.add(Dense(64, input_dim=n_features, activation='relu'))
NN_model.add(Dense(32, activation='relu'))
NN_model.add(Dense(1, activation='sigmoid'))

# Let's compile the model
NN_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Let's set the number of epochs to 20
selected_num_epochs = 20

# Let's train the model
NN_model.fit(X_train, y_train, epochs=selected_num_epochs, batch_size=32, validation_split=0.1)

# Let's evaluate the model on the test set
_, accuracy_NN = NN_model.evaluate(X_test, y_test)
print("Neural Network Accuracy:", accuracy_NN)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Neural Network Accuracy: 0.7640104293823242


##Lets train a Clustering model, the K-Means Clustering##

In [36]:
# Import dependencies
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Let's create 5 clusters
n_clusters = 5
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X)

# Let's get cluster assignments for data points
cluster_labels = kmeans.labels_

# Cluster centers (centroids)
centroids = kmeans.cluster_centers_

# Let's evaluate the K-means model
inertia = kmeans.inertia_
silhouette_avg = silhouette_score(X, kmeans.labels_)

# Print the evaluation metrics
print("Inertia (Within-Cluster Sum of Squares):", inertia)
print("Silhouette Score:", silhouette_avg)


Inertia (Within-Cluster Sum of Squares): 39074422866483.95
Silhouette Score: 0.5323565387850812
