In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report


### Dataset Source
Becker, B. & Kohavi, R. (1996). Adult [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C5XW20.

### Code inspired by Kylie Ying from freeCodeCamp.org
Ying, K. (2022, September 26). Machine Learning for Everybody - Full Course. YouTube. https://www.youtube.com/watch?v=i_LwzRVP7bg

In [33]:
# Column names for adult.data (the file has no header row); "income" is the
# target and is deliberately kept as the last column.
cols = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "income",
]
# Fields are separated by a comma followed by whitespace. The separator is a
# regex, so it is written as a raw string (a plain "\s" is an invalid escape
# sequence) and needs the python parsing engine.
df = pd.read_csv("adult.data", names=cols, sep=r",\s+", engine="python")

### Cleaning the data and making it usable

In [None]:
# Work on an explicit copy of the filtered rows so the column assignments
# below do not write into a slice of the raw frame (SettingWithCopyWarning),
# and only rebind `df` once every step has succeeded.
# NOTE(review): only "workclass" is filtered for "?"; if other columns also use
# "?" as a missing-value marker it is encoded as its own category - confirm.
cleaned = df[df["workclass"] != "?"].copy()

# Binary columns become 0/1 flags.
cleaned["income"] = (cleaned["income"] == ">50K").astype(int)
cleaned["sex"] = (cleaned["sex"] == "Male").astype(int)

# Remaining string columns are replaced by their integer category codes.
categorical_cols = [
    "workclass", "marital-status", "occupation",
    "relationship", "race", "native-country",
]
for col in categorical_cols:
    cleaned[col] = cleaned[col].astype("category").cat.codes

# The "education" column is dropped; "education-num" stays in the frame.
df = cleaned.drop(columns=["education"])

### Preprocessing

In [None]:
# Shuffle once with a fixed seed so the 60/20/20 split (and every metric
# computed from it) is reproducible, then slice positionally instead of
# passing a DataFrame through np.split.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(0.6 * len(shuffled))
val_end = int(0.8 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

In [36]:
def scale_dataset(dataframe, oversample=False, scaler=None, random_state=None):
  """Split a frame into features/target, standardise the features and
  optionally balance the classes.

  The last column of ``dataframe`` is used as the target; every other column
  is used as a feature.

  Parameters
  ----------
  dataframe : pandas.DataFrame
      Frame whose last column is the label.
  oversample : bool, default False
      If True, resample with ``RandomOverSampler`` after scaling.
  scaler : object with a ``transform`` method, optional
      An already-fitted scaler. When given, it is only applied (not re-fitted),
      which lets validation/test data be scaled with the statistics learned on
      the training data. When omitted, a new ``StandardScaler`` is fitted on
      ``dataframe`` itself.
  random_state : int or None, default None
      Seed forwarded to ``RandomOverSampler``.

  Returns
  -------
  data : numpy.ndarray
      Scaled features with the target appended as the last column.
  X : numpy.ndarray
      Scaled (and possibly oversampled) features.
  y : numpy.ndarray
      Target values aligned with ``X``.
  """
  X = dataframe[dataframe.columns[:-1]].values
  y = dataframe[dataframe.columns[-1]].values

  if scaler is None:
    # No scaler supplied: fit one on this frame.
    X = StandardScaler().fit_transform(X)
  else:
    # Reuse the caller's fitted scaler without touching its statistics.
    X = scaler.transform(X)

  if oversample:
    ros = RandomOverSampler(random_state=random_state)
    X, y = ros.fit_resample(X, y)

  # Re-attach the target as a trailing column.
  data = np.hstack((X, np.reshape(y, (-1, 1))))

  return data, X, y

In [None]:
# Only the training split is oversampled; validation and test rely on the
# default oversample=False.
# NOTE: train/val/test are rebound from DataFrames to NumPy arrays here, so
# this cell cannot be re-run without re-running the split cell first
# (scale_dataset reads `.columns` from its input).
train, X_train, y_train = scale_dataset(train, oversample=True)
val, X_val, y_val = scale_dataset(val)
test, X_test, y_test = scale_dataset(test)

### K Nearest Neighbors (KNN)

In [38]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Nearest-neighbour classifier that looks at a single neighbour, fitted on
# the oversampled training arrays.
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X=X_train, y=y_train)

In [40]:
# Predicted labels for the test features (KNN).
y_pred = knn_model.predict(X=X_test)

In [41]:
# Per-class precision / recall / F1 for the KNN predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.86      4654
           1       0.58      0.59      0.58      1491

    accuracy                           0.80      6145
   macro avg       0.72      0.73      0.72      6145
weighted avg       0.80      0.80      0.80      6145



### Naive Bayes

In [42]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian naive Bayes with default settings, fitted on the training arrays.
nb_model = GaussianNB()
nb_model.fit(X=X_train, y=y_train)

In [44]:
# Predicted labels for the test features (naive Bayes).
y_pred = nb_model.predict(X=X_test)

In [45]:
# Per-class precision / recall / F1 for the naive Bayes predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.83      0.93      0.88      4654
           1       0.67      0.42      0.52      1491

    accuracy                           0.81      6145
   macro avg       0.75      0.68      0.70      6145
weighted avg       0.79      0.81      0.79      6145



### Logistic Regression

In [46]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression with default settings, fitted on the training arrays.
lgr_model = LogisticRegression()
lgr_model.fit(X=X_train, y=y_train)

In [48]:
# Score the logistic regression model on the test split and print the
# per-class report.
y_pred = lgr_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.90      0.77      0.83      4654
           1       0.51      0.75      0.60      1491

    accuracy                           0.76      6145
   macro avg       0.71      0.76      0.72      6145
weighted avg       0.81      0.76      0.78      6145



### Support Vector Machines (SVM)

In [49]:
from sklearn.svm import SVC

In [None]:
# Support vector classifier with default settings, fitted on the training
# arrays.
svm_model = SVC()
svm_model.fit(X=X_train, y=y_train)

In [51]:
# Score the SVM on the test split and print the per-class report.
y_pred = svm_model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.95      0.76      0.85      4654
           1       0.54      0.87      0.67      1491

    accuracy                           0.79      6145
   macro avg       0.74      0.81      0.76      6145
weighted avg       0.85      0.79      0.80      6145



### Neural Network

In [52]:
import tensorflow as tf

In [None]:
# Derive the input width from the training matrix instead of hard-coding it,
# so the network keeps working if the feature set changes.
n_features = X_train.shape[1]

# Two hidden ReLU layers followed by a single sigmoid unit that outputs the
# probability of the positive class.
nn_model = tf.keras.Sequential([
    tf.keras.Input(shape=(n_features,)),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(32, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='binary_crossentropy',
                 metrics=['accuracy'])

In [54]:
# Validate on the dedicated validation split rather than carving a fraction
# out of X_train: X_train has been oversampled, so a slice of it can contain
# duplicates of rows the model is trained on and would not reflect the real
# class balance.
history = nn_model.fit(
    X_train, y_train, epochs=100, batch_size=32,
    validation_data=(X_val, y_val), verbose=0
)

In [55]:
# Loss and accuracy of the trained network on the test split.
nn_model.evaluate(x=X_test, y=y_test)

[1m193/193[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8075 - loss: 0.4237


[0.4174982011318207, 0.815947949886322]