In [1]:
# Import libraries
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
import time


In [2]:
# Read data 
filename = 'adult.csv'
df = pd.read_csv(filename)
df.head(5)

Unnamed: 0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
0,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
1,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
2,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
3,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
4,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K


In [3]:
print(df.shape)

(32560, 15)


In [4]:
names = list(df.columns) # get column names
print(names) 

['39', ' State-gov', ' 77516', ' Bachelors', ' 13', ' Never-married', ' Adm-clerical', ' Not-in-family', ' White', ' Male', ' 2174', ' 0', ' 40', ' United-States', ' <=50K']


In [5]:
types = df.dtypes # get column types
print(types)

39                 int64
 State-gov        object
 77516             int64
 Bachelors        object
 13                int64
 Never-married    object
 Adm-clerical     object
 Not-in-family    object
 White            object
 Male             object
 2174              int64
 0                 int64
 40                int64
 United-States    object
 <=50K            object
dtype: object


In [23]:
# Data preprocessing: use label encoder to convert categorical data to numerical data
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()
a = ['Sucache','Tuspro','Sony','BooKooMe']
le.fit_transform(a)
b = le.transform(a)
print(b)

[2 3 1 0]


In [7]:
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()
for i in range(len(types)):
  if types[i]=='object':
    le.fit_transform(df[names[i]])
    df[names[i]] = le.transform(df[names[i]])

In [8]:
# convert to numpy
data = df.values
print(data[:2])
X = data[:,:-1]
y = data[:,-1]
print(X.shape)
print(y.shape)

[[    50      6  83311      9     13      2      4      0      4      1
       0      0     13     39      0]
 [    38      4 215646     11      9      0      6      1      4      1
       0      0     40     39      0]]
(32560, 14)
(32560,)


In [9]:
# Scale data into [0,1]
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)


In [10]:
# split into train and test data sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42,
                                                    test_size=0.25)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(24420, 14)
(24420,)
(8140, 14)
(8140,)


K-Neighbors-Classifier

In [11]:
from sklearn.neighbors import KNeighborsClassifier
kNN = KNeighborsClassifier(n_neighbors=3)
kNN.fit(X_train,y_train)


KNeighborsClassifier(n_neighbors=3)

In [12]:
y_pred = kNN.predict(X_test)
print(confusion_matrix(y_test,y_pred)) # Confusion matrix is a table that is often used to describe the performance of a classification model (or "classifier") on a set of test data for which the true values are known.
print(classification_report(y_test,y_pred))


[[5491  666]
 [ 881 1102]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.88      6157
           1       0.62      0.56      0.59      1983

    accuracy                           0.81      8140
   macro avg       0.74      0.72      0.73      8140
weighted avg       0.80      0.81      0.81      8140



In [13]:
# Traing time
start = time.time()
kNN.fit(X_train,y_train)
end = time.time()
print("Training time: ", end-start)

Training time:  0.2234034538269043


In [14]:
# Testing time
start = time.time()
y_pred = kNN.predict(X_test)
end = time.time()
print("Testing time: ", end-start)

Testing time:  1.9398119449615479


Naive Bayes Classification

In [15]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)

GaussianNB()

In [16]:
y_pred = gnb.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[5887  270]
 [1286  697]]
              precision    recall  f1-score   support

           0       0.82      0.96      0.88      6157
           1       0.72      0.35      0.47      1983

    accuracy                           0.81      8140
   macro avg       0.77      0.65      0.68      8140
weighted avg       0.80      0.81      0.78      8140



In [17]:
# Traing time
start = time.time()
gnb.fit(X_train,y_train)
end = time.time()
print("Training time: ", end-start)

Training time:  0.016956090927124023


In [18]:
# Testing time
start = time.time()
y_pred = gnb.predict(X_test)
end = time.time()
print("Testing time: ", end-start)

Testing time:  0.005982875823974609


Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)


DecisionTreeClassifier()

In [20]:
y_pred = dt.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[5348  809]
 [ 764 1219]]
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      6157
           1       0.60      0.61      0.61      1983

    accuracy                           0.81      8140
   macro avg       0.74      0.74      0.74      8140
weighted avg       0.81      0.81      0.81      8140



In [21]:
# Traing time
start = time.time()
dt.fit(X_train,y_train)
end = time.time()
print("Training time: ", end-start)

Training time:  0.16456270217895508


In [22]:
# Testing time 
start = time.time()
y_pred = dt.predict(X_test)
end = time.time()
print("Testing time: ", end-start)

Testing time:  0.003026723861694336
