In [1]:
import pandas as pd
import matplotlib.pyplot as plt  
import numpy as np
from regression import LogReg
from regression import KNN
from ucimlrepo import fetch_ucirepo 

In [2]:
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 

In [3]:
# Cleaned y, replaced odd entries with the correct values
y.replace(['>50K.', '<=50K.'], ['>50K', '<=50K'], inplace=True)
y

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,


Unnamed: 0,income
0,<=50K
1,<=50K
2,<=50K
3,<=50K
4,<=50K
...,...
48837,<=50K
48838,<=50K
48839,<=50K
48840,<=50K


In [4]:
# Converted binary categorical variable to booleans
y.replace(['>50K', '<=50K'], [1,0], inplace=True)
X['sex'].replace(['Male', 'Female'], [1,0], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  method=method,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [5]:
# Combines X and y dataframes to make cleaning both easier
joined = X.join(y)
joined

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,0,0,0,36,United-States,0
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,1,0,0,40,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,1,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,1,5455,0,40,United-States,0


In [6]:
# removes missing rows from X and y
cleaned = joined.dropna()

In [7]:
# removes duplicate rows from X and y
cleaned = cleaned.drop_duplicates(ignore_index=False)
cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,1,0,0,40,United-States,0
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,0,0,0,36,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,1,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,1,5455,0,40,United-States,0


In [8]:
# Found and removed unusual entry '?' in workclass
# cleaned.drop_duplicates('workclass')
cleaned = cleaned[cleaned['workclass'] != '?']

In [9]:
# Found and removed unusual entry '?' in occupation
# cleaned.drop_duplicates('occupation')
cleaned = cleaned[cleaned['occupation'] != '?']

In [10]:
# Found and removed unusual entry '?' in native-country
# cleaned.drop_duplicates('native-country')
cleaned = cleaned[cleaned['native-country'] != '?']

In [11]:
# Looking for unusual int values. None detected.
# cleaned[(cleaned['capital-gain'] < 0)]
# cleaned[(cleaned['capital-loss'] < 0)]
# cleaned[(cleaned['hours-per-week'] < 0)]
# cleaned[(cleaned['age'] < 0)]
cleaned

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,Private,245211,Bachelors,13,Never-married,Prof-specialty,Own-child,White,1,0,0,40,United-States,0
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,0,0,0,36,United-States,0
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,1,0,0,50,United-States,0
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,1,5455,0,40,United-States,0


In [12]:
# One-hot-encoded all categorical vars
joined = pd.get_dummies(joined, columns = ['workclass', 'education', 'marital-status', 'occupation',
                             'relationship', 'race', 'native-country'])
joined

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,income,workclass_?,workclass_Federal-gov,...,native-country_Portugal,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia
0,39,77516,13,1,2174,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,1,0,0,13,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,0,40,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,215419,13,0,0,0,36,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48838,64,321403,9,1,0,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48839,38,374983,13,1,0,0,50,0,0,0,...,0,0,0,0,0,0,0,1,0,0
48840,44,83891,13,1,5455,0,40,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
# Separate X and y
y_cleaned = joined['income']
X_cleaned = joined.drop('income', axis='columns')

In [15]:
# Convert to array
y_arr = np.array(y_cleaned)
X_arr = np.array(X)

In [None]:


lr = LogReg(learning_rate=0.1, num_epochs=100)
losses = lr.fit(X, y)

accuracy_LogReg = lr.evaluate_acc(X, y)
print('log regression Accuracy: ', accuracy_LogReg)

In [None]:
knn = KNN(k=4)
accuracy_k_fold = KNN.k_fold_cross_validation(knn, X, y, k=5)
print("5-Fold Cross-Validation Accuracy:", accuracy_k_fold)


In [None]:
plt.figure()
plt.plot(losses)
#plt.plot(accuracy_k_fold)
plt.grid()