# Import necessary libraries

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

#sklearn apis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics



# Read in training and test data

In [4]:
train = pd.read_csv("data/train_final.csv")
test = pd.read_csv("data/test_final.csv")

Rename columns to my own preference

In [5]:
train.rename(columns={'capital.gain': 'capital gain', 'capital.loss': 'capital loss', 
'native.country': 'country','hours.per.week': 'hours per week','marital.status': 'marital',
'education.num': 'education num', 'fnlwgt': 'final weight', 'income>50K': 'income'}, inplace=True)
train.columns

Index(['age', 'workclass', 'final weight', 'education', 'education num',
       'marital', 'occupation', 'relationship', 'race', 'sex', 'capital gain',
       'capital loss', 'hours per week', 'country', 'income'],
      dtype='object')

In [6]:
test.rename(columns={'capital.gain': 'capital gain', 'capital.loss': 'capital loss', 
'native.country': 'country','hours.per.week': 'hours per week','marital.status': 'marital',
'education.num': 'education num', 'fnlwgt': 'final weight'}, inplace=True)
test.columns

Index(['ID', 'age', 'workclass', 'final weight', 'education', 'education num',
       'marital', 'occupation', 'relationship', 'race', 'sex', 'capital gain',
       'capital loss', 'hours per week', 'country'],
      dtype='object')

In [7]:
train.isin(['?']).sum(axis=0)

age                  0
workclass         1437
final weight         0
education            0
education num        0
marital              0
occupation        1442
relationship         0
race                 0
sex                  0
capital gain         0
capital loss         0
hours per week       0
country            427
income               0
dtype: int64

Deal with unknown characters by turning them into NaN and dropping the entry

In [8]:
train['country'] = train['country'].replace({'?': np.nan})
train['workclass'] = train['workclass'].replace({'?': np.nan})
train['occupation'] = train['occupation'].replace({'?': np.nan})

In [9]:
train

Unnamed: 0,age,workclass,final weight,education,education num,marital,occupation,relationship,race,sex,capital gain,capital loss,hours per week,country,income
0,53,Self-emp-not-inc,93449,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,1
1,33,Self-emp-not-inc,123424,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,47,Private,144844,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,40,Private,114580,HS-grad,9,Divorced,Craft-repair,Other-relative,White,Female,0,0,40,Vietnam,0
4,39,Private,115618,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,18,Private,83451,HS-grad,9,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,25,United-States,0
24996,64,Local-gov,202738,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,35,United-States,0
24997,39,Private,225544,Masters,14,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,40,Poland,0
24998,53,Private,346871,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Male,4787,0,46,United-States,1


In [10]:
test['country'] = test['country'].replace('?',np.nan)
test['workclass'] = test['workclass'].replace('?',np.nan)
test['occupation'] = test['occupation'].replace('?',np.nan)

In [11]:
#train.dropna(how='any',inplace=True)

In [12]:
#test.dropna(how='any',inplace=True)

Convert non-numerical data into numerical data

In [13]:
train.workclass.value_counts()

Private             17336
Self-emp-not-inc     1978
Local-gov            1617
State-gov            1037
Self-emp-inc          845
Federal-gov           730
Without-pay            15
Never-worked            5
Name: workclass, dtype: int64

In [14]:
#sex
train['sex'] = train['sex'].map({'Male': 0, 'Female': 1}).astype(int)
#race
train['race'] = train['race'].map({'Black': 0, 'Asian-Pac-Islander': 1,                     
'Other': 2, 'White': 3, 'Amer-Indian-Eskimo': 4}).astype(int)
#marital
train['marital'] = train['marital'].map({'Married-spouse-absent': 0, 'Widowed': 1, 
'Married-civ-spouse': 2, 'Separated': 3, 'Divorced': 4,'Never-married': 5, 
'Married-AF-spouse': 6}).astype(int)
#workclass
train['workclass'] = train['workclass'].map({'Self-emp-inc': 0, 'State-gov': 1,
'Federal-gov': 2, 'Without-pay': 3, 'Local-gov': 4,'Private': 5, 
'Self-emp-not-inc': 6}).fillna(7).astype(int)
#education
train['education'] = train['education'].map({'Some-college': 0, 'Preschool': 1, 
'5th-6th': 2, 'HS-grad': 3, 'Masters': 4, '12th': 5, '7th-8th': 6, 'Prof-school': 7,
'1st-4th': 8, 'Assoc-acdm': 9, 'Doctorate': 10, '11th': 11,'Bachelors': 12, '10th': 13,
'Assoc-voc': 14,'9th': 15}).astype(int)
#occupation
train['occupation'] = train['occupation'].map({ 'Farming-fishing': 1, 'Tech-support': 2, 
'Adm-clerical': 3, 'Handlers-cleaners': 4, 'Prof-specialty': 5,'Machine-op-inspct': 6, 
'Exec-managerial': 7,'Priv-house-serv': 8,'Craft-repair': 9,'Sales': 10, 
'Transport-moving': 11, 'Armed-Forces': 12, 'Other-service': 13,
'Protective-serv': 14}).fillna(15).astype(int)
#relationship
train['relationship'] = train['relationship'].map({'Not-in-family': 0, 'Wife': 1, 
'Other-relative': 2, 'Unmarried': 3,'Husband': 4,'Own-child': 5}).astype(int)
#country
train['country'] = train['country'].map({'United-States': 0, 'Cambodia': 1, 
'England': 2, 'Puerto-Rico': 3, 'Canada': 4, 'Germany': 5, 'Outlying-US(Guam-USVI-etc)': 6, 
'India': 7, 'Japan': 8, 'Greece': 9, 'South': 10, 'China': 11, 'Cuba': 12, 'Iran': 13, 
'Honduras': 14, 'Philippines': 15, 'Italy': 16, 'Poland': 17, 'Jamaica': 18, 'Vietnam': 19, 
'Mexico': 20, 'Portugal': 21, 'Ireland': 22, 'France': 23, 'Dominican-Republic': 24, 
'Laos': 25, 'Ecuador': 26, 'Taiwan': 27, 'Haiti': 28, 'Columbia': 29, 'Hungary': 30, 
'Guatemala': 31, 'Nicaragua': 32, 'Scotland': 33, 'Thailand': 34, 'Yugoslavia': 35, 
'El-Salvador': 36, 'Trinadad&Tobago': 37, 'Peru': 38, 'Hong': 39, 
'Holand-Netherlands': 40}).fillna(41).astype(int)

In [15]:
#sex
test['sex'] = test['sex'].map({'Male': 0, 'Female': 1}).astype(int)
#race
test['race'] = test['race'].map({'Black': 0, 'Asian-Pac-Islander': 1,                     
'Other': 2, 'White': 3, 'Amer-Indian-Eskimo': 4}).astype(int)
#marital
test['marital'] = test['marital'].map({'Married-spouse-absent': 0, 'Widowed': 1, 
'Married-civ-spouse': 2, 'Separated': 3, 'Divorced': 4,'Never-married': 5, 
'Married-AF-spouse': 6}).astype(int)
#workclass
test['workclass'] = test['workclass'].map({'Self-emp-inc': 0, 'State-gov': 1,
'Federal-gov': 2, 'Without-pay': 3, 'Local-gov': 4,'Private': 5, 
'Self-emp-not-inc': 6}).fillna(7).astype(int)
#education
test['education'] = test['education'].map({'Some-college': 0, 'Preschool': 1, 
'5th-6th': 2, 'HS-grad': 3, 'Masters': 4, '12th': 5, '7th-8th': 6, 'Prof-school': 7,
'1st-4th': 8, 'Assoc-acdm': 9, 'Doctorate': 10, '11th': 11,'Bachelors': 12, '10th': 13,
'Assoc-voc': 14,'9th': 15}).astype(int)
#occupation
test['occupation'] = test['occupation'].map({ 'Farming-fishing': 1, 'Tech-support': 2, 
'Adm-clerical': 3, 'Handlers-cleaners': 4, 'Prof-specialty': 5,'Machine-op-inspct': 6, 
'Exec-managerial': 7,'Priv-house-serv': 8,'Craft-repair': 9,'Sales': 10, 
'Transport-moving': 11, 'Armed-Forces': 12, 'Other-service': 13,
'Protective-serv':14}).fillna(15).astype(int)
#relationship
test['relationship'] = test['relationship'].map({'Not-in-family': 0, 'Wife': 1, 
'Other-relative': 2, 'Unmarried': 3,'Husband': 4,'Own-child': 5}).astype(int)
#country
test['country'] = test['country'].map({'United-States': 0, 'Cambodia': 1, 
'England': 2, 'Puerto-Rico': 3, 'Canada': 4, 'Germany': 5, 'Outlying-US(Guam-USVI-etc)': 6, 
'India': 7, 'Japan': 8, 'Greece': 9, 'South': 10, 'China': 11, 'Cuba': 12, 'Iran': 13, 
'Honduras': 14, 'Philippines': 15, 'Italy': 16, 'Poland': 17, 'Jamaica': 18, 'Vietnam': 19, 
'Mexico': 20, 'Portugal': 21, 'Ireland': 22, 'France': 23, 'Dominican-Republic': 24, 
'Laos': 25, 'Ecuador': 26, 'Taiwan': 27, 'Haiti': 28, 'Columbia': 29, 'Hungary': 30, 
'Guatemala': 31, 'Nicaragua': 32, 'Scotland': 33, 'Thailand': 34, 'Yugoslavia': 35, 
'El-Salvador': 36, 'Trinadad&Tobago': 37, 'Peru': 38, 'Hong': 39, 
'Holand-Netherlands': 40}).fillna(41).astype(int)

use sklearn logistic regression to fit model

In [16]:
logreg = LogisticRegression()

In [17]:
for col in train.columns:
    print(train[col].value_counts())

36    741
33    695
35    683
31    679
23    672
     ... 
83      5
82      5
87      3
85      2
89      1
Name: age, Length: 73, dtype: int64
5    17336
6     1978
4     1617
7     1442
1     1037
0      845
2      730
3       15
Name: workclass, dtype: int64
111567    12
203488    11
161141    10
246891    10
126569    10
          ..
95984      1
96705      1
86958      1
61989      1
202738     1
Name: final weight, Length: 17870, dtype: int64
3     8119
0     5571
12    4044
4     1369
14    1055
11     935
9      812
13     689
6      491
7      450
15     385
5      333
10     307
2      260
8      136
1       44
Name: education, dtype: int64
9     8119
10    5571
13    4044
14    1369
11    1055
7      935
12     812
6      689
4      491
15     450
5      385
8      333
16     307
3      260
2      136
1       44
Name: education num, dtype: int64
2    11443
5     8226
4     3412
1      837
3      729
0      326
6       27
Name: marital, dtype: int64
5     3204
7     3172
9 

In [18]:
original_x = pd.DataFrame(np.c_[train['relationship'], train['education'], 
train['race'],train['occupation'],train['sex'],train['marital'],train['workclass']], 
columns = ['relationship','education','race','occupation','sex','marital','workclass'])
original_y = pd.DataFrame(train.income)

In [19]:
train_x, test_x, train_y, test_y = train_test_split(original_x, original_y, test_size=0.3, random_state=1)

In [20]:
finaltest_x = pd.DataFrame(np.c_[test['relationship'], test['education'], 
test['race'],test['occupation'],test['sex'],test['marital'],test['workclass']], 
columns = ['relationship','education','race','occupation','sex','marital','workclass'])


In [21]:
original_x

Unnamed: 0,relationship,education,race,occupation,sex,marital,workclass
0,4,7,1,5,0,2,6
1,4,12,3,7,0,2,6
2,4,3,3,9,0,2,5
3,2,3,3,9,1,4,5
4,4,3,3,11,0,2,5
...,...,...,...,...,...,...,...
24995,0,3,3,3,1,5,5
24996,1,3,3,3,1,2,4
24997,4,4,3,11,0,2,5
24998,0,3,3,5,0,4,5


In [22]:

np.squeeze(original_y.transpose().to_numpy())

array([1, 1, 0, ..., 0, 1, 0], dtype=int64)

In [23]:
logreg.fit(original_x, np.squeeze(original_y.transpose().to_numpy()))

In [24]:
clf = LogisticRegression().fit(original_x, np.squeeze(original_y.transpose().to_numpy()))

In [25]:
y_pred = clf.predict_proba(finaltest_x)

In [26]:
y_pred = y_pred[:,1]

In [27]:
put_in_file = y_pred[:,1]

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed

In [None]:
metrics.roc_auc_score(np.squeeze(original_y.transpose().to_numpy()), y_pred, average=None)

0.7815107091161875

In [None]:
import os
if os.path.exists("prediction.csv"):
    os.remove("prediction.csv")

f = open("prediction.csv", "a")
f.write("ID,Prediction\n")

for i in range(23842):
    f.write(str(i+1)+","+str(put_in_file[i])+"\n")

f.close()