In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/adult-income-classification-challenge-map-6990-mi/sample_submission.csv
/kaggle/input/adult-income-classification-challenge-map-6990-mi/train.csv
/kaggle/input/adult-income-classification-challenge-map-6990-mi/test.csv


Pandas and numpy are imported for data analysis and the train test split is imported to create training and testing sets. Accuracy score is imported to measure the accuracy of each model.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score



The training and test data sets are retrieved from the competition input data using pandas

In [3]:
train_data = pd.read_csv('/kaggle/input/adult-income-classification-challenge-map-6990-mi/train.csv')
test_data = pd.read_csv('/kaggle/input/adult-income-classification-challenge-map-6990-mi/test.csv')

train_data.head() is being used to examine the columns for useless data and categorical data

In [4]:
train_data.head()

Unnamed: 0,ID,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,34496,37,Private,193106,Bachelors,13,Never-married,Sales,Not-in-family,White,Female,0,0,30,United-States,0
1,18592,56,Self-emp-inc,216636,12th,8,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1651,40,United-States,0
2,12563,53,Private,126977,HS-grad,9,Separated,Craft-repair,Not-in-family,White,Male,0,0,35,United-States,0
3,553,72,Private,205343,11th,7,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
4,3480,46,State-gov,106705,Masters,14,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,38,United-States,0


The ID column is removed from X because it is not needed and Income is removed from X and assigned to Y because it is the target variable.

In [5]:
X= train_data.drop(['ID', 'Income'], axis=1)
Y= train_data['Income']

In this next code cell I am checking for any null values in the data set.

In [6]:
X.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

In this cell I am using label encoder to account for all the categorical variables in this data set.

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X['workclass'] = le.fit_transform(X['workclass'].values)
X['education'] = le.fit_transform(X['education'].values)
X['marital-status'] = le.fit_transform(X['marital-status'].values)
X['occupation'] = le.fit_transform(X['occupation'].values)
X['relationship'] = le.fit_transform(X['relationship'].values)
X['race'] = le.fit_transform(X['race'].values)
X['sex'] = le.fit_transform(X['sex'].values)
X['native-country'] = le.fit_transform(X['native-country'].values)
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,37,4,193106,9,13,4,12,1,4,0,0,0,30,39
1,56,5,216636,2,8,2,4,0,4,1,0,1651,40,39
2,53,4,126977,11,9,5,3,1,4,1,0,0,35,39
3,72,4,205343,1,7,6,1,4,4,0,0,0,40,39
4,46,7,106705,12,14,4,4,1,4,0,0,0,38,39


A train test split is being made to train the various machine learning models. A test size of 20% is being used.

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

In this cell I am preprocessing the data by scaling it. 

In [9]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
sc.fit(X_test)
X_test_std = sc.transform(X_test)

In this cell I am implementing PCA to reduce the number of components down to 2 to see if it makes the models more accurate.

In [10]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_std)
X_test_pca = pca.transform(X_test_std)

I am importing all the models in this cell

In [11]:
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
knn= KNeighborsClassifier(n_neighbors=5, 
                           p=2, 
                           metric='minkowski')
dt = DecisionTreeClassifier(criterion='gini', 
                                    max_depth=4, 
                                    random_state=1)
svm = SVC(kernel='linear', C=1.0, random_state=1)
svmrbf = svm = SVC(kernel='rbf', random_state=1, gamma=0.1, C=1.0)
randomforestmodel = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
lrmodel = LogisticRegression(C=100.0, solver='liblinear', multi_class='ovr')
ppn = Perceptron(eta0=0.1, random_state=1)

I am fitting all of the models with the scaled PCA X training data and then predicting the target variable.

In [12]:
ppn.fit(X_train_pca, y_train)

In [13]:
svmrbf.fit(X_train_pca, y_train)

In [14]:
knn.fit(X_train_pca, y_train)

In [15]:
svm.fit(X_train_pca, y_train)

In [16]:
dt.fit(X_train_pca, y_train)

In [17]:
lrmodel.fit(X_train_pca, y_train)

In [18]:
randomforestmodel.fit(X_train_pca, y_train)

In [19]:
y_pred_ppn_pca= ppn.predict(X_test_pca)

In [20]:
y_pred_svmrbf_pca= svmrbf.predict(X_test_pca)

In [21]:
y_pred_knn_pca= knn.predict(X_test_pca)

In [22]:
y_pred_dt_pca= dt.predict(X_test_pca)

In [23]:
y_pred_svm_pca=svm.predict(X_test_pca)

In [24]:
y_pred_lr_pca=lrmodel.predict(X_test_pca)

In [25]:
y_pred_rf_pca=randomforestmodel.predict(X_test_pca)

This chunk of code provides the accuracy of each model

In [26]:
accuracyppn_pca= accuracy_score(y_test, y_pred_ppn_pca)
accuracylr_pca= accuracy_score(y_test, y_pred_lr_pca)
accuracyrf_pca= accuracy_score(y_test, y_pred_rf_pca)
accuracysvm_pca= accuracy_score(y_test, y_pred_svm_pca)
accuracysvmrbf_pca= accuracy_score(y_test, y_pred_svmrbf_pca)
accuracydt_pca= accuracy_score(y_test, y_pred_dt_pca)
accuracyknn_pca= accuracy_score(y_test, y_pred_knn_pca)
print("Accuracy of knn pca:",accuracyknn_pca)
print("Accuracy of dt pca:",accuracydt_pca)
print("Accuracy of svm pca:",accuracysvm_pca)
print("Accuracy of svmrbf pca:",accuracysvmrbf_pca)
print("Accuracy of rf pca:",accuracyrf_pca)
print("Accuracy of lr pca:",accuracylr_pca)
print("Accuracy of ppn pca:",accuracyppn_pca)

Accuracy of knn pca: 0.7827255278310941
Accuracy of dt pca: 0.8131797824696098
Accuracy of svm pca: 0.7989763275751759
Accuracy of svmrbf pca: 0.7989763275751759
Accuracy of rf pca: 0.8121561100447857
Accuracy of lr pca: 0.8021753039027512
Accuracy of ppn pca: 0.7175943698016635


Now I am fitting all the models with the scaled X train data without using PCA and predicitng the target.

In [27]:
ppn.fit(X_train_std, y_train)

In [28]:
svmrbf.fit(X_train_std, y_train)

In [29]:
knn.fit(X_train_std, y_train)

In [30]:
svm.fit(X_train_std, y_train)

In [31]:
dt.fit(X_train_std, y_train)

In [32]:
lrmodel.fit(X_train_std, y_train)

In [33]:
randomforestmodel.fit(X_train_std, y_train)

In [34]:
y_pred_ppn= ppn.predict(X_test_std)

In [35]:
y_pred_svmrbf= svmrbf.predict(X_test_std)

In [36]:
y_pred_knn= knn.predict(X_test_std)

In [37]:
y_pred_dt= dt.predict(X_test_std)

In [38]:
y_pred_svm=svm.predict(X_test_std)

In [39]:
y_pred_lr=lrmodel.predict(X_test_std)

In [40]:
y_pred_rf=randomforestmodel.predict(X_test_std)

This chunk of code shows the accuracy of each model without using PCA and it shows that each model is more accurate when not using PCA

In [41]:
accuracyppn= accuracy_score(y_test, y_pred_ppn)
accuracylr= accuracy_score(y_test, y_pred_lr)
accuracyrf= accuracy_score(y_test, y_pred_rf)
accuracydt= accuracy_score(y_test, y_pred_dt)
accuracyknn= accuracy_score(y_test, y_pred_knn)
accuracysvmrbf= accuracy_score(y_test, y_pred_svmrbf)
accuracysvm= accuracy_score(y_test, y_pred_svm)
print("Accuracy of svm:",accuracysvm)
print("Accuracy of svmrbf:",accuracysvmrbf)
print("Accuracy of knn:",accuracyknn)
print("Accuracy of dt:",accuracydt)
print("Accuracy of rf:",accuracyrf)
print("Accuracy of lr:",accuracylr)
print("Accuracy of ppn:",accuracyppn)

Accuracy of svm: 0.8536148432501599
Accuracy of svmrbf: 0.8536148432501599
Accuracy of knn: 0.8259756877799105
Accuracy of dt: 0.8477287268074216
Accuracy of rf: 0.8507997440818938
Accuracy of lr: 0.8280230326295586
Accuracy of ppn: 0.7705694177863084


SVM with the RBF kernel is used due to having the highest accuracy and its capability of being used for non-linear data. Perceptron, logistic regression, and svm with the linear kernel have a weakness against non-linearly separable data which is why they were not chosen. Although randomforest, decision trees, and KNN are good for non-linearly separable data they were also not chosen due to having lower accuracy scores. 

TESTING THE MODEL

Here the test data is being retrieved from the competition input data using pandas

In [42]:
test_data = pd.read_csv('/kaggle/input/adult-income-classification-challenge-map-6990-mi/test.csv')

I am once again checking for any unusable data

In [43]:
test_data.head()

Unnamed: 0,ID,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,40422,30,Private,378009,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States
1,47739,54,State-gov,55861,Assoc-acdm,12,Divorced,Adm-clerical,Not-in-family,White,Male,0,0,39,United-States
2,519,21,?,204226,Some-college,10,Never-married,?,Unmarried,White,Female,0,0,35,United-States
3,8565,35,Private,306678,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,2885,0,40,United-States
4,31356,42,Local-gov,121012,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States


Once again the ID column is being dropped since it is not a feature

In [44]:
X_test_data= test_data.drop(['ID'], axis=1)

I am using the following code to check for null values again

In [45]:
X_test_data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
dtype: int64

I am once again using the label encooder to take care of the categorical data bygiving them unique labels.

In [46]:
X_test_data['workclass'] = le.fit_transform(X_test_data['workclass'].values)
X_test_data['education'] = le.fit_transform(X_test_data['education'].values)
X_test_data['marital-status'] = le.fit_transform(X_test_data['marital-status'].values)
X_test_data['occupation'] = le.fit_transform(X_test_data['occupation'].values)
X_test_data['relationship'] = le.fit_transform(X_test_data['relationship'].values)
X_test_data['race'] = le.fit_transform(X_test_data['race'].values)
X_test_data['sex'] = le.fit_transform(X_test_data['sex'].values)
X_test_data['native-country'] = le.fit_transform(X_test_data['native-country'].values)
X_test_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,30,4,378009,11,9,4,7,3,4,1,0,0,40,38
1,54,7,55861,7,12,0,1,1,4,1,0,0,39,38
2,21,0,204226,15,10,4,0,4,4,0,0,0,35,38
3,35,4,306678,11,9,2,7,0,4,1,2885,0,40,38
4,42,2,121012,15,10,2,4,0,4,1,0,0,45,38


I am scaling the testing data once again.

In [47]:
sc.fit(X_test_data)
X_test_data_scaled= sc.transform(X_test_data)

In this cell I am using the svm rbf model to predict the target values using the scaled data

In [48]:
predictionssvmrbf = svmrbf.predict(X_test_data_scaled)

In the next few lines of code I am turning the results into a data frame and saving it as a csv.

In [49]:
submission_datasvmrbf = pd.DataFrame({'ID': test_data['ID'], 'target': predictionssvmrbf})

In [50]:
submission_datasvmrbf.to_csv('submission.csv', index=False)