In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report


from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

In [3]:
train_data = pd.read_csv('SalaryData_Train(1).csv')
test_data = pd.read_csv('SalaryData_Test(1).csv')
train_data.sample(5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
27554,33,Private,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,1902,45,United-States,>50K
24158,23,Private,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States,<=50K
4776,28,Private,HS-grad,9,Never-married,Other-service,Unmarried,Black,Female,0,0,52,United-States,<=50K
20323,20,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Female,0,0,40,United-States,<=50K
17895,49,Self-emp-not-inc,HS-grad,9,Divorced,Sales,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [4]:
test_data.sample(5)

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
4780,72,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,2329,0,20,United-States,<=50K
3848,32,Self-emp-inc,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
13803,18,Private,11th,7,Never-married,Farming-fishing,Own-child,White,Male,0,0,6,United-States,<=50K
13220,49,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
14824,36,Private,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,60,United-States,<=50K


### EDA & Data Preprocessing

In [5]:
train_data.shape,test_data.shape

((30161, 14), (15060, 14))

In [6]:
# Checking for null values
train_data.isna().sum(),test_data.isna().sum()

(age              0
 workclass        0
 education        0
 educationno      0
 maritalstatus    0
 occupation       0
 relationship     0
 race             0
 sex              0
 capitalgain      0
 capitalloss      0
 hoursperweek     0
 native           0
 Salary           0
 dtype: int64,
 age              0
 workclass        0
 education        0
 educationno      0
 maritalstatus    0
 occupation       0
 relationship     0
 race             0
 sex              0
 capitalgain      0
 capitalloss      0
 hoursperweek     0
 native           0
 Salary           0
 dtype: int64)

In [7]:
train_data.dtypes

age               int64
workclass        object
education        object
educationno       int64
maritalstatus    object
occupation       object
relationship     object
race             object
sex              object
capitalgain       int64
capitalloss       int64
hoursperweek      int64
native           object
Salary           object
dtype: object

In [8]:
# frequency for categorical fields 
category_col =['workclass', 'education','maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native', 'Salary'] 
for c in category_col:
    print (c,'\n',train_data[c].value_counts(),'\n')

workclass 
  Private             22285
 Self-emp-not-inc     2499
 Local-gov            2067
 State-gov            1279
 Self-emp-inc         1074
 Federal-gov           943
 Without-pay            14
Name: workclass, dtype: int64 

education 
  HS-grad         9840
 Some-college    6677
 Bachelors       5044
 Masters         1627
 Assoc-voc       1307
 11th            1048
 Assoc-acdm      1008
 10th             820
 7th-8th          557
 Prof-school      542
 9th              455
 12th             377
 Doctorate        375
 5th-6th          288
 1st-4th          151
 Preschool         45
Name: education, dtype: int64 

maritalstatus 
  Married-civ-spouse       14065
 Never-married             9725
 Divorced                  4214
 Separated                  939
 Widowed                    827
 Married-spouse-absent      370
 Married-AF-spouse           21
Name: maritalstatus, dtype: int64 

occupation 
  Prof-specialty       4038
 Craft-repair         4030
 Exec-managerial      3992
 

In [9]:
# countplot for all categorical columns
# sns.set(rc={'figure.figsize':(15,8)})
# cat_col = ['workclass', 'education','maritalstatus', 'occupation', 'relationship', 'race', 'sex','Salary']
# for col in cat_col:
#     plt.figure()
#     sns.countplot(x = col, data = train_data, palette = 'Set3');

#### Feature Encoding using Label Encoder

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
train_data = train_data.apply(LabelEncoder().fit_transform) # apply - can apply label encoder on all columns at once
train_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,22,5,9,12,4,0,1,4,1,24,0,39,37,0
1,33,4,9,12,2,3,0,4,1,0,0,12,37,0
2,21,2,11,8,0,5,1,4,1,0,0,39,37,0
3,36,2,1,6,2,5,0,2,1,0,0,39,37,0
4,11,2,9,12,2,9,5,2,0,0,0,39,4,0


In [12]:
test_data = test_data.apply(LabelEncoder().fit_transform)
test_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,8,2,1,6,4,6,3,2,1,0,0,39,37,0
1,21,2,11,8,2,4,0,4,1,0,0,49,37,0
2,11,1,7,11,2,10,0,4,1,0,0,39,37,1
3,27,2,15,9,2,6,0,2,1,87,0,39,37,1
4,17,2,0,5,4,7,1,4,1,0,0,29,37,0


### Train - Test Data Split

In [13]:
X = train_data.drop(['Salary'],axis=1)
y = train_data['Salary']

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=20)
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((20207, 13), (9954, 13), (20207,), (9954,))

### Building SVM model

In [15]:
from sklearn import metrics

svc = SVC()
svc.fit(X_train,y_train)
# make predictions
testdata_prediction = svc.predict(X_test)
# Checking the accuracy
print(metrics.classification_report(y_test,testdata_prediction))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89      7512
           1       0.86      0.28      0.42      2442

    accuracy                           0.81      9954
   macro avg       0.83      0.63      0.65      9954
weighted avg       0.82      0.81      0.77      9954



### Testing it on new test data given

In [16]:
X_new = test_data.drop(['Salary'], axis=1)

y_new = test_data['Salary']

In [17]:
# make predictions
testdata_prediction = svc.predict(X_new)
print(metrics.classification_report(y_new,testdata_prediction))

              precision    recall  f1-score   support

           0       0.80      0.99      0.89     11360
           1       0.87      0.26      0.40      3700

    accuracy                           0.81     15060
   macro avg       0.84      0.62      0.64     15060
weighted avg       0.82      0.81      0.77     15060



### Building SVM model with Hyper Parameters kernel='rbf',gamma=15, C=1

In [18]:
model = SVC(kernel='rbf',gamma=15, C=1)

model.fit(X_train, y_train)
# make predictions
prediction = model.predict(X_test)
# summarize the fit of the model
print(metrics.classification_report(y_test, prediction))
print(metrics.confusion_matrix(y_test, prediction))

              precision    recall  f1-score   support

           0       0.77      0.98      0.86      7512
           1       0.60      0.08      0.14      2442

    accuracy                           0.76      9954
   macro avg       0.68      0.53      0.50      9954
weighted avg       0.73      0.76      0.68      9954

[[7384  128]
 [2251  191]]


### Testing above model on Test Data

In [23]:
# make predictions
new_prediction = model.predict(X_new)
# summarize the fit of the model
print(metrics.classification_report(y_new, new_prediction))
print(metrics.confusion_matrix(y_new, new_prediction))

              precision    recall  f1-score   support

           0       0.76      0.98      0.86     11360
           1       0.58      0.07      0.13      3700

    accuracy                           0.76     15060
   macro avg       0.67      0.53      0.49     15060
weighted avg       0.72      0.76      0.68     15060

[[11164   196]
 [ 3434   266]]
