# Classification model on Census Income Dataset

## Problem statement: the prediction task is to determine whether a person makes over 50K a year

# Steps 
# 1. DATA INGESTION
## -Data profiling
## -Basic operations
## -Data cleaning
## - Statistical Analysis (analysis of features)

# 2.EDA
## -Univariate analysis
## -Bivariate analysis
## -Multivariate analysis

# 3.Pre-processing
## -Null Value Handling
## -Outliers Handling 
## -Feature Encoding
## -Saving and loading model from MongoDB
## -Feature Scaling
## --pickling

# 4.Model Building
## -Logistic Regression
## -SVC
## -GridSearchCV

# 5.Evaluation of Model
## -accuracy score
## -roc-auc score
## Confusion matrix

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import scipy.stats as stats
import warnings
warnings.filterwarnings('ignore')

# 1. Data Ingestion

In [3]:
data=pd.read_csv('adult.csv')

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


# 1.2 Basic Operations 

In [5]:
# Work on an independent copy so that cleaning `df` can never modify the raw
# frame held in `data`.
df = data.copy()
df.shape

(32561, 15)

In [7]:
df.columns


Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [8]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
salary            object
dtype: object

In [9]:
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557    False
32558    False
32559    False
32560    False
Length: 32561, dtype: bool

In [10]:
len(df[df.duplicated()])

24

In [11]:
# Rebind instead of mutating in place, so any other reference to the same
# frame keeps all of its rows. The original row index labels are preserved.
df = df.drop_duplicates()

In [13]:
df[df.duplicated()]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary


In [14]:
len(df[df.duplicated()])

0

# Checking for null values

In [15]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
salary            0
dtype: int64

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32537 non-null  int64 
 1   workclass       32537 non-null  object
 2   fnlwgt          32537 non-null  int64 
 3   education       32537 non-null  object
 4   education-num   32537 non-null  int64 
 5   marital-status  32537 non-null  object
 6   occupation      32537 non-null  object
 7   relationship    32537 non-null  object
 8   race            32537 non-null  object
 9   sex             32537 non-null  object
 10  capital-gain    32537 non-null  int64 
 11  capital-loss    32537 non-null  int64 
 12  hours-per-week  32537 non-null  int64 
 13  native-country  32537 non-null  object
 14  salary          32537 non-null  object
dtypes: int64(6), object(9)
memory usage: 4.0+ MB


# Checking the unique values in each column

In [19]:
# Show the distinct values observed in every column, one column per paragraph.
for column in df.columns:
    print(f"Feature '{column}' has unique values: {df[column].unique()}\n")

Featureagehas [39 50 38 53 28 37 49 52 31 42 30 23 32 40 34 25 43 54 35 59 56 19 20 45
 22 48 21 24 57 44 41 29 18 47 46 36 79 27 67 33 76 17 55 61 70 64 71 68
 66 51 58 26 60 90 75 65 77 62 63 80 72 74 69 73 81 78 88 82 83 84 85 86
 87]unique features

Featureworkclasshas ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']unique features

Featurefnlwgthas [ 77516  83311 215646 ...  34066  84661 257302]unique features

Featureeducationhas ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']unique features

Featureeducation-numhas [13  9  7 14  5 10 12 11  4 16 15  3  6  2  1  8]unique features

Featuremarital-statushas ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']unique features

Featureoccupationhas ['Adm-clerical' 'Exec-managerial' '

In [21]:
df["native-country"].unique()

array(['United-States', 'Cuba', 'Jamaica', 'India', '?', 'Mexico',
       'South', 'Puerto-Rico', 'Honduras', 'England', 'Canada', 'Germany',
       'Iran', 'Philippines', 'Italy', 'Poland', 'Columbia', 'Cambodia',
       'Thailand', 'Ecuador', 'Laos', 'Taiwan', 'Haiti', 'Portugal',
       'Dominican-Republic', 'El-Salvador', 'France', 'Guatemala',
       'China', 'Japan', 'Yugoslavia', 'Peru',
       'Outlying-US(Guam-USVI-etc)', 'Scotland', 'Trinadad&Tobago',
       'Greece', 'Nicaragua', 'Vietnam', 'Hong', 'Ireland', 'Hungary',
       'Holand-Netherlands'], dtype=object)

In [24]:
# Cast 'age' and 'hours-per-week' to float; astype returns a new frame, so df is rebound.
df=df.astype({'age':float, 'hours-per-week':float})

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             32537 non-null  float64
 1   workclass       32537 non-null  object 
 2   fnlwgt          32537 non-null  int64  
 3   education       32537 non-null  object 
 4   education-num   32537 non-null  int64  
 5   marital-status  32537 non-null  object 
 6   occupation      32537 non-null  object 
 7   relationship    32537 non-null  object 
 8   race            32537 non-null  object 
 9   sex             32537 non-null  object 
 10  capital-gain    32537 non-null  int64  
 11  capital-loss    32537 non-null  int64  
 12  hours-per-week  32537 non-null  float64
 13  native-country  32537 non-null  object 
 14  salary          32537 non-null  object 
dtypes: float64(2), int64(4), object(9)
memory usage: 4.0+ MB


In [27]:
df.age.min()

17.0

In [28]:
df.age.max()

90.0

In [34]:
df['workclass'].value_counts()

workclass
Private             22673
Self-emp-not-inc     2540
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64

In [35]:
df['fnlwgt'].value_counts()

fnlwgt
164190    13
123011    13
203488    13
148995    12
113364    12
          ..
138342     1
158040     1
232784     1
325573     1
257302     1
Name: count, Length: 21648, dtype: int64

In [36]:
df['marital-status'].value_counts()

marital-status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64

In [39]:
df.groupby(by='marital-status').count()

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
marital-status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Divorced,4441,4441,4441,4441,4441,4441,4441,4441,4441,4441,4441,4441,4441,4441
Married-AF-spouse,23,23,23,23,23,23,23,23,23,23,23,23,23,23
Married-civ-spouse,14970,14970,14970,14970,14970,14970,14970,14970,14970,14970,14970,14970,14970,14970
Married-spouse-absent,418,418,418,418,418,418,418,418,418,418,418,418,418,418
Never-married,10667,10667,10667,10667,10667,10667,10667,10667,10667,10667,10667,10667,10667,10667
Separated,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025,1025
Widowed,993,993,993,993,993,993,993,993,993,993,993,993,993,993


In [40]:
df.groupby(by='sex').count()

Unnamed: 0_level_0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,capital-gain,capital-loss,hours-per-week,native-country,salary
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
Female,10762,10762,10762,10762,10762,10762,10762,10762,10762,10762,10762,10762,10762,10762
Male,21775,21775,21775,21775,21775,21775,21775,21775,21775,21775,21775,21775,21775,21775


In [43]:
df.salary.unique()

array(['<=50K', '>50K'], dtype=object)

In [44]:
df['salary'].value_counts()

salary
<=50K    24698
>50K      7839
Name: count, dtype: int64

# Categorizing the categorical and numerical features

In [45]:
# Collect the names of all columns stored with the object dtype.
# Note: with this dataset the list also contains the target column 'salary'.
cat_feature = [name for name, dtype in df.dtypes.items() if dtype == 'object']

In [46]:
print(cat_feature)

['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country', 'salary']


In [49]:
for feature in cat_feature:
    print(df[feature].value_counts())

workclass
Private             22673
Self-emp-not-inc     2540
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
education
HS-grad         10494
Some-college     7282
Bachelors        5353
Masters          1722
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           645
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           332
1st-4th           166
Preschool          50
Name: count, dtype: int64
marital-status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64
occupation
Prof-specialty       4136
Craft-repair         4094
Exec-managerial      

# Trimming whitespace from the values in each column with a helper function, and replacing the `?` placeholder values with NaN

In [53]:
def feture_clean(dataframe, feature):
    """Strip leading and trailing whitespace from string columns, in place.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are cleaned; it is modified in place.
    feature : str or iterable of str
        Name of a single column, or an iterable of column names. Each named
        column must support the ``.str`` accessor.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, returned to allow chaining.
    """
    # A bare string would otherwise be iterated character by character.
    columns = [feature] if isinstance(feature, str) else list(feature)
    for column in columns:
        dataframe[column] = dataframe[column].str.strip()
    return dataframe

In [52]:
#W= "   Bhagyashree  "
#str1=W.str.strp()
#print(str1)

In [54]:
# Treat the '?' placeholder as missing: every cell equal to '?' becomes NaN.
df=df.replace('?' ,np.nan)

In [56]:
for feature in cat_feature:
    print(df[feature].value_counts())

workclass
Private             22673
Self-emp-not-inc     2540
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
education
HS-grad         10494
Some-college     7282
Bachelors        5353
Masters          1722
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           645
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           332
1st-4th           166
Preschool          50
Name: count, dtype: int64
marital-status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64
occupation
Prof-specialty       4136
Craft-repair         4094
Exec-managerial      4065
Adm-clerical         

In [59]:
for feature in cat_feature:
    print(df[feature].unique())

['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' nan
 'Self-emp-inc' 'Without-pay' 'Never-worked']
['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']
['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']
['Adm-clerical' 'Exec-managerial' 'Handlers-cleaners' 'Prof-specialty'
 'Other-service' 'Sales' 'Craft-repair' 'Transport-moving'
 'Farming-fishing' 'Machine-op-inspct' 'Tech-support' nan
 'Protective-serv' 'Armed-Forces' 'Priv-house-serv']
['Not-in-family' 'Husband' 'Wife' 'Own-child' 'Unmarried' 'Other-relative']
['White' 'Black' 'Asian-Pac-Islander' 'Amer-Indian-Eskimo' 'Other']
['Male' 'Female']
['United-States' 'Cuba' 'Jamaica' 'India' nan 'Mexico' 'South'
 'Puerto-Rico' 'Honduras' 'England' 'Canada' 'Germany' 'Iran'
 'Philippines' 'Italy' 'Poland' 'Columbia' 'Cambodia' 'Thailand' '

In [60]:
df.shape

(32537, 15)

# Dropping the nan values 

In [61]:
# Drop every row that contains at least one NaN, rebinding the result rather
# than mutating the frame in place.
df = df.dropna()

In [62]:
df.shape

(30139, 15)

In [63]:
32537-30139

2398

# Numerical Features

In [64]:
# Every column that was not classified as categorical is treated as numerical.
num_feature = [
    column
    for column in df.columns
    if column not in cat_feature
]
print(num_feature)


['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']


In [66]:
# Report how many distinct values each numerical column holds.
for feature in num_feature:
    print(f'Feature "{feature}" has {df[feature].nunique()} no of unique values')

Feature "age" has 72 no of unique values
Feature "fnlwgt" has 20263 no of unique values
Feature "education-num" has 16 no of unique values
Feature "capital-gain" has 118 no of unique values
Feature "capital-loss" has 90 no of unique values
Feature "hours-per-week" has 94 no of unique values
