In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
data = pd.read_csv("incomeData.csv")
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [7]:
data.shape

(32561, 15)

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  Income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 2.6+ MB


## first of all let's handle missing data

In [9]:
data.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
Income            0
dtype: int64

In [10]:
data['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

## we might have many '?' values in data now we need to handle it

In [11]:
data.isin([' ?']).sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
Income               0
dtype: int64

## remove all rows in which '?' value is there

In [12]:
data = data.drop(data[data['workclass'] == ' ?'].index)
data = data.drop(data[data['occupation'] == ' ?'].index)
data = data.drop(data[data['native-country'] == ' ?'].index)
data.shape

(30162, 15)

## now reindexing

In [13]:
data.reset_index(drop=True, inplace=True)

In [14]:
data.tail()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Income
30157,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
30158,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
30159,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
30160,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
30161,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


## handling nominal categorical data columns

In [15]:
data = pd.get_dummies(data, columns=['workclass', 'marital-status', 'occupation', 'relationship', 'sex'], drop_first=True)
data.head()

Unnamed: 0,age,fnlwgt,education,education-num,race,capital-gain,capital-loss,hours-per-week,native-country,Income,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male
0,39,77516,Bachelors,13,White,2174,0,40,United-States,<=50K,...,0,0,0,0,1,0,0,0,0,1
1,50,83311,Bachelors,13,White,0,0,13,United-States,<=50K,...,0,0,0,0,0,0,0,0,0,1
2,38,215646,HS-grad,9,White,0,0,40,United-States,<=50K,...,0,0,0,0,1,0,0,0,0,1
3,53,234721,11th,7,Black,0,0,40,United-States,<=50K,...,0,0,0,0,0,0,0,0,0,1
4,28,338409,Bachelors,13,Black,0,0,40,Cuba,<=50K,...,0,0,0,0,0,0,0,0,1,0


## education columns we can consider it is ordinal categoricald data so we can handle it that way

In [16]:
data['education'] = data['education'].map({' Preschool':1, ' 1st-4th':2, ' 5th-6th':3, ' 7th-8th':4, 
                               ' 9th':5, ' 10th':6, ' 11th':7, ' 12th':8, ' HS-grad':9,
                               ' Prof-school':10, ' Assoc-acdm':11, ' Assoc-voc':12, ' Some-college':13,
                               ' Bachelors':14, ' Masters':15, ' Doctorate':16,
                              })
data.shape

(30162, 41)

## now replace >50k and <50k data with 0 and 1

In [17]:
data['Income'] = data['Income'].map({' <=50K': 0, ' >50K':1})

In [18]:
data.head(2)

Unnamed: 0,age,fnlwgt,education,education-num,race,capital-gain,capital-loss,hours-per-week,native-country,Income,...,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,relationship_ Not-in-family,relationship_ Other-relative,relationship_ Own-child,relationship_ Unmarried,relationship_ Wife,sex_ Male
0,39,77516,14,13,White,2174,0,40,United-States,0,...,0,0,0,0,1,0,0,0,0,1
1,50,83311,14,13,White,0,0,13,United-States,0,...,0,0,0,0,0,0,0,0,0,1


## removing not used columns

In [19]:
data = data.drop(columns=['fnlwgt', 'race', 'capital-gain', 'capital-loss', 'native-country'])

In [20]:
## spitting X and Y values
X = data.drop('Income', axis=1)
y = data['Income']

## data seems imbalanced we need to make them balanced

In [21]:
from imblearn.combine import SMOTETomek
#smotetomek does both downsampling and upsampling of class

In [22]:
smote = SMOTETomek(0.9)
X, y = smote.fit_sample(X, y)



In [23]:
y.value_counts()

0    22169
1    19903
Name: Income, dtype: int64

In [24]:
print(X.shape)
X = pd.DataFrame(data=X, columns=X.columns)

(42072, 35)


## Doing train_test_split

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [5]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

## Implementing RandomForest

In [28]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

In [29]:
forest = RandomForestClassifier()

In [30]:
forest_grid_params = {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'min_samples_split': [x for x in range(2,5)],
                     'max_leaf_nodes': [2, 3, 4, None],
                     'min_samples_leaf': [x for x in range(1,5)],
                     'n_estimators' : [90, 100, 110, 120, 130,140, 150, 160, 170, 180, 190, 200],
                     }

In [31]:
forest_cv = RandomizedSearchCV(forest, forest_grid_params,n_iter=400, n_jobs=-1, cv=5, verbose=2)

In [32]:
forest_cv.fit(X_train, y_train)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed:  9.3min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 18.0min
[Parallel(n_jobs=-1)]: Done 644 tasks      | elapsed: 29.6min
[Parallel(n_jobs=-1)]: Done 1009 tasks      | elapsed: 47.9min
[Parallel(n_jobs=-1)]: Done 1454 tasks      | elapsed: 70.6min
[Parallel(n_jobs=-1)]: Done 1981 tasks      | elapsed: 92.2min
[Parallel(n_jobs=-1)]: Done 2000 out of 2000 | elapsed: 93.2min finished


RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(), n_iter=400,
                   n_jobs=-1,
                   param_distributions={'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_leaf_nodes': [2, 3, 4, None],
                                        'min_samples_leaf': [1, 2, 3, 4],
                                        'min_samples_split': [2, 3, 4],
                                        'n_estimators': [90, 100, 110, 120, 130,
                                                         140, 150, 160, 170,
                                                         180, 190, 200]},
                   verbose=2)

In [33]:
forest_cv.best_score_

0.8723177124858121

In [34]:
forest_cv.best_estimator_

RandomForestClassifier(min_samples_leaf=2, n_estimators=190)

In [35]:
forest = RandomForestClassifier(min_samples_leaf=2, n_estimators=190)

In [36]:
forest.fit(X_train, y_train)

RandomForestClassifier(min_samples_leaf=2, n_estimators=190)

In [37]:
forest_y_predicted = forest.predict(X_test)

In [40]:
sc = accuracy_score(y_test, forest_y_predicted)
sc

0.8721990125332321

In [29]:
confusion_matrix(y_test, forest_y_predicted)

array([[4889,  596],
       [ 503, 4530]], dtype=int64)

In [27]:
print(classification_report(y_test, forest_y_predicted))

              precision    recall  f1-score   support

           0       0.91      0.89      0.90      5485
           1       0.88      0.90      0.89      5033

    accuracy                           0.90     10518
   macro avg       0.90      0.90      0.90     10518
weighted avg       0.90      0.90      0.90     10518



In [42]:
Pkl_Filename = "Income_Model_Random_Forest.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(forest, file)

In [2]:
import pickle
model = pickle.load(open("Income_Model_Random_Forest.pkl", "rb"))

In [7]:
predicted = model.predict([[40,9,9,40,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0]])
predicted

array([0], dtype=int64)