In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [42]:
data = pd.read_csv("census-income_.csv", na_values='?', skipinitialspace = True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [43]:
data.columns = data.columns.str.replace(' ', '_')
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'Unnamed:_14'],
      dtype='object')

In [44]:
data.columns = data.columns.str.replace('-', '.')
data.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'Unnamed:_14'],
      dtype='object')

In [45]:
data['workclass'].unique()

array(['State-gov', 'Self-emp-not-inc', 'Private', 'Federal-gov',
       'Local-gov', nan, 'Self-emp-inc', 'Without-pay', 'Never-worked'],
      dtype=object)

In [46]:
data['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [47]:
data.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
Unnamed:_14          0
dtype: int64

In [48]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,32561.0,38.581647,13.640433,17.0,28.0,37.0,48.0,90.0
fnlwgt,32561.0,189778.366512,105549.977697,12285.0,117827.0,178356.0,237051.0,1484705.0
education.num,32561.0,10.080679,2.57272,1.0,9.0,10.0,12.0,16.0
capital.gain,32561.0,1077.648844,7385.292085,0.0,0.0,0.0,0.0,99999.0
capital.loss,32561.0,87.30383,402.960219,0.0,0.0,0.0,0.0,4356.0
hours.per.week,32561.0,40.437456,12.347429,1.0,40.0,40.0,45.0,99.0


In [49]:
#Data Manipulation
#Extractracting the “education” column and store it in “census_ed” 

census_ed = data[['education']]
census_ed.head()

Unnamed: 0,education
0,Bachelors
1,Bachelors
2,HS-grad
3,11th
4,Bachelors


In [11]:
# Extracting all the columns from “age” to “relationship” and store it in “census_seq”

census_seq = data.iloc[:,0:8]
census_seq.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife


In [12]:
# Extract the column number “0”, “5”, “6” and store it in “census_col”

census_col=data.iloc[:,[0,5,6]]
census_col.head()

Unnamed: 0,age,marital.status,occupation
0,39,Never-married,Adm-clerical
1,50,Married-civ-spouse,Exec-managerial
2,38,Divorced,Handlers-cleaners
3,53,Married-civ-spouse,Handlers-cleaners
4,28,Married-civ-spouse,Prof-specialty


In [13]:
# Ectracting all the male employees who work in state-gov and store it in “male_gov”

male_gov = data[(data['workclass'] == 'State-gov') & (data['sex'] == 'Male')]
male_gov.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Unnamed:_14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
34,22,State-gov,311512,Some-college,10,Married-civ-spouse,Other-service,Husband,Black,Male,0,0,15,United-States,<=50K
48,41,State-gov,101603,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
123,29,State-gov,267989,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,>50K


In [14]:
# Exctracting all Female employees who work in Private

male_gov = data[(data['workclass'] == 'Private') & (data['sex'] == 'Female')]
male_gov.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Unnamed:_14
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
12,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K


In [15]:
data.head()


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Unnamed:_14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [16]:
#Extract all the 39 year olds who either have a bachelor's degree or who are native of the United States and store the result in “census_us”.
#We are writing a condition where age should be 39 and storing it in census. This condition is then taken and applied on the next one to get the result.

census = data[(data['age'] == 39)]

#census
census_us = census[(census['education'] == 'Bachelors') | (census['native.country'] == ' United-States')]
census_us.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Unnamed:_14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
848,39,Private,138192,Bachelors,13,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
915,39,Private,202027,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,45,United-States,>50K
999,39,Self-emp-inc,329980,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,15024,0,50,United-States,>50K
1336,39,Private,174938,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Male,0,0,40,United-States,<=50K


In [23]:
# sample() helps us to get random rows

census_200 = data.sample(200)
census_200.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,Unnamed:_14,income
5620,49,Self-emp-inc,187563,Some-college,10,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States,>50K,>50K
24134,38,Private,203138,Bachelors,13,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,50,United-States,>50K,>50K
14403,40,Private,365986,Bachelors,13,Married-civ-spouse,Handlers-cleaners,Own-child,White,Male,0,0,40,United-States,>50K,>50K
28120,35,Private,196178,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K,<=50K
6075,52,Private,133403,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,55,United-States,<=50K,<=50K


In [18]:
# Getting the count of different levels of the “workclass” column.

data[['workclass']].value_counts()

workclass       
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
dtype: int64

In [19]:
# Here we are calculating the mean of capital gain on the bases of work class

data.groupby(['workclass'])['capital.gain'].mean()

workclass
Federal-gov          833.232292
Local-gov            880.202580
Never-worked           0.000000
Private              889.217792
Self-emp-inc        4875.693548
Self-emp-not-inc    1886.061787
State-gov            701.699538
Without-pay          487.857143
Name: capital.gain, dtype: float64

In [20]:
# Create a separate dataframe with the details of males and females from the census data that has income more than 50,000

data['sex'].value_counts()

Male      21790
Female    10771
Name: sex, dtype: int64

In [50]:
data = data.rename(columns = {'Unnamed:_14': 'income'})
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [24]:
data['Unnamed:_14'].value_counts()


<=50K    24720
>50K      7841
Name: Unnamed:_14, dtype: int64

In [25]:
data['income'].value_counts()


<=50K    24720
>50K      7841
Name: income, dtype: int64

In [26]:
data = pd.DataFrame(data)
data.drop("Unnamed:_14", axis=1, inplace=True)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [27]:
male_50k = data[(data['sex'] == 'Male') & (data['income'] == '>50K')]

female_50K = data[(data['sex'] == 'Female') & (data['income'] == '>50K')]


In [28]:
male_50k.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,,>50K


In [30]:
female_50K.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
52,47,Private,51835,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,1902,60,Honduras,>50K
67,53,Private,169846,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K
84,44,Private,343591,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,14344,0,40,United-States,>50K


In [31]:
# Checking % of people from US who earn <=50,000 anually and work under private

p_le50k = data[(data['native.country'] == 'United-States') & (data['workclass'] == 'Private') & (data['income'] == ' <=50K')]

total = len(data)

percentage = (len(p_le50k)/total) * 100
percentage

0.0

In [32]:
# Calculating the percentage of married people in the census data

data['marital.status'].value_counts()

Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: marital.status, dtype: int64

In [33]:
# Calculating % of married people

total = len(data['marital.status'])

married = 14976+413+23

precen_mar = (married/total) * 100
precen_mar

47.33269862719204

In [34]:
# Calculatig the percentage of high school graduates earning more than 50,000 annually

data['education'].value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [35]:
hs_m50k = len(data[(data['education'] == 'HS-grad') & (data['income'] == '>50K')])

percentage_hs_m50k = (hs_m50k/total) * 100

percentage_hs_m50k

5.144190903227788

# Linear Regression Model


In [36]:
#Importing libraries

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [37]:
lr=LinearRegression()

In [38]:
# independent variable is “education.num”.
x = data[['education.num']]

# dependent variable is “hours.per.week”
y = data['hours.per.week']

In [39]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .30, random_state = 1)

lr.fit(x_train, y_train)

y_pred = lr.predict(x_test)

In [40]:
error = y_test-y_pred
error

9646     30.044869
709     -13.159243
7385      7.432533
16671     0.371349
21932     1.840757
           ...    
29663    -1.832763
29310     0.371349
29661    -0.363355
19491    -1.098059
2861      5.514277
Name: hours.per.week, Length: 9769, dtype: float64

In [41]:
print('mean_squared_error :', mean_squared_error(y_test, y_pred))

print('root-mean-square error :', np.sqrt(mean_squared_error(y_test, y_pred)))

mean_squared_error : 147.15261838664162
root-mean-square error : 12.130647896408568


# **Logistic Regression Model**

In [51]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [52]:
lo = LogisticRegression()

In [53]:
data.shape

(32561, 15)

In [54]:
data[['occupation']].value_counts()

occupation       
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
dtype: int64

In [55]:
# occupation is indpendent

x = data['occupation']
x = pd.DataFrame(x)

In [57]:
x[['occupation']].value_counts()

occupation       
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
dtype: int64

In [58]:
data['income'].value_counts()

<=50K    24720
>50K      7841
Name: income, dtype: int64

In [59]:
y = data['income'].replace('<=50K', 0).replace('>50K', 1)
y.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [60]:
len(y)

32561

In [61]:
# Giving particular labels to the elements in my columns

L = LabelEncoder()

In [62]:
x = L.fit_transform(x)

  y = column_or_1d(y, warn=True)


In [64]:
x = pd.DataFrame(x)
x.head()

Unnamed: 0,0
0,0
1,3
2,5
3,5
4,9


In [65]:
x.value_counts()

9     4140
2     4099
3     4066
0     3770
11    3650
7     3295
6     2002
14    1843
13    1597
5     1370
4      994
12     928
10     649
8      149
1        9
dtype: int64

In [66]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.35, random_state = 1)

lo = LogisticRegression()

lo.fit(x_train, y_train)

y_pred = lo.predict(x_test)

print('confusion_matrix :')
print(confusion_matrix(y_pred,y_test))
print('accuracy_score :',accuracy_score(y_test,y_pred))

confusion_matrix :
[[8800 2597]
 [   0    0]]
accuracy_score : 0.7721330174607353


In [67]:
data.head(1)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K


In [68]:
d = data[['age', 'workclass', 'education']]
d.head()

Unnamed: 0,age,workclass,education
0,39,State-gov,Bachelors
1,50,Self-emp-not-inc,Bachelors
2,38,Private,HS-grad
3,53,Private,11th
4,28,Private,Bachelors


In [69]:
d['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [70]:
x = d.apply(L.fit_transform)
x.head()

Unnamed: 0,age,workclass,education
0,22,6,9
1,33,5,9
2,21,3,11
3,36,3,1
4,11,3,9


In [71]:
d['workclass'].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64

In [72]:
d.head()

Unnamed: 0,age,workclass,education
0,39,State-gov,Bachelors
1,50,Self-emp-not-inc,Bachelors
2,38,Private,HS-grad
3,53,Private,11th
4,28,Private,Bachelors


In [73]:
y = data['income'].replace('<=50K', 0).replace('>50K', 1)

In [74]:
y.value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [75]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.20, random_state = 1)

lo = LogisticRegression()

lo.fit(x_train, y_train)

y_pred = lo.predict(x_test)

print('confusion_matrix :')
print(confusion_matrix(y_pred, y_test))
print('accuracy_score :', accuracy_score(y_test, y_pred))

confusion_matrix :
[[4887 1448]
 [ 139   39]]
accuracy_score : 0.756333486872409


# **Decision Tree Model**

In [76]:
from sklearn.tree import DecisionTreeClassifier

In [77]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [79]:
data.education = data.education.replace(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th','10th', '11th', '12th'], 'School')

data.education = data.education.replace(['Assoc-voc', 'Assoc-acdm', 'Prof-school', 'Some-college'], 'Higher')

data['marital.status'] = data['marital.status'].replace(['Married-civ-spouse', 'Married-AF-spouse'], 'married')

data['marital.status'] = data['marital.status'].replace(['Never-married'], 'not-married')

data['marital.status'] = data['marital.status'].replace(['Divorced', 'Separated', 'Widowed', 'Married-spouse-absent'], 'other')

data.income = data.income.replace('<=50K', 0)

data.income = data.income.replace('>50K', 1)

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,39,State-gov,77516,Bachelors,13,not-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,married,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,other,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,School,7,married,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,married,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [80]:
data = data.apply(L.fit_transform)

data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,22,6,2671,0,12,1,0,1,4,1,25,0,39,38,0
1,33,5,2926,0,12,0,3,0,4,1,0,0,12,38,0
2,21,3,14086,2,8,2,5,1,4,1,0,0,39,38,0
3,36,3,15336,5,6,0,5,0,2,1,0,0,39,38,0
4,11,3,19355,0,12,0,9,5,2,0,0,0,39,4,0


In [81]:
#independent
x = data.iloc[:,:-1]

#dependent
y = data.iloc[:,-1]

In [82]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =.30, random_state = 1)

dt = DecisionTreeClassifier()

dt.fit(x_train, y_train)

y_pred=dt.predict(x_test)

print('confusion_matrix :')
print(confusion_matrix(y_pred, y_test))
print('accuracy_score :', accuracy_score(y_test, y_pred))

confusion_matrix :
[[6560  839]
 [ 990 1380]]
accuracy_score : 0.8127751049237384


# **Random Forest Model**

In [83]:
from sklearn.ensemble import RandomForestClassifier

In [84]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = .20, random_state = 1)
rf = RandomForestClassifier(n_estimators = 300)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

print('confusion_matrix :')
print(confusion_matrix(y_pred, y_test))
print('accuracy_score :', accuracy_score(y_test, y_pred))

confusion_matrix :
[[4642  521]
 [ 384  966]]
accuracy_score : 0.8610471364962383


The linear regression model has a mean squared error of 147.15 and a root-mean-square error of 12.13. These metrics suggest that the model's predictions are not too far off from the actual income levels, but there is still some room for improvement.

The logistic regression model has the same accuracy score and confusion matrix as the linear regression model, which means it correctly classified 77% of the observations in the test set. However, since logistic regression is a classification model and not a regression model, it may be more appropriate for this dataset.

The decision tree model has an accuracy score of 0.81, which means it correctly classified 81% of the observations in the test set. Its confusion matrix shows that it had more false negatives than false positives, indicating that it is better at identifying individuals with higher incomes than those with lower incomes.

The random forest model has an even higher accuracy score of 0.86, suggesting that it is a more accurate model than the decision tree. Its confusion matrix also shows that it had fewer false negatives than the decision tree, indicating that it is better at identifying individuals with lower incomes than the decision tree.

Overall, it seems that the random forest model performs the best out of the four models evaluated, with the highest accuracy score and the fewest false negatives.