In [100]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error, confusion_matrix

In [101]:
census=pd.read_csv('census-income.csv')

In [102]:
census=pd.DataFrame(census)

In [103]:
print(type(census))

<class 'pandas.core.frame.DataFrame'>


# Data Preprocessing

In [104]:
census.isnull().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
                   0
dtype: int64

In [105]:
census=census.dropna(axis=0)

# Data Manipulation

## 1)

In [106]:
census_ed=census[' education']

## 2)

In [107]:
census_seq=census.iloc[:,:8]

## 3)

In [108]:

census_col=census.iloc[:,[5,8,11]]

## 4) 

In [109]:
male_gov=census[(census[' sex']==' Male')&(census[' workclass']==' State-gov')]

## 5)

In [110]:
census_us=census[census['age']==39]

In [111]:
census_us=census_us[(census_us[' education']==' Bachelors')|(census_us[' native-country']==' United-States')]

In [112]:
census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 15
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## 6)

In [113]:
census_200=census.sample(n=200)

## 7)

In [114]:
census[[' workclass']].value_counts()

 workclass       
 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
dtype: int64

In [115]:
census_Private=census[census[' workclass']==' Private']
census_Self=census[census[' workclass']==' Self-emp-not-inc']
census_Localgov=census[census[' workclass']==' Local-gov']
census_Stategov=census[census[' workclass']==' State-gov']
census_selfinc=census[census[' workclass']==' Self-emp-inc']
census_Federalgov=census[census[' workclass']==' Federal-gov']
census_withoutpay=census[census[' workclass']==' Without-pay']
census_Neverworked=census[census[' workclass']==' Never-worked']

In [116]:
census_Private[' capital-gain'].mean()

889.2177916813536

In [117]:
census_Self[' capital-gain'].mean()

1886.0617866981504

In [118]:
census_Localgov[' capital-gain'].mean()

880.202580028667

In [119]:
census_Stategov[' capital-gain'].mean()

701.6995377503852

In [120]:
census_selfinc[' capital-gain'].mean()

4875.693548387097

In [121]:
census_Federalgov[' capital-gain'].mean()

833.2322916666667

In [122]:
census_withoutpay[' capital-gain'].mean()

487.85714285714283

In [123]:
census_Neverworked[' capital-gain'].mean()

0.0

## 8)

In [124]:
census_morethan50000=census[census[' ']==' >50K']

In [125]:
census_morethan50000[census_morethan50000[' sex']==' Male']

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 15
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
10,37,Private,280464,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,Black,Male,0,0,80,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
14,40,Private,121772,Assoc-voc,11,Married-civ-spouse,Craft-repair,Husband,Asian-Pac-Islander,Male,0,0,40,?,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32532,34,Private,204461,Doctorate,16,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,60,United-States,>50K
32533,54,Private,337992,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,Asian-Pac-Islander,Male,0,0,50,Japan,>50K
32539,71,?,287372,Doctorate,16,Married-civ-spouse,?,Husband,White,Male,0,0,10,United-States,>50K
32554,53,Private,321865,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K


In [126]:
census_morethan50000[census_morethan50000[' sex']==' Female']

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Unnamed: 15
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
19,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States,>50K
52,47,Private,51835,Prof-school,15,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,1902,60,Honduras,>50K
67,53,Private,169846,HS-grad,9,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,40,United-States,>50K
84,44,Private,343591,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,14344,0,40,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32530,35,?,320084,Bachelors,13,Married-civ-spouse,?,Wife,White,Female,0,0,55,United-States,>50K
32536,34,Private,160216,Bachelors,13,Never-married,Exec-managerial,Not-in-family,White,Female,0,0,55,United-States,>50K
32538,38,Private,139180,Bachelors,13,Divorced,Prof-specialty,Unmarried,Black,Female,15020,0,45,United-States,>50K
32545,39,Local-gov,111499,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,0,20,United-States,>50K


## 9)

In [127]:
(census[(census[' workclass']==' Private')&(census[' ']==' <=50K')].shape[0]/census.shape[0])*100

54.460858081754246

## 10)

In [128]:
census[' marital-status'].value_counts()

 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name:  marital-status, dtype: int64

## 11)

In [129]:
print("Percentage of Married people are - "+str(((14796+418+23)/census.shape[0])*100))

Percentage of Married people are - 46.79524584625779


In [130]:
census[' education'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name:  education, dtype: int64

In [131]:
census[' '].iloc[32560]

' >50K'

In [132]:
df=census[(census[' education']==' HS-grad')&(census[' ']==' >50K')]

## 12)

In [133]:
print('The percentage of high school graduates earning more than 50k is - '+str((df.shape[0]/census.shape[0])*100))

The percentage of high school graduates earning more than 50k is - 5.144190903227788


# Logistic Regression

In [141]:
census.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' '],
      dtype='object')

In [142]:
census[[' ']].value_counts()

      
 <=50K    24720
 >50K      7841
dtype: int64

In [143]:
le=LabelEncoder()

In [144]:
census[' ']==' >50K'

0        False
1        False
2        False
3        False
4        False
         ...  
32556    False
32557     True
32558    False
32559    False
32560     True
Name:  , Length: 32561, dtype: bool

In [145]:
le.fit([' <=50K',' >50K'])

LabelEncoder()

In [146]:
census['Yearly Income']=le.transform(census[' '])# <=50K-->0;;; >50K-->1

In [147]:
census=census.drop([' '],axis=1)

In [148]:
census

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Yearly Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,0


In [149]:
census[' occupation']=le.fit_transform(census[' occupation'])

In [150]:
census

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,Yearly Income
0,39,State-gov,77516,Bachelors,13,Never-married,1,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,4,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,6,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,6,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,10,Wife,Black,Female,0,0,40,Cuba,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,13,Wife,White,Female,0,0,38,United-States,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,7,Husband,White,Male,0,0,40,United-States,1
32558,58,Private,151910,HS-grad,9,Widowed,1,Unmarried,White,Female,0,0,40,United-States,0
32559,22,Private,201490,HS-grad,9,Never-married,1,Own-child,White,Male,0,0,20,United-States,0


In [151]:
X=census[[' occupation']]
Y=census[['Yearly Income']]


In [152]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.35)
Model=LogisticRegression()
Model.fit(X_train,Y_train)


  return f(*args, **kwargs)


LogisticRegression()

In [153]:
y_pred=Model.predict(X_test)

In [154]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [155]:
b=confusion_matrix(Y_test,y_pred)

array([[8615,    0],
       [2782,    0]], dtype=int64)

In [156]:
Accuracy=(b[0,0]+b[1,1])/(b[0,0]+b[0,1]+b[1,0]+b[1,1])

In [157]:
Accuracy

2760.0

## Dividing the Dataset into 80:20 ratio

In [158]:
Y=census[['Yearly Income']]
X=census[['age',' workclass',' education']]

In [159]:
X[' workclass']=le.fit_transform(X[' workclass'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[' workclass']=le.fit_transform(X[' workclass'])


In [160]:
X[' education']=le.fit_transform(X[' education'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[' education']=le.fit_transform(X[' education'])


In [161]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3)

In [162]:
Model=LogisticRegression()

In [163]:
Model.fit(X_train,Y_train)

  return f(*args, **kwargs)


LogisticRegression()

In [164]:
y_pred=Model.predict(X_test)

In [165]:
c0=confusion_matrix(Y_test,y_pred)

array([[7229,  201],
       [2272,   67]], dtype=int64)

In [166]:
Accuracy_Score=(c0[0,0]+c[1,1])/(c[0,0]+c[1,1]+c[1,0]+c[0,1])

In [167]:
Accuracy_Score

0.743678984542942

# Decision Tree

In [168]:
Y=census[['Yearly Income']]
X=census.iloc[:,:-1]

In [169]:
X

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,1,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,4,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,6,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,6,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,10,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,13,Wife,White,Female,0,0,38,United-States
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,7,Husband,White,Male,0,0,40,United-States
32558,58,Private,151910,HS-grad,9,Widowed,1,Unmarried,White,Female,0,0,40,United-States
32559,22,Private,201490,HS-grad,9,Never-married,1,Own-child,White,Male,0,0,20,United-States


In [170]:
X[' workclass']=le.fit_transform(X[' workclass'])

In [171]:
X[' education']=le.fit_transform(X[' education'])

In [172]:
X[' marital-status']=le.fit_transform(X[' marital-status'])

In [173]:
X[' relationship']=le.fit_transform(X[' relationship'])

In [174]:
X[' race']=le.fit_transform(X[' race'])

In [175]:
X[' sex']=le.fit_transform(X[' sex'])

In [176]:
X[' native-country']=le.fit_transform(X[' native-country'])

In [177]:
Model=DecisionTreeClassifier()

In [178]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.3)

In [179]:
Model.fit(X_train,Y_train)

DecisionTreeClassifier()

In [180]:
y_pred=Model.predict(X_test)

In [181]:
c1=confusion_matrix(Y_test,y_pred)

array([[6495,  968],
       [ 867, 1439]], dtype=int64)

In [182]:
Accuracy_score=(c1[0,0]+c1[1,1])/(c[0,0]+c[0,1]+c[1,0]+c[1,1])

In [183]:
Accuracy_score

0.8056095813286928

# Random Forest

In [184]:
Y=census[['Yearly Income']]

In [185]:
X=census.iloc[:,:-1]

In [186]:
X[' workclass']=le.fit_transform(X[' workclass'])

In [187]:
X[' education']=le.fit_transform(X[' education'])

In [188]:
X[' marital-status']=le.fit_transform(X[' marital-status'])

In [189]:
X[' relationship']=le.fit_transform(X[' relationship'])

In [190]:
X[' race']=le.fit_transform(X[' race'])

In [191]:
X[' sex']=le.fit_transform(X[' sex'])


In [192]:
X[' native-country']=le.fit_transform(X[' native-country'])

In [193]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2)

In [194]:
Model=RandomForestClassifier(n_estimators=300)

In [195]:
Model.fit(X_train,Y_train)

  Model.fit(X_train,Y_train)


RandomForestClassifier(n_estimators=300)

In [196]:
y_pred=Model.predict(X_test)

In [202]:
c=confusion_matrix(Y_test,y_pred)

In [204]:
c

array([[4577,  361],
       [ 586,  989]], dtype=int64)

In [205]:
Accuracy_score=(c[0,0]+c[1,1])/(c[0,0]+c[0,1]+c[1,0]+c[1,1])

In [206]:
Accuracy_score

0.8545984953170582