# Census Income

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings - consider a narrower filter.
warnings.filterwarnings("ignore")

In [None]:
# Read the census income CSV from its remote location into a DataFrame, then display it.
df=pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset1/master/census_income.csv")
df

In [None]:
# Only the last expression of a cell is rendered, so the dtypes are printed explicitly;
# as a bare expression on the first line their result was computed and then discarded.
print(df.dtypes)
# Distinct raw values of the 'Sex' column (rendered as the cell output).
df['Sex'].unique()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column labels of the loaded frame.
df.columns

In [None]:
# Per-column non-null counts and dtypes, plus memory usage.
df.info()

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Column labels again, as a reference for the plots that follow.
# NOTE(review): duplicates an earlier cell - could be removed.
df.columns

In [None]:
# Bar chart of how many rows fall in each Income class.
sns.countplot(x='Income',data=df)

In [None]:
# Row count of each Income class.
df['Income'].value_counts()

In [None]:
# Share of rows whose Income label is the ">50K" class, computed from the data.
# The previous version divided a hard-coded count (24719, the larger of the two class
# counts used in this notebook) by a hard-coded total, which reported the majority
# class under the "more than 50K" label - TODO confirm against value_counts() above.
# Matching on '>' avoids depending on the exact spacing of the raw labels.
pct_above_50k=round(df['Income'].str.contains('>',na=False).mean()*100,2)
print("%age of people having more than 50K salary: ", pct_above_50k)

In [None]:
# Share of rows whose Income label is the "<=50K" class, computed from the data.
# The previous version divided a hard-coded count (7841, the smaller of the two class
# counts used in this notebook) by a hard-coded total, which reported the minority
# class under the "less than 50K" label - TODO confirm against value_counts() above.
# Matching on '<' avoids depending on the exact spacing of the raw labels.
pct_below_50k=round(df['Income'].str.contains('<',na=False).mean()*100,2)
print("%age of people having less than 50K salary: ", pct_below_50k)

In [None]:
# Contingency table: one row per Age value, one column per Income class.
pd.crosstab(df['Age'],df['Income'])     # analysis of age and income of each individual

In [None]:
# Heatmap of the Age x Income contingency table.
sns.heatmap(pd.crosstab(df['Age'],df['Income']))

In [None]:
# Pairwise relationship plots across df, coloured by Income.
# NOTE(review): runs on the full frame, which may be slow - consider plotting a sample.
sns.pairplot(data=df,hue='Income')

In [None]:
# Clustered heatmap of the Education x Income contingency table.
sns.clustermap(pd.crosstab(df['Education'],df['Income']))

In [None]:
# Row count of each Workclass category.
df['Workclass'].value_counts()

In [None]:
# Contingency table: one row per Workclass category, one column per Income class.
pd.crosstab(df['Workclass'],df['Income'])

In [None]:
# Heatmap of the Workclass x Income contingency table.
sns.heatmap(pd.crosstab(df['Workclass'],df['Income']))

In [None]:
# Income is still a text label at this point (it is only encoded further down), and
# barplot needs one numeric variable to aggregate - with two non-numeric axes seaborn
# refuses to draw. Plot a 0/1 indicator for the ">50K" class instead, so each bar is
# the share of >50K earners for that occupation and sex.
high_income=df['Income'].str.contains('>',na=False).astype(int).rename('Share earning >50K')
sns.barplot(x=df['Occupation'],y=high_income,hue=df['Sex'])

In [None]:
# Box plots of Hours_per_week grouped by Income class, split by Sex.
sns.boxplot(x=df['Hours_per_week'],y=df['Income'],hue=df['Sex'])

Converting the categorical data to numerical data

In [None]:
# Re-check column dtypes before encoding the text columns.
df.info()

In [None]:
# Hand-written integer code for every category of each text column.
# The raw labels carry a leading space, so every key must include it: a key that does
# not match exactly makes .map() return NaN for that whole category. The Occupation key
# for ' Handlers-cleaners' was missing that space, which turned the category into NaN.
# The ' ?' placeholder category is coded as 0.
category_codes={
    'Workclass':{' Self-emp-not-inc':1, ' Private':2, ' State-gov':3, ' Federal-gov':4,' Local-gov':5, ' ?':0, ' Self-emp-inc':6, ' Without-pay':7,' Never-worked':8},
    'Education':{' Bachelors':1, ' HS-grad':2, ' 11th':3, ' Masters':4, ' 9th':5,' Some-college':6, ' Assoc-acdm':7, ' Assoc-voc':8, ' 7th-8th':9,' Doctorate':10, ' Prof-school':11, ' 5th-6th':12, ' 10th':13, ' 1st-4th':14,' Preschool':15, ' 12th':16},
    'Marital_status':{' Married-civ-spouse':1, ' Divorced':2, ' Married-spouse-absent':3,' Never-married':4, ' Separated':5, ' Married-AF-spouse':6, ' Widowed':7},
    'Occupation':{' Exec-managerial':1, ' Handlers-cleaners':2, ' Prof-specialty':3,' Other-service':4, ' Adm-clerical':5, ' Sales':6, ' Craft-repair':7,' Transport-moving':8, ' Farming-fishing':9, ' Machine-op-inspct':10,' Tech-support':11, ' ?':0, ' Protective-serv':12, ' Armed-Forces':13,' Priv-house-serv':14},
    'Relationship':{' Husband':1, ' Not-in-family':2, ' Wife':3, ' Own-child':4, ' Unmarried':5,' Other-relative':6},
    'Race':{' White':1, ' Black':2, ' Asian-Pac-Islander':3, ' Amer-Indian-Eskimo':4,' Other':5},
    'Sex':{' Male':1, ' Female':2},
}
# NOTE: run this cell once per kernel - mapping already-encoded integers again yields NaN.
for column,codes in category_codes.items():
    df[column]=df[column].map(codes)


In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance is re-fitted on each column in turn, so after the loop it only
# remembers the classes of the last column it saw.
label_encoder=LabelEncoder()
for encoded_column in ('Income','Native_country'):
    df[encoded_column]=label_encoder.fit_transform(df[encoded_column])

In [None]:
# Replace every remaining NaN with the median of its column, then display the frame.
column_medians=df.median()
df=df.fillna(column_medians)
df

# Z score

In [None]:
from scipy.stats import zscore
# Columns screened for outliers (this list includes the Income target as well).
outlier_columns=[
    'Age','Workclass','Fnlwgt','Education','Education_num',
    'Marital_status','Occupation','Relationship','Race','Sex',
    'Capital_gain','Capital_loss','Hours_per_week','Native_country',
    'Income',
]
out_features=df[outlier_columns]
# Absolute z-score of every cell, computed column by column.
z=np.abs(zscore(out_features))
z

In [None]:
# Row and column positions of every cell whose absolute z-score exceeds 3.
np.where(z>3)

In [None]:
# Keep a row only when all of its z-scores are below 3.
rows_within_limit=(z<3).all(axis=1)
df1=df[rows_within_limit]
df1.shape

In [None]:
# Row counts before and after the outlier filter.
rows_before,rows_after=len(df),len(df1)
print("old dataframe: ",rows_before)
print("new dataframe: ",rows_after)

In [None]:
# Percentage of the original rows removed by the outlier filter.
rows_removed=df.shape[0]-df1.shape[0]
print("data loss percentage:",(rows_removed/df.shape[0])*100)

In [None]:
# Display the outlier-filtered frame.
df1

In [None]:
# Pairwise correlation matrix of the filtered frame's columns.
df1.corr()

In [None]:
# Annotated heatmap of the correlation matrix on a wide canvas; values shown to 2 decimals.
plt.figure(figsize=(26,14))
sns.heatmap(df1.corr(),annot=True,fmt='0.2f')

In [None]:
# Correlation of every column with Income, sorted in ascending order.
income_corr=df1.corr()['Income']
income_corr.sort_values()

In [None]:
# Log-transform Age and Fnlwgt on df1 - the outlier-filtered frame every later cell
# plots and models. The transform used to be written to df, which nothing downstream
# reads, so it had no effect on the analysis.
# Values are taken from the untouched df and selected by df1's row labels, so re-running
# this cell never applies the log twice.
df1=df1.copy()   # df1 was produced by filtering df; copy before assigning columns
df1['Age']=np.log(df.loc[df1.index,'Age'])
df1['Fnlwgt']=np.log(df.loc[df1.index,'Fnlwgt'])

In [None]:
# Checking the skewness of the data: one distribution plot per column of df1,
# laid out on a 6x4 grid (at most 18 panels are drawn).
plt.figure(figsize=(20,25),facecolor="green")
for plotnumber,column in enumerate(df1,start=1):
    if plotnumber>18:
        break
    plt.subplot(6,4,plotnumber)
    sns.distplot(df1[column],color='b')
    plt.xlabel(column,fontsize=20)
plt.show()

# Feature Scaling using Standardisation (StandardScaler)
Scale the features before evaluating the models.

In [None]:
# Separate the predictors (x) from the Income target (y).
y=df1['Income']
x=df1.drop(columns=['Income'])
print("feature dimensions",x.shape)
print("label dimension",y.shape)

In [None]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
# fit_transform returns a bare array, so the frame is rebuilt with both the original
# column names and the original row labels. Without index=x.index the rows were
# renumbered from 0, and x no longer shared its index with y.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so test
# rows influence the scaling - consider fitting on the training split only.
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns,index=x.index)
x

In [None]:
# Variance inflation factor of every feature column, collected into a two-column table.
from statsmodels.stats.outliers_influence import variance_inflation_factor
feature_matrix=x.values
vif=pd.DataFrame({
    "VIF values":[variance_inflation_factor(feature_matrix,col_idx) for col_idx in range(feature_matrix.shape[1])],
    "Features":x.columns,
})

vif

# Modelling
First, find the random state that gives the best train/test split.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression


In [None]:
# Try seeds 1..199 for a 70/30 split and remember the seed on which a plain linear
# regression reaches the highest test-set R2 (only scores above 0 are ever recorded).
# NOTE(review): picking the seed by its test score makes the reported score optimistic.
maxacc=0
maxrs=0
for seed in range(1,200):
    x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=seed)
    lr=LinearRegression()
    lr.fit(x_train,y_train)
    acc=r2_score(y_test,lr.predict(x_test))   # R2 between predicted and actual values
    if acc>maxacc:
        maxacc,maxrs=acc,seed
print("Maximum score is",maxacc,"on Random state",maxrs)

In [None]:
# Re-create the split the search above actually selected: its best seed (maxrs) with the
# same 30% test share. The previous hard-coded random_state=51 with test_size=.25 gave a
# partition that the search never scored.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.30,random_state=maxrs)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor as KNN
from sklearn.linear_model import Lasso,Ridge

In [None]:
# Ordinary least-squares baseline, scored on the held-out and the training split.
LR=LinearRegression()
LR.fit(x_train,y_train)
pred_train=LR.predict(x_train)
pred_LR=LR.predict(x_test)
mse_LR=mean_squared_error(y_test,pred_LR)
print('R2 score: ',r2_score(y_test,pred_LR))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_LR))
print('Mean Squared Error',mse_LR)
print('Root mean squared Error',np.sqrt(mse_LR))

In [None]:
# Random forest with default settings, scored on the held-out and the training split.
RFR=RandomForestRegressor()
RFR.fit(x_train,y_train)
pred_train=RFR.predict(x_train)
pred_RFR=RFR.predict(x_test)
mse_RFR=mean_squared_error(y_test,pred_RFR)
print('R2 score: ',r2_score(y_test,pred_RFR))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_RFR))
print('Mean Squared Error',mse_RFR)
print('Root mean squared Error',np.sqrt(mse_RFR))

In [None]:
# k-nearest-neighbours regressor with default settings, scored on both splits.
knn=KNN()
knn.fit(x_train,y_train)
pred_train=knn.predict(x_train)
pred_knn=knn.predict(x_test)
mse_knn=mean_squared_error(y_test,pred_knn)
print('R2 score: ',r2_score(y_test,pred_knn))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_knn))
print('Mean Squared Error',mse_knn)
print('Root mean squared Error',np.sqrt(mse_knn))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient boosting with default settings, scored on the held-out and the training split.
GBR=GradientBoostingRegressor()
GBR.fit(x_train,y_train)
pred_train=GBR.predict(x_train)
pred_GBR=GBR.predict(x_test)
mse_GBR=mean_squared_error(y_test,pred_GBR)
print('R2 score: ',r2_score(y_test,pred_GBR))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_GBR))
print('Mean Squared Error',mse_GBR)
print('Root mean squared Error',np.sqrt(mse_GBR))

In [None]:
# Lasso regression with default settings, scored on both splits.
ls=Lasso()
ls.fit(x_train,y_train)
pred_train=ls.predict(x_train)
pred_ls=ls.predict(x_test)
mse_ls=mean_squared_error(y_test,pred_ls)
print('R2 score: ',r2_score(y_test,pred_ls))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_ls))
print('Mean Squared Error',mse_ls)
print('Root mean squared Error',np.sqrt(mse_ls))

In [None]:
# Ridge regression with default settings, scored on both splits.
rd=Ridge()
rd.fit(x_train,y_train)
pred_train=rd.predict(x_train)
pred_rd=rd.predict(x_test)
mse_rd=mean_squared_error(y_test,pred_rd)
print('R2 score: ',r2_score(y_test,pred_rd))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_rd))
print('Mean Squared Error',mse_rd)
print('Root mean squared Error',np.sqrt(mse_rd))

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Single decision tree with default settings, scored on both splits.
dtr=DecisionTreeRegressor()
dtr.fit(x_train,y_train)
pred_train=dtr.predict(x_train)
pred_dtr=dtr.predict(x_test)
mse_dtr=mean_squared_error(y_test,pred_dtr)
print('R2 score: ',r2_score(y_test,pred_dtr))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_dtr))
print('Mean Squared Error',mse_dtr)
print('Root mean squared Error',np.sqrt(mse_dtr))

In [None]:
from sklearn.svm import SVR

# Support vector regressor with default settings, scored on both splits.
svr=SVR()
svr.fit(x_train,y_train)
# Test predictions must come from the SVR itself; they were previously taken from the
# decision tree (dtr), so every test metric in this cell described the wrong model.
pred_svr=svr.predict(x_test)
pred_train=svr.predict(x_train)
print('R2 score: ',r2_score(y_test,pred_svr))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_svr))
print('Mean Squared Error',mean_squared_error(y_test,pred_svr))
# RMSE is now computed from this model's predictions rather than pred_dtr.
print('Root mean squared Error',np.sqrt(mean_squared_error(y_test,pred_svr)))

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Extra-trees ensemble with default settings, scored on both splits.
etr=ExtraTreesRegressor()
etr.fit(x_train,y_train)
pred_etr=etr.predict(x_test)
pred_train=etr.predict(x_train)
print('R2 score: ',r2_score(y_test,pred_etr))
print('R2 score on training data: ',r2_score(y_train,pred_train)*100)   # shown as a percentage
print('Mean Absolute Error',mean_absolute_error(y_test,pred_etr))
print('Mean Squared Error',mean_squared_error(y_test,pred_etr))
# RMSE is now computed from this model's predictions; it previously reused the
# decision tree's pred_dtr and so disagreed with the MSE printed just above.
print('Root mean squared Error',np.sqrt(mean_squared_error(y_test,pred_etr)))

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

# Hyper-parameters to tune. The random seed is not a hyper-parameter, so it is no longer
# part of the grid: searching over it multiplied the number of fits by four and picked
# whichever seed happened to score best. It is fixed on the estimator instead, which
# also makes the search reproducible.
param = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# 5-fold cross-validated search; n_jobs=-1 spreads the fits over all available cores.
gscv = GridSearchCV(ExtraTreesRegressor(random_state=1), param, cv=5, n_jobs=-1)
gscv.fit(x_train, y_train)

In [None]:
# Parameter combination with the best cross-validated score in the grid search.
gscv.best_params_

In [None]:
# Final extra-trees model with hand-picked settings.
# max_features=1.0 (use all features) replaces the old 'auto' alias, which recent
# scikit-learn releases no longer accept; for this regressor the two are equivalent.
# NOTE(review): these values are typed in by hand rather than read from
# gscv.best_params_, and n_estimators is left at its default - confirm they match the search.
Model =ExtraTreesRegressor(max_depth=10,
                               min_samples_split=2,  
                               min_samples_leaf=1,
                               max_features=1.0, 
                               random_state=1) 

In [None]:
# Fit the configured extra-trees model and report its errors on the held-out split.
Model.fit(x_train,y_train)
pred=Model.predict(x_test)
final_mse=mean_squared_error(y_test,pred)
print('r2_score: ',r2_score(y_test,pred))
print('Mean Absolute error:',mean_absolute_error(y_test,pred))
print('Mean Squared error:',final_mse)
print('Root Mean Squared Error: ',np.sqrt(final_mse))