**<h1 style="color:yellow">Is the salary more or less than 50K?</h1>**
Ref Link: https://www.kaggle.com/uciml/adult-census-income
1. Explore the data
2. The Adult dataset is from the Census Bureau, and the task is to predict whether a given adult makes more than $50,000 a year based on attributes such as education, hours of work per week, etc.

**<h2 style="color:cyan">Import libraries</h2>**


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


**<h2 style="color:white">Read the data</h2>**


In [None]:
# Read the CSV from disk once. `df_original` keeps the untouched rows, while
# `df` is the working copy used by the rest of the notebook.
df_original = pd.read_csv('adult.csv')
df = df_original.copy()
df.head()

**<h2 style="color:green">Data Cleaning</h2>**


In [None]:
# Turn every "?" cell into NaN so pandas treats it as a missing value.
missing_marker = "?"
df = df.replace(to_replace=missing_marker, value=np.nan)
df.head()


In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis="index", how="any")
df.head()

**<h2 style="color:orange">Visualization</h2>**


In [None]:
# Number of people per sex, split by income bracket.
# The bar order is pinned explicitly instead of relabelling the ticks by hand:
# overwriting tick labels silently mislabels the bars whenever seaborn's
# category order differs from the hard-coded one.
# NOTE(review): assumes the `sex` column holds exactly 'Male' and 'Female' — confirm.
sex_order = ['Male', 'Female']

plt.figure(figsize=(12,4))
ax = sns.countplot(data=df, x='sex', hue='income', order=sex_order, palette='rocket')

plt.xlabel("Sex", fontsize=12)
plt.ylabel("# of People", fontsize=12)
plt.ylim(0, 20000)
plt.xticks(fontsize=11)

# Write the count above each bar; skip empty or NaN-height patches so no
# stray "0"/"nan" labels are drawn.
for p in ax.patches:
    height = p.get_height()
    if not height > 0:
        continue
    ax.annotate(f"{int(height)}", (p.get_x() + 0.16, height + 1000))

plt.show()

In [None]:
# Share of people at each education level (value_counts sorts most common first).
edu = df["education"].value_counts(normalize=True)

# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=edu.values, y=edu.index, palette='mako')
plt.title('Education')
# normalize=True yields proportions, not raw counts, so label the axis accordingly.
plt.xlabel('Proportion of people')
plt.ylabel('Education level')
plt.tick_params(labelsize=12)
plt.show()

In [None]:
# Distribution of weekly working hours: density histogram with a KDE overlay.
plt.figure(figsize=(15,5))

# `sns.distplot` is deprecated; this is the histplot form seaborn documents
# as its replacement (density-normalised bars plus a KDE curve).
sns.histplot(df['hours.per.week'], kde=True, stat='density', kde_kws=dict(cut=3))
plt.ticklabel_format(style='plain', axis='x')  # suppress scientific notation on x
plt.ylabel('')
plt.show()

**<h2 style="color:brown">Data Encoding</h2>**


In [None]:
# One indicator column per distinct value of `income`.
income_column = df["income"]
dummies = pd.get_dummies(income_column)
dummies

In [None]:
# Append the income indicator columns to the right of the cleaned frame.
# (pd.get_dummies accepts a `prefix=` argument if prefixed column names are wanted.)
frames_side_by_side = [df, dummies]
merged = pd.concat(frames_side_by_side, axis=1)
merged

In [None]:
# The text `income` column is now redundant with its indicator columns.
final_data = merged.drop(columns=['income'])
final_data

<h2 style="color:yellow">Selecting the features</h2>


In [None]:
# Keep only the '>50K' indicator; '<=50K' is its complement.
final = final_data.drop(columns=['<=50K'])
final

In [None]:
# Columns that are not used as model inputs.
unused_columns = [
    "fnlwgt",
    "education",
    "marital.status",
    "relationship",
    "capital.loss",
    "native.country",
]
final = final.drop(columns=unused_columns)
final

**<h2 style="color:brown">Label Encoding</h2>**


In [None]:
from sklearn.preprocessing import LabelEncoder

# One independent encoder per categorical column.
le_workclass, le_race, le_sex, le_occupation = (LabelEncoder() for _ in range(4))

In [None]:
# Replace the four categorical columns with integer codes.
# Each column gets its own encoder. Previously `occupation` was fitted with
# `le_sex`, which overwrote the sex mapping and left `le_occupation` unfitted.
column_encoders = {
    'workclass': le_workclass,
    'race': le_race,
    'sex': le_sex,
    'occupation': le_occupation,
}
for column_name, encoder in column_encoders.items():
    final[column_name] = encoder.fit_transform(final[column_name])
final

<h2 style="color:yellow">Defining X (input) and y (target)</h2>

In [None]:
# Model inputs: every remaining column except the target indicator.
X = final.drop(columns=[">50K"])
X.head()

In [None]:
# Target: the '>50K' indicator column (bound to both `target` and `y`).
target = y = final['>50K']

In [None]:
# Preview the first five target values.
y.head(n=5)

**<h2 style="color:blue">Train the Model</h2>**


In [None]:
from sklearn import tree

# Fix the seed so the fitted tree, and every number derived from it, is the
# same on each Restart & Run All.
model = tree.DecisionTreeClassifier(random_state=42)

In [None]:
# Fit the decision tree on all rows of X against the '>50K' target.
model.fit(X, y)

**<h2 style="color:gray">Model Evaluation</h2>**


In [None]:
# Accuracy measured on the rows the tree was fitted on is optimistic, so it is
# shown next to a 5-fold cross-validated accuracy. cross_val_score fits clones
# of `model`, so the already-fitted `model` is left unchanged.
from sklearn.model_selection import cross_val_score

pd.Series({
    "training accuracy": model.score(X, y),
    "5-fold CV accuracy": cross_val_score(model, X, y, cv=5).mean(),
})

**<h2 style="color:purple">Make prediction</h2>**


In [None]:
# Predict for one hand-written person. The values must be given in the same
# order as X's columns. Wrapping them in a DataFrame carries the feature names
# the model was fitted with, instead of relying on a bare positional list.
sample = pd.DataFrame([[41, 2, 10, 9, 4, 0, 0, 40]], columns=X.columns)
model.predict(sample)