In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 

# Use seaborn's 'darkgrid' style for all subsequent plots.
sns.set_style('darkgrid')

In [None]:
## Load the income dataset from a CSV file (path is relative to the working directory)

df = pd.read_csv('income_evaluation.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
## Count missing (null/NaN) values in each column.
## Placeholder strings such as ' ?' are ordinary values and are not counted here.

df.isnull().sum()

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Assign consistent snake_case names to the 15 columns, in file order.
# NOTE(review): 'martial_status' is presumably a typo for 'marital_status';
# the spelling is kept because later cells access the column by this name.
df.columns = [
    'age',
    'workclass',
    'final_weight',
    'education',
    'education_num',
    'martial_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital_gain',
    'capital_loss',
    'hrs_per_week',
    'native_country',
    'income',
]

In [None]:
# Confirm the renamed column labels.
df.columns

In [None]:
# Distinct raw income labels, inspected before the binary encoding below.
df.income.unique()

In [None]:
## Convert the income column to 0/1 {0 for salary <=50K and 1 for salary >50K}.
## The guard makes the cell safe to re-run: once the column is numeric it is left
## untouched instead of being silently overwritten with all zeros.
## Surrounding whitespace is stripped so both ' >50K' and '>50K' are recognised.
if not pd.api.types.is_numeric_dtype(df['income']):
    df['income'] = (df['income'].str.strip() == '>50K').astype('int64')

In [None]:
## Show the last 5 rows of the dataset

df.tail()

In [None]:
# Distinct workclass values; useful for spotting placeholders such as ' ?'.
df.workclass.unique()

In [None]:
## Mark the ' ?' placeholder in workclass as missing (NaN); other values are kept as-is
df['workclass'] = df['workclass'].mask(df['workclass'] == ' ?', np.nan)

In [None]:
# Drop, in place, every row that contains at least one missing value
# (this includes the rows whose workclass placeholder was just set to NaN).
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Map each distinct workclass value to an integer code, numbered in order of first appearance.
workclass_label = {
    label: code for code, label in enumerate(df['workclass'].unique())
}

In [None]:
# Display the workclass -> integer code mapping.
workclass_label

In [None]:
# Replace workclass strings with their integer codes.
df['workclass'] = df['workclass'].map(workclass_label)

In [None]:
# Distinct education values present in the data.
df.education.unique()

In [None]:
# Map each distinct education value to an integer code, numbered in order of first appearance.
education_label = {
    label: code for code, label in enumerate(df['education'].unique())
}

In [None]:
# Display the education -> integer code mapping.
education_label

In [None]:
# Replace education strings with their integer codes.
df['education'] = df['education'].map(education_label)

In [None]:
# Count of rows per income class, split by sex.
fig, ax = plt.subplots(figsize=(10, 4))
sns.countplot(data=df, x='income', hue='sex', ax=ax);

In [None]:
# Five most frequent native_country values and their row counts.
native_country = df.native_country.value_counts().nlargest(5)

plt.figure(figsize=(10, 5))
# Pass x and y by keyword: recent seaborn releases no longer accept these
# vectors positionally. Counts are plotted on a natural-log scale.
sns.barplot(x=native_country.index, y=np.log(native_country))
# Rotate the tick labels after plotting so the rotation is applied to the
# category labels that barplot has just drawn.
plt.xticks(rotation=75);

In [None]:
# Mark the ' ?' placeholder in native_country as missing (NaN); other values are kept as-is.
df['native_country'] = df['native_country'].mask(df['native_country'] == ' ?', np.nan)

In [None]:
# Drop, in place, every row that contains at least one missing value
# (this includes the rows whose native_country placeholder was just set to NaN).
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Map each distinct native_country value to an integer code, numbered in order of first appearance.
# Note: this rebinds `native_country`, which earlier held the top-5 value counts.
native_country = {
    label: code for code, label in enumerate(df['native_country'].unique())
}

In [None]:
# Replace native_country strings with their integer codes.
df['native_country'] = df['native_country'].map(native_country)

In [None]:
# Preview the first rows after encoding native_country.
df.head()

In [None]:
# Distinct values of the martial_status column.
df.martial_status.unique()

In [None]:
# Map each distinct martial_status value to an integer code, numbered in order of first appearance.
marital_label = {
    label: code for code, label in enumerate(df['martial_status'].unique())
}

In [None]:
# Replace martial_status strings with their integer codes.
df['martial_status'] = df['martial_status'].map(marital_label)

In [None]:
# Distinct occupation values; useful for spotting placeholders such as ' ?'.
df.occupation.unique()

In [None]:
# Mark the ' ?' placeholder in occupation as missing (NaN); other values are kept as-is.
df['occupation'] = df['occupation'].mask(df['occupation'] == ' ?', np.nan)

In [None]:
# Drop, in place, every row that contains at least one missing value
# (this includes the rows whose occupation placeholder was just set to NaN).
df.dropna(axis='index', how='any', inplace=True)

In [None]:
# Map each distinct occupation value to an integer code, numbered in order of first appearance.
occ_label = {
    label: code for code, label in enumerate(df['occupation'].unique())
}

In [None]:
# Replace occupation strings with their integer codes.
df['occupation'] = df['occupation'].map(occ_label)

In [None]:
# Distinct relationship values present in the data.
df.relationship.unique()

In [None]:
# Map each distinct relationship value to an integer code, numbered in order of first appearance.
relationship_label = {
    label: code for code, label in enumerate(df['relationship'].unique())
}

In [None]:
# Replace relationship strings with their integer codes.
df['relationship'] = df['relationship'].map(relationship_label)

In [None]:
# Preview the first rows after encoding relationship.
df.head()

In [None]:
# Encode sex as a binary indicator: 1 for 'Male', 0 for anything else.
# The guard makes the cell safe to re-run: once the column is numeric it is
# left untouched instead of being silently overwritten with all zeros.
# Surrounding whitespace is stripped so both ' Male' and 'Male' are recognised.
if not pd.api.types.is_numeric_dtype(df['sex']):
    df['sex'] = np.where(df['sex'].str.strip() == 'Male', 1, 0)

In [None]:
# Distinct race values present in the data.
df.race.unique()

In [None]:
# Map each distinct race value to an integer code, numbered in order of first appearance.
race_label = {
    label: code for code, label in enumerate(df['race'].unique())
}

In [None]:
# Display the race -> integer code mapping.
race_label

In [None]:
# Replace race strings with their integer codes.
df['race'] = df['race'].map(race_label)

In [None]:
# Preview the first rows after encoding race.
df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the columns.
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(data=df.corr(), annot=True, ax=ax);

In [None]:
# Preview the first rows of the frame again.
df.head()

In [None]:
# One box plot per feature, each on its own figure, to inspect spread and outliers.
for feature in ['final_weight', 'capital_gain']:
    plt.figure(figsize=(10, 5))
    # Pass the values as x= explicitly: the first positional parameter of
    # boxplot is `data` in recent seaborn releases, so a positional vector is
    # no longer interpreted as the x variable.
    sns.boxplot(x=df[feature])

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Features are every column except the last; the last column is the target
# (income, given the column order assigned earlier in the notebook).
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

In [None]:
# Convert to NumPy arrays for scikit-learn.
X = X.values
y = y.values

# Standardise every feature to zero mean and unit variance.
# fit_transform returns a new array rather than modifying X, so the result
# must be assigned back; otherwise the model is trained on unscaled data.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling. Consider fitting on the
# training split only.
sc = StandardScaler()
X = sc.fit_transform(X)

# Show the first few scaled rows as a sanity check.
X[:5]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier created with scikit-learn's default hyperparameters.
lg = LogisticRegression()

In [None]:
# Fit the classifier on the training split.
lg.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split.
pred = lg.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Per-class precision, recall and F1 on the test split.
print("--------------------------------------------Classification Report---------------------------------------------")
print(classification_report(y_test, pred))

# Overall fraction of correctly classified test rows.
print("------------------------------------------Accuracy Score---------------------------------------------------------")
print(accuracy_score(y_test, pred))

# Confusion matrix as a heatmap (rows: true class, columns: predicted class).
# fmt='d' prints the cell counts as plain integers instead of the default
# two-significant-digit format, which abbreviates large counts.
print("--------------------------------------------Confusion Matrix-----------------------------------------------------")
plt.figure(figsize=(10,5))
sns.heatmap(confusion_matrix(y_test, pred), annot=True, fmt='d');