<a href="https://colab.research.google.com/github/Esandu-Meth-Obadaarachchi/Predicting-income-of-employees/blob/main/machine_learning_cw.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import requests
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats



# Importing the dataset

In [3]:
# URL of the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"

# Send a GET request to the URL
response = requests.get(url)

# Write the content of the response to a local file
with open("adult.data", "wb") as f:
    f.write(response.content)

# Load the dataset into a pandas DataFrame with specified column names
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
                'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
                'hours-per-week', 'native-country', 'income']
df = pd.read_csv("adult.data", names=column_names, na_values=' ?')
df2 = df

# CHECKING WHETHER THERE ARE MISSING STUFF

In [4]:
nan_count = np.sum(df.isnull(), axis = 0)
print(nan_count)

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     583
income               0
dtype: int64


In [5]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [6]:
workclass_mode = (df['workclass'].mode()).iloc[0]
occupation_mode = df['occupation'].mode().iloc[0]
nativecountry_mode = df['native-country'].mode().iloc[0]

In [7]:
df['workclass'].fillna(value=workclass_mode, inplace=True)
df['occupation'].fillna(value=occupation_mode, inplace=True)
df['native-country'].fillna(value=nativecountry_mode, inplace=True)

In [8]:
nan_count = np.sum(df.isnull(), axis = 0)
print(nan_count)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
income            0
dtype: int64


# Converting >= and <= 50k into true and false

In [9]:
df['income'] = df['income'].replace(to_replace='<=50K', value=False)
df['income']= df['income'].replace(to_replace='>50K', value=True)

In [10]:
df.income

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: income, Length: 32561, dtype: object

In [11]:
df2 = df.income
df2

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: income, Length: 32561, dtype: object

# One hot encoding

In [12]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
income            object
dtype: object

In [13]:
to_encode = df.select_dtypes(include=['object']).columns
to_encode

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country', 'income'],
      dtype='object')

In [14]:
df[to_encode].nunique()

workclass          8
education         16
marital-status     7
occupation        14
relationship       6
race               5
sex                2
native-country    41
income             2
dtype: int64

In [15]:
##Workclass
top_10_WC = df['workclass'].value_counts()[:10].sort_values(ascending=False).index
top_10_WC

##education
top_10_E = df['education'].value_counts()[:10].sort_values(ascending=False).index
top_10_E

##occupation
top_10_O = df['occupation'].value_counts()[:10].sort_values(ascending=False).index
top_10_O

##native-country
top_10_NC = df['native-country'].value_counts()[:10].sort_values(ascending=False).index
top_10_NC

Index([' United-States', ' Mexico', ' Philippines', ' Germany', ' Canada',
       ' Puerto-Rico', ' El-Salvador', ' India', ' Cuba', ' England'],
      dtype='object')

In [16]:
##Workclass
for val in top_10_WC :
    df["Workclass"+"_"+ val] = np.where((df['workclass'] == val), 1, 0)

##education
for val in top_10_E :
    df["Education"+"_"+ val] = np.where((df['education'] == val), 1, 0)


##occupation
for val in top_10_O :
    df["Occupation"+"_"+ val] = np.where((df['occupation'] == val), 1, 0)


##native-country
for val in top_10_NC :
    df["NativeCountry"+"_"+ val] = np.where((df['native-country'] == val), 1, 0)
df

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,...,NativeCountry_ United-States,NativeCountry_ Mexico,NativeCountry_ Philippines,NativeCountry_ Germany,NativeCountry_ Canada,NativeCountry_ Puerto-Rico,NativeCountry_ El-Salvador,NativeCountry_ India,NativeCountry_ Cuba,NativeCountry_ England
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,...,1,0,0,0,0,0,0,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,...,1,0,0,0,0,0,0,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,...,1,0,0,0,0,0,0,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,...,1,0,0,0,0,0,0,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,...,1,0,0,0,0,0,0,0,0,0
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,...,1,0,0,0,0,0,0,0,0,0
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,...,1,0,0,0,0,0,0,0,0,0
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,...,1,0,0,0,0,0,0,0,0,0


In [17]:
to_drop = ['workclass', 'education', 'occupation', 'native-country']
df = df.drop(columns = to_drop, axis=1)
to_encode = to_encode.drop(to_drop)

In [18]:
for name in to_encode:
    temp_df = pd.get_dummies(df[name])
    df.join(temp_df)

In [19]:
df.head()

Unnamed: 0,age,fnlwgt,education-num,marital-status,relationship,race,sex,capital-gain,capital-loss,hours-per-week,...,NativeCountry_ United-States,NativeCountry_ Mexico,NativeCountry_ Philippines,NativeCountry_ Germany,NativeCountry_ Canada,NativeCountry_ Puerto-Rico,NativeCountry_ El-Salvador,NativeCountry_ India,NativeCountry_ Cuba,NativeCountry_ England
0,39,77516,13,Never-married,Not-in-family,White,Male,2174,0,40,...,1,0,0,0,0,0,0,0,0,0
1,50,83311,13,Married-civ-spouse,Husband,White,Male,0,0,13,...,1,0,0,0,0,0,0,0,0,0
2,38,215646,9,Divorced,Not-in-family,White,Male,0,0,40,...,1,0,0,0,0,0,0,0,0,0
3,53,234721,7,Married-civ-spouse,Husband,Black,Male,0,0,40,...,1,0,0,0,0,0,0,0,0,0
4,28,338409,13,Married-civ-spouse,Wife,Black,Female,0,0,40,...,0,0,0,0,0,0,0,0,1,0


In [20]:
df = df.drop(columns = to_encode, axis=1)

In [21]:
df.columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week', 'Workclass_ Private', 'Workclass_ Self-emp-not-inc',
       'Workclass_ Local-gov', 'Workclass_ State-gov',
       'Workclass_ Self-emp-inc', 'Workclass_ Federal-gov',
       'Workclass_ Without-pay', 'Workclass_ Never-worked',
       'Education_ HS-grad', 'Education_ Some-college', 'Education_ Bachelors',
       'Education_ Masters', 'Education_ Assoc-voc', 'Education_ 11th',
       'Education_ Assoc-acdm', 'Education_ 10th', 'Education_ 7th-8th',
       'Education_ Prof-school', 'Occupation_ Prof-specialty',
       'Occupation_ Craft-repair', 'Occupation_ Exec-managerial',
       'Occupation_ Adm-clerical', 'Occupation_ Sales',
       'Occupation_ Other-service', 'Occupation_ Machine-op-inspct',
       'Occupation_ Transport-moving', 'Occupation_ Handlers-cleaners',
       'Occupation_ Farming-fishing', 'NativeCountry_ United-States',
       'NativeCountry_ Mexico', 'NativeCountry_ Phili

In [22]:
df.dtypes

age                              int64
fnlwgt                           int64
education-num                    int64
capital-gain                     int64
capital-loss                     int64
hours-per-week                   int64
Workclass_ Private               int64
Workclass_ Self-emp-not-inc      int64
Workclass_ Local-gov             int64
Workclass_ State-gov             int64
Workclass_ Self-emp-inc          int64
Workclass_ Federal-gov           int64
Workclass_ Without-pay           int64
Workclass_ Never-worked          int64
Education_ HS-grad               int64
Education_ Some-college          int64
Education_ Bachelors             int64
Education_ Masters               int64
Education_ Assoc-voc             int64
Education_ 11th                  int64
Education_ Assoc-acdm            int64
Education_ 10th                  int64
Education_ 7th-8th               int64
Education_ Prof-school           int64
Occupation_ Prof-specialty       int64
Occupation_ Craft-repair 

In [23]:
drop_col = ['fnlwgt', 'capital-gain', 'capital-loss']

In [24]:
df = df.drop(columns=drop_col, axis=1)

# Implementation of model

In [25]:
y = df2
X = df

In [26]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Note that k=5 is specifying that we want the top 5 features
selector = SelectKBest(f_classif, k=7)
selector.fit(X, y)
filter = selector.get_support()
top_5_features = X.columns[filter]

print("Best 5 features:")
print(top_5_features)

Best 5 features:
Index(['age', 'education-num', 'hours-per-week', 'Education_ Bachelors',
       'Education_ Masters', 'Occupation_ Exec-managerial',
       'Occupation_ Other-service'],
      dtype='object')


In [27]:
import seaborn as sns
import scipy.stats as stats

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1234)

In [29]:
grid_space={'max_depth':[3,5,10],
              'n_estimators':[10,100,200],
           }

In [30]:
from sklearn.model_selection import GridSearchCV


print('Running Grid Search...')
rf = RandomForestClassifier()


grid = GridSearchCV(rf,param_grid=grid_space,cv=3,scoring='accuracy')
model_grid = grid.fit(X_train,y_train)

print('Done')

Running Grid Search...
Done


In [31]:
print('Best hyperparameters are: '+str(model_grid.best_params_))
print('Best score is: '+str(model_grid.best_score_))
best_max_depth_rf = model_grid.best_estimator_.max_depth
best_max_depth_rf
best_estimators_rf = model_grid.best_estimator_.n_estimators
best_estimators_rf

Best hyperparameters are: {'max_depth': 10, 'n_estimators': 100}
Best score is: 0.8069937370737486


100

In [33]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import GaussianNB
# 1. Instantiate the machine learning models
naive_bayes_model = GaussianNB()


# 2. Train the machine learning models
naive_bayes_model.fit(X_train, y_train)

# 3. Predict on the testing set
naive_bayes_pred = naive_bayes_model.predict(X_test)

# 4. Evaluate the performance of the models
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_pred)

print("Naive Bayes Model Accuracy:", naive_bayes_accuracy)

# You can also print classification reports for more detailed evaluation
print("Naive Bayes Model Classification Report:")
print(classification_report(y_test, naive_bayes_pred))

Naive Bayes Model Accuracy: 0.4748694851059474
Naive Bayes Model Classification Report:
              precision    recall  f1-score   support

       <=50K       0.93      0.34      0.50      7469
        >50K       0.30      0.91      0.45      2300

    accuracy                           0.47      9769
   macro avg       0.61      0.63      0.47      9769
weighted avg       0.78      0.47      0.49      9769



In [34]:
from sklearn.naive_bayes import GaussianNB

# Initialize and fit the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

# Predict probabilities and class labels
nb_predictions = list(nb_model.predict_proba(X_test)[:,1])
nb_class_label_predictions = nb_model.predict(X_test)

# Calculate accuracy score
nb_acc_score = accuracy_score(y_test, nb_class_label_predictions)

print('Naive Bayes Accuracy Score:', nb_acc_score)


Naive Bayes Accuracy Score: 0.4748694851059474


# Handling imbalanced data


In [36]:
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE technique
smote = SMOTE(random_state=42)

# Apply SMOTE to the training data
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train1, X_test1, y_train1, y_test1 = train_test_split(X_resampled, y_resampled, test_size=0.30, random_state=1234)
# Initialize and fit the Naive Bayes model
nb_model = GaussianNB()
nb_model.fit(X_train1, y_train1)

# Predict probabilities and class labels
nb_predictions = list(nb_model.predict_proba(X_test1)[:,1])
nb_class_label_predictions = nb_model.predict(X_test1)

# Calculate accuracy score
nb_acc_score = accuracy_score(y_test1, nb_class_label_predictions)

print('Naive Bayes Accuracy Score:', nb_acc_score)

Naive Bayes Accuracy Score: 0.6577669902912622


In [37]:
from sklearn.naive_bayes import MultinomialNB, ComplementNB

# Instantiate Multinomial Naive Bayes
multinomial_nb = MultinomialNB()

# Instantiate Complement Naive Bayes
complement_nb = ComplementNB()

multinomial_nb.fit(X_train1, y_train1)

# Predict probabilities and class labels
nb_predictions = list(multinomial_nb.predict_proba(X_test1)[:,1])
nb_class_label_predictions = multinomial_nb.predict(X_test1)

# Calculate accuracy score
nb_acc_score = accuracy_score(y_test1, nb_class_label_predictions)

print('Naive Bayes multinomial_nb Accuracy Score:', nb_acc_score)


Naive Bayes multinomial_nb Accuracy Score: 0.7463592233009708


In [38]:
complement_nb.fit(X_train1, y_train1)

# Predict probabilities and class labels
nb_predictions = list(complement_nb.predict_proba(X_test1)[:,1])
nb_class_label_predictions = complement_nb.predict(X_test1)

# Calculate accuracy score
nb_acc_score = accuracy_score(y_test1, nb_class_label_predictions)

print('Naive Bayes complement_nb Accuracy Score:', nb_acc_score)

Naive Bayes complement_nb Accuracy Score: 0.7467637540453075
