In [1]:
# import the relevant packages
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Preprocessing the data

### Training dataset

In [2]:
# load the dataset
train_data = pd.read_csv("C:/Users/aleksandar.dimitrov/Desktop/Census_income_train.csv")

In [3]:
# inspect the dataset
train_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
len(train_data)

32560

In [5]:
# No null or NaN values
train_data.isnull().sum()

Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

#### Removing rows with unknown values ('?')

In [6]:
# All missing or unknow values, however, are marked with a question mark (?)
# There are 3 columns which contain '?' - Workclass, Occupation, Native-country

In [7]:
# Let's start with the Workclass column
# We can obtain a list of boolean values indicating whether there is a '?' on the current row
train_data["Workclass"].str.contains("\?")

0        False
1        False
2        False
3        False
4        False
         ...  
32555    False
32556    False
32557    False
32558    False
32559    False
Name: Workclass, Length: 32560, dtype: bool

In [8]:
# Let's reverse all the boolean values
train_data["Workclass"].str.contains("\?") == False

0        True
1        True
2        True
3        True
4        True
         ... 
32555    True
32556    True
32557    True
32558    True
32559    True
Name: Workclass, Length: 32560, dtype: bool

In [9]:
# Take the subset of the dataframe rows which don't contain '?'
clean_train_data = train_data[train_data["Workclass"].str.contains("\?") == False]

In [10]:
len(clean_train_data)

30724

In [11]:
# Let's do the same for 'Occupation'
clean_train_data = clean_train_data[clean_train_data["Occupation"].str.contains("\?") == False]

In [12]:
len(clean_train_data)

30717

In [13]:
# And for 'Native-country'
clean_train_data = clean_train_data[clean_train_data["Native-country"].str.contains("\?") == False]

In [14]:
len(clean_train_data)

30161

In [15]:
# Finally, let's reset the index
clean_train_data = clean_train_data.reset_index(drop=True)

#### Creating dummy variables and separating inputs and targets

In [16]:
# In the original data, there are both categorical and numerical data
# Decision trees and random forest can work with categorical data in general
# However, this is not implemented in sklearn
# So, we need to convert the categorical data to numerical
# We will do that with one hot encoding

In [17]:
# Pandas can automatically do that for us with '.get_dummies'
train_dummies = pd.get_dummies(clean_train_data, drop_first=False)

In [18]:
train_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ <=50K,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [19]:
# The last 2 columns are whether the income <= 50k and whether it is >50k
# Both of these carry the same information, so we will remove one of them
train_dummies = train_dummies.drop(['Income_ <=50K'],axis=1)

In [20]:
train_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# The input features are everything besides the last column
train_input = train_dummies.iloc[:,:-1]

# The target/output is just the last column
train_target = train_dummies.iloc[:,-1]

In [22]:
train_input.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Portugal,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
train_target.head()

0    0
1    0
2    0
3    0
4    0
Name: Income_ >50K, dtype: uint8

### Test dataset

In [24]:
# Let's do the same preprocessing on the test dataset

In [25]:
# Load test data
test_data = pd.read_csv("C:/Users/aleksandar.dimitrov/Desktop/Census_income_train.csv")

In [26]:
test_data.head()

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [27]:
len(test_data)

32560

#### Cleaning unknown ('?') values

In [28]:
clean_test_data = test_data[test_data["Workclass"].str.contains("\?") == False]

In [29]:
len(clean_test_data)

30724

In [30]:
clean_test_data = clean_test_data[clean_test_data["Occupation"].str.contains("\?") == False]

In [31]:
len(clean_test_data)

30717

In [32]:
clean_test_data = clean_test_data[clean_test_data["Native-country"].str.contains("\?") == False]

In [33]:
len(clean_test_data)

30161

In [34]:
clean_test_data = clean_test_data.reset_index(drop=True)

#### Creating dummy variables and sepratting inputs and targets

In [35]:
test_dummies = pd.get_dummies(clean_test_data, drop_first=False)

In [36]:
test_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ <=50K,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,1,0,0,1,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [37]:
print(test_dummies.columns)


Index(['Age', 'fnlwgt', 'Education-num', 'Capital-gain', 'Capital-loss',
       'Hours-per-week', 'Workclass_ Federal-gov', 'Workclass_ Local-gov',
       'Workclass_ Private', 'Workclass_ Self-emp-inc',
       ...
       'Native-country_ Scotland', 'Native-country_ South',
       'Native-country_ Taiwan', 'Native-country_ Thailand',
       'Native-country_ Trinadad&Tobago', 'Native-country_ United-States',
       'Native-country_ Vietnam', 'Native-country_ Yugoslavia',
       'Income_ <=50K', 'Income_ >50K'],
      dtype='object', length=105)


In [38]:
test_dummies = test_dummies.drop('Income_ <=50K', axis=1)


In [39]:
test_dummies.head()

Unnamed: 0,Age,fnlwgt,Education-num,Capital-gain,Capital-loss,Hours-per-week,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Private,Workclass_ Self-emp-inc,...,Native-country_ Puerto-Rico,Native-country_ Scotland,Native-country_ South,Native-country_ Taiwan,Native-country_ Thailand,Native-country_ Trinadad&Tobago,Native-country_ United-States,Native-country_ Vietnam,Native-country_ Yugoslavia,Income_ >50K
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
test_input = test_dummies.iloc[:,:-1]
test_target = test_dummies.iloc[:,-1]

In [41]:
test_target.head()

0    0
1    0
2    0
3    0
4    0
Name: Income_ >50K, dtype: uint8

## Creating and training the model

In [42]:
# Initialize the model as a random forest classifier
clf = RandomForestClassifier()

In [43]:
# Train the model
clf.fit(train_input,train_target)

## Testing the model

In [44]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [45]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22653
           1       1.00      1.00      1.00      7508

    accuracy                           1.00     30161
   macro avg       1.00      1.00      1.00     30161
weighted avg       1.00      1.00      1.00     30161



## Creating and training the model

In [46]:
# Initialize the model as a random forest classifier with 150 trees (default is 100 trees)
clf = RandomForestClassifier(n_estimators = 150)

In [47]:
# Train the model
clf.fit(train_input,train_target)

## Testing the model

In [48]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [49]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     22653
           1       1.00      1.00      1.00      7508

    accuracy                           1.00     30161
   macro avg       1.00      1.00      1.00     30161
weighted avg       1.00      1.00      1.00     30161



In [50]:
# The result is basically the same as before, so the additional trees didn't help at all

## Creating and training the model

In [51]:
# Initialize the model as a random forest classifier with pruning
clf = RandomForestClassifier(ccp_alpha = 0.0001)

In [52]:
# Train the model
clf.fit(train_input,train_target)

## Testing the model

In [53]:
# Obtain the model's predictions on the test dataset
test_pred = clf.predict(test_input)

In [54]:
# Print the metrics obtained from the real targets and our model's predictions
print(classification_report(test_target, test_pred))

              precision    recall  f1-score   support

           0       0.89      0.95      0.92     22653
           1       0.82      0.65      0.73      7508

    accuracy                           0.88     30161
   macro avg       0.86      0.80      0.82     30161
weighted avg       0.87      0.88      0.87     30161



In [55]:
# A slight increase in accuracy however it is insignificant
# This is the limit of the performance on this dataset