# 1. Import libraries

In [84]:
import time
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import ParameterGrid, GridSearchCV

# 2. Loading datasets online with adding columns name

In [3]:
# 
def load_data():
    train_set = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',
        header=None,
        sep=',\s*',
        engine='python')
    test_set = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test',
        header=None,
        sep=',\s*',
        engine='python',
        skiprows=1)
    train_set.columns = [
        'Age', 'Workclass', 'fnlwgt', 'Education', 'Education_num',
        'Marital_status', 'Occupation', 'Relationship', 'Race', 'Sex',
        'Capital_gain', 'Capital_loss', 'Hours_per_week', 'Native_country',
        'Salary'
    ]
    test_set.columns = [
        'Age', 'Workclass', 'fnlwgt', 'Education', 'Education_num',
        'Marital_status', 'Occupation', 'Relationship', 'Race', 'Sex',
        'Capital_gain', 'Capital_loss', 'Hours_per_week', 'Native_country',
        'Salary'
    ]
    return train_set, test_set


train_set, test_set = load_data()

## 2.1 Check shape of datasets

In [4]:
#print(train_set[:5])
#print('-----' * 30)
#print(test_set[:5])
print('Shape of training set is:', train_set.shape)
print('Shape of test set is:', test_set.shape)

Shape of training set is: (32561, 15)
Shape of test set is: (16281, 15)


# 3. Data preprocessing
### Here are four functions for: 
### (1). Evaluate how many missing values in both train and test set.
### (2). In test data set, there are "." at the end of salary. Delete them in order to align with training set.
### (3). Delete samples which have missing values.
### (4). Replace function, which will be used to replace string to number in features.

In [5]:
def check_missing_value(data):
    num = data.shape[0]
    for i, j in zip(data.columns,
                    (data.values.astype(str) == '?').sum(axis=0)):
        if j != 0:
            percentage = float(j / num * 100)
            print('Missing value in '+ str(i) + ': ' + '{0:.3f}%'.format(percentage))


def remove_period(data):
    data.replace(['<=50K.', '>50K.'], ['<=50K', '>50K'], inplace=True)
    return data


def remove_missing_value(data):
    data = data.replace({'?': None})
    data = data.dropna(axis=0)
    return data


def adult_replace(data, a, b):
    '''
    a: The feature(column) name
    b: The string which should be replaced by '1'
    Other string will be replaced to 0
    '''
    data[str(a)] = data[str(a)].map(lambda x:1 if x == str(b) else 0)
    return data

## 3.1 Evaluate the percentage of missing values

In [6]:
check_missing_value(train_set)
print('-----' * 10)
check_missing_value(test_set)

Missing value in Workclass: 5.639%
Missing value in Occupation: 5.660%
Missing value in Native_country: 1.790%
--------------------------------------------------
Missing value in Workclass: 5.915%
Missing value in Occupation: 5.933%
Missing value in Native_country: 1.683%


### The impact is small. 
### Thus, delete those samples with missing values and also remove '.' in salary feature.

In [7]:
train_set = remove_missing_value(remove_period(train_set))
test_set = remove_missing_value(remove_period(test_set))

### Check the shape and store number of training sample which will be used in chapter 3.3

In [8]:
num_train = train_set.shape[0]
print('Shape of training set is:', train_set.shape)
print('Shape of test set is:', test_set.shape)
print('Number of training set sample is: ', num_train)

Shape of training set is: (30162, 15)
Shape of test set is: (15060, 15)
Number of training set sample is:  30162


## 3.2 Analyze and preprocess each feature in training set and test set

#### Feature Description (From https://archive.ics.uci.edu/ml/datasets/adult):
- **age**: continuous. 
- **workclass**: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. 
- **fnlwgt**: continuous. 
- **education**: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. 
- **education-num**: continuous. 
- **marital-status**: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. 
- **occupation**: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. 
- **relationship**: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 
- **race**: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 
- **sex**: Female, Male. 
- **capital-gain**: continuous. 
- **capital-loss**: continuous. 
- **hours-per-week**: continuous. 
- **native-country**: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.

#### There is another feature "salary", which contains <=50K and >50K, will be discussed more in chapter 3.2.9
#### Firstly, processing all features that contain string values. Continuous values are in chapter 3.3

### 3.2.1 Combine two data sets to reduce the amount of operations. The new data set called 'entire_set'.

In [9]:
entire_set = pd.concat([train_set, test_set])
print('The shape of entire data set before preprocessing is: ', entire_set.shape)

The shape of entire data set before preprocessing is:  (45222, 15)


### 3.2.2 Workclass
#### Summarize values into four categories: Without_pay，Private，Gov，Self_emp
#### This feature cannot be represented by '0' and '1' and will be divided into more class, which will be explained in 3.2.10.

In [10]:
entire_set.replace(['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 'Local-gov', 'State-gov', 'Without-pay', 'Never-worked'],
                 ['Private','Self_emp','Self_emp','Gov', 'Gov','Gov','Without_pay','Without_pay'],
                 inplace = True)

### 3.2.3 Education
#### This feature may can be presented by Education_num feature. Check them by using pivot table.

In [11]:
education_Pivot = pd.pivot_table(train_set, index = 'Education', values = 'Education_num')
education_Pivot.sort_values(by=['Education_num'])

Unnamed: 0_level_0,Education_num
Education,Unnamed: 1_level_1
Preschool,1
1st-4th,2
5th-6th,3
7th-8th,4
9th,5
10th,6
11th,7
12th,8
HS-grad,9
Some-college,10


#### It is obvious that Education_num is series number of Education. Thus, the Education feature can be deleted.

In [12]:
entire_set.drop('Education', axis=1, inplace=True)

### 3.2.4 Martial-status feature
#### Summarize values into two categories: 'married' and 'no_married'. Represented by '1' and '0' respectively.
#### Note the feature 'Married-spouse-absent' is classified in 'no_married' due to definition of term.

In [13]:
entire_set.replace(['Never-married','Divorced','Separated', 'Widowed', 'Married-spouse-absent','Married-AF-spouse','Married-civ-spouse'], 
                   ['no_married','no_married','no_married','no_married', 'no_married','married','married'], inplace = True)
entire_set = adult_replace(entire_set, 'Marital_status', 'married')
# adult_replace function has been defined in chapter 3

### 3.2.5 Occupation feature
#### Summarize values into five categories:
#### 'Technology', 'Craft', 'Service', 'Management' and 'Force'.
#### This feature cannot be represented by '0' and '1' and will be divided into more class, which will be explained in 3.2.10.

In [14]:
entire_set.replace([
    'Tech-support', 'Craft-repair', 'Other-service', 'Sales',
    'Exec-managerial', 'Prof-specialty', 'Handlers-cleaners',
    'Machine-op-inspect', 'Adm-clerical', 'Farming-Fishing',
    'Transport-moving', 'Priv-house-serv', 'Protective-serv', 'Armed-Forces'
], [
    'Technology', 'Craft', 'Service', 'Service', 'Management', 'Service',
    'Craft', 'Technology', 'Service', 'Craft', 'Service', 'Service', 'Service',
    'Force'
], inplace=True)

### 3.2.6 Relationship feature
#### Summarize values into two categories: 'In_relation' and 'Not_in_relation'. Represented by '1' and '0' respectively.

In [15]:
entire_set.replace(['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried'],
                 ['In_relation','Not_in_relation','In_relation','Not_in_relation', 'Not_in_relation','Not_in_relation'],
                 inplace = True)
entire_set = adult_replace(entire_set, 'Relationship', 'In_relation')

### 3.2.7 Sex feature
#### Values only have two categories: 'Male' and 'Female'. Transfer to '1' and '0' respectively.

In [16]:
entire_set = adult_replace(entire_set, 'Sex', 'Male')

### 3.2.8 Native-country feature
#### (1). Statistics of the proportion of the United States in training set and test set respectively
#### (2). Simply use the number of US divide all

In [17]:
num_us_train = train_set.apply(pd.value_counts)
print(num_us_train)
print('-----' * 20)
num_us_test = test_set.apply(pd.value_counts)
print(num_us_test)

                    Age  Workclass  fnlwgt  Education  Education_num  \
0                   NaN        NaN     NaN        NaN            NaN   
1                   NaN        NaN     NaN        NaN           45.0   
2                   NaN        NaN     NaN        NaN          151.0   
3                   NaN        NaN     NaN        NaN          288.0   
4                   NaN        NaN     NaN        NaN          557.0   
5                   NaN        NaN     NaN        NaN          455.0   
6                   NaN        NaN     NaN        NaN          820.0   
7                   NaN        NaN     NaN        NaN         1048.0   
8                   NaN        NaN     NaN        NaN          377.0   
9                   NaN        NaN     NaN        NaN         9840.0   
10                  NaN        NaN     NaN        NaN         6678.0   
11                  NaN        NaN     NaN        NaN         1307.0   
12                  NaN        NaN     NaN        NaN         10

#### (3). In training set, the percentage of US is: 27504/30162 -> 91.2%
#### (4). In test set, the percentage of US is: 13788/15060 -> 91.6%
#### (5). Thus, the US can be set to '1' and other countries set to '0'

In [18]:
entire_set = adult_replace(entire_set, 'Native_country', 'United-States')

### 3.2.9 Salary feature
#### Values only have two categories: '>50K' and '<=50K'. Transfer to '1' and '0' respectively.
#### Salary is the feature that will be used in train model and predictions. Should be extracted as label after preprocessing. 

In [19]:
entire_set = adult_replace(entire_set, 'Salary', '>50K')

### 3.2.10 Workclass, Occupation and Race
#### These three features can not be divided into two categories. In other words, they need to be split into multiple categories and added to the data frame by using 'dummy' function.

In [20]:
dummy_class = ['Workclass', 'Race', 'Occupation']
without_dummy_class = ['Age', 'fnlwgt', 'Education_num','Marital_status', 'Relationship', 'Sex',
             'Capital_gain','Capital_loss','Hours_per_week', 'Native_country', 'Salary']
dummy = pd.get_dummies(entire_set[dummy_class])
entire_set = pd.concat([entire_set[without_dummy_class], dummy], axis=1)

## 3.3 Others
### The rest of preprocessing is:
#### (1). Extract Salary feature from entire data set

In [21]:
salary = entire_set['Salary'].values
entire_set.drop(['Salary'], axis=1, inplace=True)

#### (2). Standardise values which originally in continuous

In [22]:
standardised_feature = ['Age', 'fnlwgt', 'Education_num', 'Capital_gain','Capital_loss','Hours_per_week']
entire_set[standardised_feature] = entire_set[standardised_feature].apply(lambda x: (x-np.mean(x))/np.std(x))

#### (3). Transfer to numpy array due to the requirement in algorithms

In [23]:
entire_set = np.array(entire_set)

#### (4). Divide training set and test set

In [24]:
x_train = entire_set[0:num_train, :]
y_train = salary[0:num_train]
x_test = entire_set[num_train:]
y_test = salary[num_train:]

#### (5). Check the shape

In [25]:
print('The shape of training data after preprocessing is: ', x_train.shape)
print('The shape of training label after preprocessing is: ', y_train.shape)
print('The shape of test data after preprocessing is: ', x_test.shape)
print('The shape of test label after preprocessing is: ', y_test.shape)

The shape of training data after preprocessing is:  (30162, 26)
The shape of training label after preprocessing is:  (30162,)
The shape of test data after preprocessing is:  (15060, 26)
The shape of test label after preprocessing is:  (15060,)


# 4. Algorithms
### There are three algorithms: Random Forest, SVM and Logistic Regression
### Each of them contains hyperparameter tunning, 10-fold cross validation and confusion matrix.

## 4.1 Random Forest

### 4.1.1 Random Forest with all default parameters and cross validation

In [50]:
# Training
train_start_time = time.time()
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
train_end_time = time.time()
print('Training time is: ' + str(round(train_end_time - train_start_time, 4)) + 's')

# Prediction
test_start_time = time.time()
print('Accuracy is: ', rf.score(x_test, y_test))
test_end_time = time.time()
print('Prediction time is: ' + str(round(test_end_time - test_start_time, 4)) + 's')

# Confusion Matrix
pred = rf.predict(x_test)
matrix = confusion_matrix(y_test, pred)
print('Confusion Matrix is\n{}'.format(matrix))

# 10-Fold Cross Validation
rf_scores = cross_val_score(rf, x_train, y_train, cv=10, scoring = 'accuracy')
print('10-fold CV accuracy: %.4f +/- %.4f' % (np.mean(rf_scores), np.std(rf_scores)))

# Precision, recall, etc
print(classification_report(y_test,pred))



Training time is: 0.5426s
Accuracy is:  0.8391102257636123
Prediction time is: 0.0466s
Confusion Matrix is
[[10502   858]
 [ 1565  2135]]
10-fold CV accuracy: 0.8418 +/- 0.0047


### 4.1.2 Hyperparameter tunning in Random Forest

In [37]:
rf.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### Use grid search. It will take a lot of time here.

In [80]:
param_grid = {
    'min_samples_leaf': [1, 5, 10, 15, 20, 25],
    'max_features': ['sqrt', 'log2', 0.5, 0.6, 0.7],
    'max_depth': [10, 20, 30, 50, 100],
    'n_estimators': [10, 20, 50, 70, 100],
    'n_jobs': [-1],
}
param_list = ParameterGrid(param_grid)
cv = GridSearchCV(rf, param_grid)
rf_cv = cv.fit(x_train, y_train)
print('Grid in progress:\n')
means = rf_cv.cv_results_['mean_test_score']
stds = rf_cv.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, rf_cv.cv_results_['params']):
    print("%0.3f (+/-%0.03f) when %r" % (mean, std * 2, params))
print('The best choice is: ', rf_cv.best_params_)



Grid in progress:

0.855 (+/-0.009) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 10, 'n_jobs': -1}
0.855 (+/-0.008) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 20, 'n_jobs': -1}
0.857 (+/-0.009) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 50, 'n_jobs': -1}
0.858 (+/-0.009) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 70, 'n_jobs': -1}
0.857 (+/-0.007) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 100, 'n_jobs': -1}
0.855 (+/-0.008) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 10, 'n_jobs': -1}
0.855 (+/-0.009) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 20, 'n_jobs': -1}
0.857 (+/-0.009) when {'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 50, 'n_jobs': -1}
0.856 (+/-0.

#### The best choice is: {'max_depth': 100, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 100, 'n_jobs': -1}
#### Use this combination to run algorithm again.

### 4.1.3 Random Forest with best hyperparameters

In [86]:
# Training
train_start_time = time.time()
rf_new = RandomForestClassifier(max_depth=100, max_features='sqrt', min_samples_leaf=5, n_estimators=100, n_jobs=-1)
rf_new.fit(x_train,y_train)
train_end_time = time.time()
print('Training time is: ' + str(round(train_end_time - train_start_time, 4)) + 's')

# Prediction
test_start_time = time.time()
print('Accuracy is: ', rf_new.score(x_test, y_test))
test_end_time = time.time()
print('Prediction time is: ' + str(round(test_end_time - test_start_time, 4)) + 's')

# Confusion Matrix
pred_new = rf_new.predict(x_test)
matrix_new = confusion_matrix(y_test, pred)
print('Confusion Matrix is\n{}'.format(matrix))

# 10-Fold Cross Validation
rf_new_scores = cross_val_score(rf_new, x_train, y_train, cv=10, scoring = 'accuracy')
print('10-fold CV accuracy: %.4f +/- %.4f' % (np.mean(rf_new_scores), np.std(rf_new_scores)))

# Precision, recall, etc
print(classification_report(y_test,pred_new))

Training time is: 0.553s
Accuracy is:  0.8584993359893758
Prediction time is: 0.1116s
Confusion Matrix is
[[10706   654]
 [ 1469  2231]]
10-fold CV accuracy: 0.8599 +/- 0.0053
              precision    recall  f1-score   support

           0       0.88      0.94      0.91     11360
           1       0.77      0.60      0.68      3700

   micro avg       0.86      0.86      0.86     15060
   macro avg       0.83      0.77      0.79     15060
weighted avg       0.85      0.86      0.85     15060

