# Advanced Machine Learning
by: Ari Sulistiyo Prabowo
____

**Contents:**
1. Handling imbalanced datasets
2. Dimensionality Reduction (Feature selection)
3. Explainable AI

## Handling imbalanced datasets
Using human capital data to predict whether an employee should be **promoted (1)** or **not promoted (0)**.

In [3]:
# import library
import pandas as pd
from collections import Counter

# import ML library
from sklearn import svm, preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

# Pre-process
from imblearn.over_sampling import SMOTE #oversampling
from sklearn.utils import resample #oversampling & #undersampling
from sklearn.feature_selection import SelectKBest, f_classif, RFE, SelectFromModel

# import evaluation metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# visualisation
import matplotlib.pyplot as plt
import shap

import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: ignored

In [None]:
# Load the human-capital dataset from the course repository, then show its
# dimensions and the first rows.
dataset_url = "https://raw.githubusercontent.com/densaiko/data_science_learning/main/dataset/Human%20Capital.csv"
df = pd.read_csv(dataset_url)
display(df.shape)
df.head()

In [None]:
# target variable (is_promoted)
# normalize=True reports each class as a fraction of all rows instead of a raw
# count, which makes the class imbalance directly readable.
df['is_promoted'].value_counts(normalize=True)

The target variable is imbalanced. Therefore, we need to make the two classes at least roughly balanced.

### Data Preprocessing

In [None]:
# Keep only the modelling features plus the target column (is_promoted).
selected_columns = [
    "department",
    "education",
    "gender",
    "recruitment_channel",
    "no_of_trainings",
    "age",
    "previous_year_rating",
    "length_of_service",
    "awards_won",
    "avg_training_score",
    "is_promoted",
]
df = df[selected_columns]

In [None]:
# Preview the first rows of the column-reduced frame.
df.head()

In [None]:
# Drop rows with missing values and integer-encode the categorical columns so
# that scikit-learn estimators can consume them.
#
# The explicit .copy() makes `data` independent of `df`; without it the column
# assignments below can raise pandas' SettingWithCopyWarning (which the global
# warnings filter in this notebook would silently hide).
data = df.dropna().copy()

categorical_columns = ["department", "education", "gender", "recruitment_channel"]

# One encoder per column, kept in a dict. Re-fitting a single shared encoder
# overwrites its classes_ on every call, so the text <-> integer mapping of
# every column except the last one would be lost.
label_encoders = {}
for column in categorical_columns:
    labelencoder = LabelEncoder()
    data[column] = labelencoder.fit_transform(data[column].astype("object"))
    label_encoders[column] = labelencoder

display(data.shape)
data.head()

In [None]:
# Side-by-side view of each categorical column's original text and its integer
# code. Naming is uniform: the plain column holds the original value from `df`
# and the "label_"-prefixed column holds the encoded value from `data`.
# (Previously the prefix meant "encoded" for education but "original" for
# department and gender.)
compared_columns = ["education", "department", "gender"]

# .copy() so the new columns are written to an independent frame rather than
# to a slice of `df`.
data_xx = df[compared_columns].copy()
for column in compared_columns:
    # Assignment aligns on the index: rows that dropna() removed from `data`
    # show NaN in the encoded column.
    data_xx["label_" + column] = data[column]

# Interleave original/encoded pairs for easy reading.
data_xx = data_xx[[
    "education", "label_education",
    "department", "label_department",
    "gender", "label_gender",
]]
data_xx.head()

In [None]:
# Baseline: no handling of the class imbalance.

# The target is `is_promoted`; every other column is a feature.
y = data["is_promoted"]  # dependent variable
X = data.drop("is_promoted", axis=1)  # independent variables

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
# Inspect the training features produced by the split.
print(X_train)

In [None]:
# modelling with logistic regression
# The estimator is created with no arguments, i.e. scikit-learn's default
# hyper-parameters, and fitted on the unscaled, non-resampled training split.
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [None]:
# Evaluation: predict on both splits, then report accuracy and the per-class
# metrics for the test split.
y_predict_train, y_predict_test = clf.predict(X_train), clf.predict(X_test)

training_acc = accuracy_score(y_train, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

### Applying Oversampling

In [None]:
# Handling imbalance data
from imblearn.over_sampling import SMOTE

# Oversampler with a fixed seed for reproducibility.
# NOTE(review): the categorical features are label-encoded integers, and SMOTE
# presumably interpolates between rows, which would produce in-between category
# codes -- confirm whether a categorical-aware sampler is needed.
sm = SMOTE(sampling_strategy=0.5, random_state=43)

# Target is `is_promoted`; all remaining columns are features.
y = data["is_promoted"]  # dependent variable
X = data.drop("is_promoted", axis=1)  # independent variables

In [None]:
# Split first, then oversample the training portion only, so the test split
# keeps its original class distribution.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

# Resample the training split with the configured oversampler.
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

# Class counts before and after resampling.
print(f"Before over sampling: {Counter(y_train)}")
print(f"After over sampling: {Counter(y_train_smote)}")

In [None]:
# Logistic regression fitted on the oversampled training data.
clf = LogisticRegression().fit(X_train_smote, y_train_smote)

# Evaluation: the training score is measured on the resampled training data,
# the testing score on the untouched test split.
y_predict_train, y_predict_test = clf.predict(X_train_smote), clf.predict(X_test)

training_acc = accuracy_score(y_train_smote, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

In [None]:
# Confusion matrix of true test labels versus the model's test predictions.
confusion_matrix(y_test, y_predict_test)

## Dimensionality Reduction

### Filter method

In [None]:
# Filter method: keep the 5 features with the highest f_classif score.
# NOTE(review): the name `filter` shadows Python's builtin; it is kept because
# the following cell reads `filter.scores_`.
filter = SelectKBest(score_func=f_classif, k=5)

# Fit on the training split only, then apply the same selection to both splits.
X_train_filter = filter.fit_transform(X_train, y_train)
X_test_filter = filter.transform(X_test)

print("Before feature selection", X_train.shape)
print("After feature selection", X_train_filter.shape)

In [None]:
# Print the per-feature scores and plot them as a sorted horizontal bar chart.
print("Score of features", filter.scores_)

feature_importance = pd.Series(data=filter.scores_, index=X_train.columns)
sorted_scores = feature_importance.sort_values()
sorted_scores.plot.barh()
plt.show()

In [None]:
# Inspect the (unfiltered) training features again.
print(X_train)

In [None]:
# Feature names, in the same order as the score array printed earlier.
X_train.columns

In [None]:
# Logistic regression fitted on the features kept by the filter method.
clf = LogisticRegression().fit(X_train_filter, y_train)

# Evaluation on the filtered training and test features.
y_predict_train, y_predict_test = (
    clf.predict(X_train_filter),
    clf.predict(X_test_filter),
)

training_acc = accuracy_score(y_train, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

### Wrapper method (RFE)

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Wrapper method: recursive feature elimination down to 5 features.
# A fresh LogisticRegression is passed in instead of the `clf` left over from
# an earlier cell, so this cell no longer depends on notebook execution order.
# RFE fits clones of the estimator it is given, so the selection is the same
# as with the previous default-configured `clf`.
wrapper = RFE(LogisticRegression(), n_features_to_select=5)
wrapper.fit(X_train, y_train)

X_train_wrapper = wrapper.transform(X_train)
X_test_wrapper = wrapper.transform(X_test)

print("Before feature selection", X_train.shape)
print("After feature selection", X_train_wrapper.shape)
print()

# ranking_ holds ranks, not scores: rank 1 marks a selected feature and larger
# numbers mark features that were eliminated earlier.
print("Ranking of features", wrapper.ranking_)

# Name kept for compatibility; the values plotted are ranks (lower is better).
feature_importance = pd.Series(wrapper.ranking_, index=X_train.columns)
feature_importance.sort_values().plot(kind='barh')
plt.show()

In [None]:
# Logistic regression fitted on the features kept by RFE.
clf = LogisticRegression().fit(X_train_wrapper, y_train)

# Evaluation on the RFE-selected training and test features.
y_predict_train, y_predict_test = (
    clf.predict(X_train_wrapper),
    clf.predict(X_test_wrapper),
)

training_acc = accuracy_score(y_train, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

### Embedded Method

In [None]:
from sklearn.feature_selection import SelectFromModel

In [None]:
# Embedded method: let a logistic regression's own coefficients decide which
# features to keep.
clf = LogisticRegression()
clf_feature = SelectFromModel(clf)

# Fit on the training split only, then apply the same selection to both splits.
clf_feature.fit(X_train, y_train)

X_train_importance = clf_feature.transform(X_train)
X_test_importance = clf_feature.transform(X_test)

print("Before feature selection", X_train.shape)
print("After feature selection", X_train_importance.shape)
print()

# Coefficients of the estimator fitted inside the selector, and the cut-off the
# selector applied. (Label spelling fixed: was "Treshold".)
print("Coef",clf_feature.estimator_.coef_[0])
print("Threshold",clf_feature.threshold_)

# Plot the signed coefficients, sorted.
# NOTE(review): the selector presumably compares coefficient magnitudes to the
# threshold, so a large negative bar may still be a selected feature -- confirm.
feature_importance = pd.Series(clf_feature.estimator_.coef_[0], index=X_train.columns)
feature_importance.sort_values().plot(kind='barh')
plt.show()

In [None]:
# Logistic regression fitted on the features kept by SelectFromModel.
clf = LogisticRegression().fit(X_train_importance, y_train)

# Evaluation on the selected training and test features.
y_predict_train, y_predict_test = (
    clf.predict(X_train_importance),
    clf.predict(X_test_importance),
)

training_acc = accuracy_score(y_train, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

## Explainable AI

In [None]:
# Explainability section: start again from the full feature set, without any
# imbalance handling.

# The target is `is_promoted`; every other column is a feature.
y = data["is_promoted"]  # dependent variable
X = data.drop("is_promoted", axis=1)  # independent variables

# Same 80/20 split and seed as the earlier sections.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

In [None]:
# Fit the model that will be explained.
clf = LogisticRegression().fit(X_train, y_train)

# Build a linear SHAP explainer from the fitted model and the training
# features, then compute SHAP values for every test row.
explainer = shap.LinearExplainer(clf, X_train)
shap_values = explainer.shap_values(X_test)

# Base value that the force plots below start from.
print('Expected Values', explainer.expected_value)

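The **expected value (-2.75)** is the **base value**: the model's average output, in log-odds, over the background (training) data. Each feature's SHAP value moves an individual prediction up or down from this base value. The further the final output rises above the base value, the more the model leans toward a promotion (target = 1); the further it falls below, the more it leans toward no promotion (target = 0).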

Features shown in **red** push the prediction toward a **promotion**, while features shown in **blue** push it toward no promotion.

In [None]:
# Evaluation of the explained model on both splits.
y_predict_train, y_predict_test = clf.predict(X_train), clf.predict(X_test)

training_acc = accuracy_score(y_train, y_predict_train)
testing_acc = accuracy_score(y_test, y_predict_test)

print(f"Training Accuracy: {training_acc}")
print(f"Testing Accuracy: {testing_acc}")

print(classification_report(y_test, y_predict_test))

In [None]:
# Replace the shuffled original row labels with 0..n-1 (drop=True discards the
# old index) so that label-based lookups such as y_test[42] refer to the same
# row as positional lookups such as X_test.iloc[42] and shap_values[42].
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [None]:
# Table of actual label (`is_promoted`) next to predicted label (`result`).
prediction = y_test.to_frame().assign(result=y_predict_test)

# Sum of the two 0/1 labels: 0 means both are 0, 2 means both are 1.
prediction['Summarize'] = prediction['is_promoted'] + prediction['result']

# Show ten of the rows where both actual and predicted labels are 0.
both_zero = prediction[prediction['Summarize'] == 0]
both_zero.iloc[40:50]

In [None]:
# Rows where the actual and predicted labels are both 1 (sum equals 2).
prediction[prediction['Summarize'] == 2]

In [None]:
# show the actual label and the model's prediction for test row 42
display(y_test[42])
display(y_predict_test[42])

In [None]:
# Force plot for test row 42: per-feature SHAP contributions relative to the
# explainer's expected value.
shap.initjs()
sample_idx = 42
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx, :],
    X_test.iloc[sample_idx, :],
)

In [None]:
# Force plot for test row 56: per-feature SHAP contributions relative to the
# explainer's expected value.
shap.initjs()
sample_idx = 56
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx, :],
    X_test.iloc[sample_idx, :],
)

In [None]:
# Force plot for test row 48: per-feature SHAP contributions relative to the
# explainer's expected value.
shap.initjs()
sample_idx = 48
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx, :],
    X_test.iloc[sample_idx, :],
)

In [None]:
# Force plot for test row 9267: per-feature SHAP contributions relative to the
# explainer's expected value.
shap.initjs()
sample_idx = 9267
shap.force_plot(
    explainer.expected_value,
    shap_values[sample_idx, :],
    X_test.iloc[sample_idx, :],
)