<a href="https://colab.research.google.com/github/AjiSiwi/arunika-temuin/blob/master/Machine%20Learning/Community_Classification_Tuning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [None]:
# Flip TensorFlow's internal switch for printing deprecation warnings; this is
# intended to keep the notebook output free of deprecation noise.
from tensorflow.python.util import deprecation

deprecation._PRINT_DEPRECATION_WARNINGS = False

In [None]:
# Read the preprocessed spreadsheet with the openpyxl backend, then preview
# its first ten rows as the cell output.
dataset = pd.read_excel(io = 'preprocessed_RIASEC_5Class_V2.xlsx', engine = 'openpyxl')
dataset.head(10)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover index from the export
# that produced the spreadsheet -- confirm) so it is not used as a feature.
dataset.drop('Unnamed: 0', axis = 1, inplace = True)

# Every remaining column except the 'major' target is a feature.
data_cols = dataset.columns.tolist()
data_cols.remove('major')

# Split the frame into the label vector and the feature matrix.
labels = dataset['major'].values
data = dataset[data_cols].values

In [None]:
# One-hot encode the 'major' labels into a dense 2-D array (one column per class).
# The encoder works on 2-D input, so the 1-D label vector is turned into a single column.
labels = labels.reshape((labels.shape[0], 1))
try:
    # Newer scikit-learn releases name the dense-output switch `sparse_output`.
    one_hot = OneHotEncoder(sparse_output = False)
except TypeError:
    # Older releases only know the original `sparse` keyword.
    one_hot = OneHotEncoder(sparse = False)
transformer = ColumnTransformer([('one_hot_encoder', one_hot, [0])],
                                remainder = 'passthrough')
labels = transformer.fit_transform(labels)

In [None]:
def create_model(input_shape, learning_rate = 1e-4, dropout_rate = 0.2, optimizer = None, 
                 kernel_init = None, num_classes = 5):
    """
    Create and compile a DNN classifier with three Dense layers (128, 64 and 
    num_classes units) and a Dropout layer after each of the first two.
    
    Keyword Argument:
    input_shape -- a tuple defining the input_shape of the first layer. The value equals to the number of
                   features used.
    learning_rate -- defines the learning rate to be used for the default Adam optimizer. Ignored
                     when an optimizer is passed. Default value is 1e-4.
    dropout_rate -- the rate passed to both Dropout layers, i.e. the fraction of the preceding 
                    layer's outputs that is zeroed during training. Default value is 0.2.
    optimizer -- if None, Adam will be used as default optimizer. Pass a tf.keras.optimizers 
                 instance to use another optimizer.
    kernel_init -- kernel initializer applied to every Dense layer. If None (or otherwise falsy),
                   the layers keep their Keras default initializer.
    num_classes -- number of units of the softmax output layer. Default value is 5.
    
    Returns the compiled model (categorical cross-entropy loss, accuracy metric).
    """
    # Only forward the initializer when one was requested, so the layers fall back to
    # their own default otherwise. Building the keyword set once keeps a single copy of
    # the architecture instead of two branches that could drift apart.
    init_kwargs = {'kernel_initializer': kernel_init} if kernel_init else {}
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(units = 128, input_shape = input_shape, activation = 'relu', **init_kwargs),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units = 64, activation = 'relu', **init_kwargs),
        tf.keras.layers.Dropout(dropout_rate),
        tf.keras.layers.Dense(units = num_classes, activation = 'softmax', **init_kwargs)
    ])
    if not optimizer:
        optimizer = tf.keras.optimizers.Adam(learning_rate = learning_rate)
    model.compile(optimizer = optimizer, loss = 'categorical_crossentropy', metrics = ['accuracy'])
    return model

In [None]:
def predict_assess(model, validation_data, validation_label):
    """
    Make predictions based on an array of data and assess the prediction using 
    accuracy, precision, and recall.
    
    Keyword Argument:
    model -- the model for making predictions. Its predict() output must have one 
             row per sample and one score per class.
    
    validation_data -- an array of data for making predictions.
    
    validation_label -- an array of one-hot encoded labels of the data in validation_data.
    
    Returns a tuple (accuracy, precision, recall). Precision and recall are 
    averaged over classes with 'weighted' averaging.
    """
    class_scores = model.predict(validation_data)
    # Collapse per-class scores / one-hot rows into class indices.
    predictions = np.argmax(class_scores, axis = 1)
    actuals = np.argmax(validation_label, axis = 1)
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way round 
    # swaps precision with recall and weights the average by the predicted class counts.
    accuracy = accuracy_score(actuals, predictions)
    precision = precision_score(actuals, predictions, average = 'weighted')
    recall = recall_score(actuals, predictions, average = 'weighted')
    return accuracy, precision, recall

In [None]:
def training_report(accuracy, precision, recall, num_features = None):
    """
    Print a training report based on model's performance. 
    
    Keyword Argument:
    accuracy -- the accuracy score of the model.
    
    precision -- precision score.
    
    recall -- recall score.
    
    num_features -- a scalar value defining the number of features used during training.
                    If None, all features are assumed to be in use.
    """
    if not num_features:
        print('Accuracy, Precision, and Recall for all features')
    else:
        print('Accuracy, Precision, and Recall for {} features'.format(num_features))
    # Report the values that were passed in, not same-named globals from the notebook.
    print('Accuracy: {}'.format(accuracy))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))

In [None]:
training_history = {key: [] for key in ('num_features', 'acc', 'prec', 'rec')}
def add_to_history(hist_dict, n_features, acc, prec, rec):
    """
    Record one training run in hist_dict by appending each value to the list 
    stored under its key. hist_dict is modified in place; nothing is returned.
    
    Keyword Argument:
    hist_dict -- the dictionary collecting the results. It must already hold a list
                 under each of the keys 'num_features', 'acc', 'prec', and 'rec'.
    
    n_features -- number of features used during training, appended to 'num_features'.
                  
    acc -- accuracy of the model, appended to 'acc'.
    
    prec -- precision score of the model, appended to 'prec'.
    
    rec -- recall score of the model, appended to 'rec'.
    """
    entries = (('num_features', n_features), ('acc', acc), ('prec', prec), ('rec', rec))
    for key, value in entries:
        hist_dict[key].append(value)

In [None]:
# Hold out 20% of the rows before ranking feature importances. The seed matches the one
# used for the later split on the selected features, so the ranking is reproducible and
# is fitted only on rows that also end up in the training portion of that later split.
data_train, data_test, label_train, label_test = train_test_split(data, labels, test_size = 0.2, random_state = 42)

In [None]:
# Fit a random forest on the training split only to rank the features by importance.
classifier = RandomForestClassifier(n_estimators = 1500, random_state = 42, n_jobs = -1)
classifier.fit(data_train, label_train)
# Take the names from data_cols, the exact column list the feature matrix was built from,
# so names and importances stay aligned wherever the 'major' column sits in the frame.
feat_labels = pd.Index(data_cols)
importances = classifier.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

In [None]:
# Bar chart of the feature importances, ordered by `indices`, with the matching
# feature names as rotated tick labels.
num_bars = data_train.shape[1]
bar_positions = range(num_bars)

plt.figure(figsize = (12, 4))
plt.bar(bar_positions, importances[indices], color='lightblue', align='center')
plt.title('Feature Importances')
plt.xticks(bar_positions, feat_labels[indices], rotation=90)
plt.xlim([-1, num_bars])
plt.tight_layout()
plt.show()

In [None]:
# Rebuild the feature matrix from the first 52 entries of the importance ranking
# and split it again with a fixed seed.
n_features = 52
new_data = dataset.loc[:, feat_labels[indices[:n_features]]].values
data_train, data_test, label_train, label_test = train_test_split(
    new_data, labels, test_size = 0.2, random_state = 42
)

In [None]:
# Checkpoints go into a 'models' folder under the current working directory;
# create it on first use and leave it alone when it is already there.
save_model_path = os.path.join(os.getcwd(), 'models')
os.makedirs(save_model_path, exist_ok = True)

In [None]:
# Grid search over kernel initializer x dropout rate x batch size.
# train_acc collects the mean per-epoch training accuracy of each run and
# combinations the (initializer id, dropout rate, batch size) that produced it.
train_acc = []
combinations = []
initializers = [tf.keras.initializers.GlorotUniform(), tf.keras.initializers.GlorotNormal()]
dropout_rates = [value / 10 for value in range(1, 6)]
batch_sizes = [16, 32, 64, 128]
# Build the checkpoint path with os.path.join: a hand-written '\r...' separator is parsed
# as a carriage-return escape and does not point inside save_model_path.
# One callback instance is shared by every fit() call below.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint(filepath = os.path.join(save_model_path, 'rec_model.h5'),
                                                      save_best_only = True, monitor = 'val_accuracy', verbose = 1)
# Initializers are identified by their 1-based position in the list.
for init_id, initializer in enumerate(initializers, start = 1):
    for rate in dropout_rates:
        for size in batch_sizes:
            combinations.append((init_id, rate, size))
            model = create_model(input_shape = (n_features, ), dropout_rate = rate, kernel_init = initializer)
            history = model.fit(data_train, label_train, batch_size = size, epochs = 80, 
                                validation_data = (data_test, label_test), callbacks = [model_checkpoint])
            acc, prec, rec = predict_assess(model, data_test, label_test)
            training_report(acc, prec, rec)
            train_acc.append(sum(history.history['accuracy']) / len(history.history['accuracy']))
            add_to_history(training_history, n_features, acc, prec, rec)

In [None]:
# training_history keeps growing when the grid-search cell is run again, while
# combinations is rebuilt on every run. Keep only the entries of the latest run,
# using the number of combinations tried instead of a hand-maintained count.
n_runs = len(combinations)
if n_runs:
    for key, values in training_history.items():
        if len(values) > n_runs:
            training_history[key] = values[-n_runs:]

In [None]:
# Attach the per-run hyper-parameter combination and mean training accuracy,
# then export the whole history as a CSV file.
training_history['combinations'] = combinations
training_history['train_acc'] = train_acc
# Drop the feature-count column from the export. pop() with a default does nothing
# when the key is already gone (e.g. the cell is run twice) without hiding other errors.
# NOTE: add_to_history() appends to this key, so it must be restored before the
# grid search is run again on the same dictionary.
training_history.pop('num_features', None)
history_df = pd.DataFrame(training_history)
history_df.to_csv('history_lesser_dense.csv')

In [None]:
# history_df = pd.DataFrame(training_history)
# history_df.to_csv('history.csv')