In [7]:
# This program classifies the type of volcano based on the vibrations detected by sensors.

# Getting libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

In [8]:
class VolcanoClassifier:
    """
    This class is used to classify the type of volcano based on the vibrations detected by sensors. Contains methods to
    preprocess the data, train the model, and test the model, as well as methods to save the predictions and the model.
    The model used is a Random Forest Classifier.
    """

    def __init__(self):
        """
        Inizialize the class and define the model to use
        """
        self.model = None
        self.model_path = "models/"
        self.data_train = pd.read_csv('data/jm_train.csv')
        self.data_test = pd.read_csv('data/jm_X_test.csv')
        self.data_local_train = pd.DataFrame()
        self.data_local_test = pd.DataFrame()
        self.best_params = dict()
        self.rdn_number = np.random.randint(200)

    def preprocess_data(self):
        """
        This function preprocesses the data to be used in the model. Perform a normal scaling to both, 
        train and test data set. 
        The training set is split into two different sets, based on the class distribution:
            1.- 80% of the data is just for local training.
            2.- 20% of the data is only for local testing.
            
        """

        # Scale the training data
        scale = StandardScaler()
        scale.fit(self.data_train.drop("target", axis=1))
        x_scaled = scale.transform(self.data_train.drop("target", axis=1))
        columns_names = self.data_train.columns
        self.data_train = pd.concat([pd.DataFrame(x_scaled), self.data_train.target], axis=1)
        self.data_train.columns = columns_names
            
        # Scale the test data
        scale.fit(self.data_test)
        x_test_scaled = scale.transform(self.data_test)
        columns_names = self.data_test.columns
        self.data_test = pd.DataFrame(x_test_scaled)
        self.data_test.columns = columns_names

#       # Ensuring a sample well distributed  
        def sample(group):
            return group.sample(frac=0.8, random_state= self.rdn_number)

        # Define a new training and test set
        self.data_local_train = self.data_train.groupby("target", as_index=False).apply(sample).reset_index(drop=True)
        self.data_local_test = self.data_train.drop(self.data_local_train.index)
        

    def grid_search(self):
        """
        This function is used to optimize the hyper-parameters of the Random Forest model using GridSearchCV.
        It performs only over the local training set.
        """

        def params_grid():
            """
            This function defines the hyper-parameters to be used in the GridSearchCV.
            """
            # Number of trees in random forest
#             n_estimators = [int(x) for x in np.linspace(start=10, stop=1500, num=10)]
            n_estimators = [int(x) for x in np.linspace(start=672, stop=100, num=1)]
            # Number of features to consider at every split
#             max_features = ['auto', 'sqrt']
            max_features = ['sqrt']
            # Maximum number of levels in tree
#             max_depth = [int(x) for x in np.linspace(1, 63, num=9)]
            max_depth = [int(x) for x in np.linspace(8, 50, num=1)]
#             max_depth.append(None)
            # Minimum number of samples required to split a node
#             min_samples_split = [2, 3, 5]
            min_samples_split = [2]
            # Minimum number of samples required at each leaf node
#             min_samples_leaf = [1, 2, 4]
            min_samples_leaf = [1]
            # Method of selecting samples for training each tree
#             bootstrap = [True, False]
            bootstrap = [True]
#             criterion = ['gini', 'entropy']
            criterion = ['entropy']

            # Create the random grid
            random_grid_ = {'n_estimators': n_estimators,
                            'max_features': max_features,
                            'max_depth': max_depth,
                            'min_samples_split': min_samples_split,
                            'min_samples_leaf': min_samples_leaf,
                            'bootstrap': bootstrap,
                            'criterion': criterion}
            
            return random_grid_

        random_grid = params_grid()
        rf = RandomForestClassifier()
        grid_search = GridSearchCV(estimator=rf, param_grid=random_grid, cv=2, n_jobs=-1, verbose=0)
        grid_search.fit(self.data_local_train, self.data_local_train.target)
        
        self.best_params = grid_search.best_params_

    def train_model(self):
        """
        This function trains the model using the local training set and calculates the F1 score over the local test set.
        A stratified Kfold is used to train the model with the hyper-parameter optimized over several training sets. 
        The model with the best F1 score is selected to use in the evaluation step.
        """

        # Split the data into several stratified training and test set
        sk_fold = StratifiedKFold(n_splits=9, shuffle=True, random_state=self.rdn_number)

        # Create the model
        temp_model = RandomForestClassifier(
            n_estimators=self.best_params["n_estimators"],
            max_depth=self.best_params["max_depth"],
            max_features=self.best_params["max_features"],
            criterion=self.best_params["criterion"],
            min_samples_split=self.best_params["min_samples_split"],
            min_samples_leaf=self.best_params["min_samples_leaf"],
            bootstrap=self.best_params["bootstrap"]
        )

        # Train the model
        x_train = self.data_local_train.drop("target", axis=1)
        y_train = self.data_local_train.target

        prediction = None
        temp_f1_score = 0
        for (train_index, test_index) in sk_fold.split(self.data_local_train, self.data_local_train['target']):
            
            temp_model.fit(x_train.iloc[train_index], y_train.iloc[train_index])
            prediction = temp_model.predict(x_train.iloc[test_index])
            f1_score = metrics.f1_score(y_train.iloc[test_index], prediction, average='macro')
            
            if f1_score > temp_f1_score:
                temp_f1_score = f1_score
                self.model = temp_model

        # Save the model
        with open(self.model_path + "rfc_model", 'wb') as file:
            pickle.dump(self.model, file)
            
    def evaluate_model(self):
        """
        This function evaluates the model over the local test set (to calculate the final f1_score) as well as the 
        test set from the competition, which targets is unknown. Finally, save the results for competition in the file
        "y_pred" in the data folder and the model.
        """
        
        x_test = self.data_local_test.drop("target", axis=1)
        y_test = self.data_local_test.target
        
        # Evaluate the model
        y_pred = self.model.predict(x_test)
        f1_score = metrics.f1_score(y_test, y_pred, average='macro')
        prinnit("Local TEST", f1_score)

        # Save the results
        y_pred = self.model.predict(self.data_test)
        pd.DataFrame(y_pred, columns=["target"]).to_csv('data/y_pred.csv', header=True, index=False)
        
        # Save the model
        with open(self.model_path + "rfc_model", 'wb') as file:
            pickle.dump(self.model, file)
        
        return f1_score


In [9]:
%%time


vc = VolcanoClassifier()
vc.preprocess_data()
vc.grid_search()
vc.train_model()
vc.evaluate_model()

Local TEST 0.8726885844346575
Wall time: 31.3 s


0.8726885844346575