In [None]:
import pandas as pd
import numpy as np
from time import time


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from collections import Counter
from sklearn import metrics 
from sklearn.model_selection import RandomizedSearchCV


In [None]:
def split_train_validation_test_data(root="./", drive_file="/ST12000NM0007_lastlast",
                                     ignore_columns=["date", "serial_number", "model", "capacity_bytes", "failure"],
                                     resample_data=False, smote_data=False):
    """Load a drive CSV and build a date-ordered 80/20 train/test split.

    Healthy rows (``failure == 0``) and failed rows (``failure == 1``) are
    each sorted by ``date`` and split separately without shuffling, so the
    first 80% of each class goes to training and the last 20% to testing.
    Despite the function name, no separate validation set is produced.

    Args:
        root: Prefix prepended to ``drive_file`` to form the CSV path.
        drive_file: CSV path suffix appended to ``root``.
        ignore_columns: Columns removed from the returned feature frames.
            The argument is never mutated.
        resample_data: If true, oversample the *training* failure rows with
            replacement until they match the number of training healthy rows.
        smote_data: If true, run ``SMOTE(random_state=42).fit_resample`` on
            the final training features and labels.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Plain string concatenation is kept deliberately: the default
    # ``drive_file`` starts with "/", and os.path.join would discard ``root``.
    # NOTE(review): parse_dates=True without index_col does not appear to
    # convert the "date" column, so the sorts below rely on the column's raw
    # ordering -- confirm the dates are ISO formatted (YYYY-MM-DD).
    df = pd.read_csv(root + drive_file, parse_dates=True)

    df_good = df.loc[df["failure"] == 0].sort_values(["date"])
    df_bad = df.loc[df["failure"] == 1].sort_values(["date"])

    # shuffle=False keeps the date order: train on the older rows, test on the newer ones.
    X_train_good, X_test_good, y_train_good, y_test_good = train_test_split(
        df_good, df_good["failure"], train_size=0.8, shuffle=False)
    X_train_bad, X_test_bad, y_train_bad, y_test_bad = train_test_split(
        df_bad, df_bad["failure"], train_size=0.8, shuffle=False)

    if resample_data:
        # Oversample ONLY the training failures. Sampling from the full
        # failure set would copy held-out test rows into the training data
        # and inflate the test metrics.
        X_train_bad = resample(X_train_bad, replace=True, n_samples=len(X_train_good), random_state=1)
        X_train_bad = X_train_bad.sort_values(["date"])

    # Rebuild the labels so they always line up with the (possibly resampled) rows.
    y_train_bad = X_train_bad["failure"]

    feature_drop = list(ignore_columns)

    X_train = pd.concat([X_train_good, X_train_bad], axis=0).drop(columns=feature_drop)
    y_train = pd.concat([y_train_good, y_train_bad], axis=0)

    X_test = pd.concat([X_test_good, X_test_bad], axis=0).drop(columns=feature_drop)
    y_test = pd.concat([y_test_good, y_test_bad], axis=0)

    if smote_data:
        # NOTE(review): SMOTE is not imported in any visible cell; presumably
        # imblearn.over_sampling.SMOTE -- confirm and import it before
        # calling with smote_data=True.
        sm = SMOTE(random_state=42)
        X_train, y_train = sm.fit_resample(X_train, y_train)

    return (X_train, X_test, y_train, y_test)

In [None]:
def sort_data_by_date(file_path):
    """Read the CSV at ``file_path`` and return its rows ordered by ``date``.

    The original row index is preserved; only the row order changes.
    """
    return pd.read_csv(file_path, parse_dates=True).sort_values(by=["date"])

In [None]:
def random_tune_random_forest():
    """Build an unfitted randomized hyper-parameter search for a random forest.

    Returns:
        A ``RandomizedSearchCV`` wrapping ``RandomForestClassifier(random_state=1)``
        that samples 100 candidates with 3-fold cross-validation, scores each
        on F1 and accuracy, and refits the best-F1 candidate on the full data.
        Call ``.fit(X, y)`` on the result to run the search.
    """
    rf = RandomForestClassifier(random_state=1)

    random_grid = {
        # 200, 400, ..., 2000 trees.
        'n_estimators': [int(x) for x in np.linspace(start=200, stop=2000, num=10)],
        # 'auto' was an alias of 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3, so it is replaced by a distinct, valid option.
        'max_features': ['sqrt', 'log2'],
        # 10, 20, ..., 110, plus None for unlimited depth.
        'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False],
        'criterion': ["gini", "entropy"],
    }

    rf_random = RandomizedSearchCV(
        estimator=rf,
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=1,
        n_jobs=-1,
        scoring=["f1", "accuracy"],
        refit="f1"
    )

    return rf_random
    

In [None]:
def run(models=None, tune_model=False):
    """Fit each model on the resampled training split and print test metrics.

    The data comes from ``split_train_validation_test_data`` called with
    ``drive_file="/ST12000NM0007_rawlast.csv"`` and ``resample_data=True``.

    Args:
        models: Iterable of estimators exposing ``fit`` and ``predict``.
            Defaults to a single ``RandomForestClassifier(max_depth=2,
            random_state=0)``, created fresh on every call so that no fitted
            state is shared between calls.
        tune_model: Kept for backward compatibility. The best parameters are
            printed for any fitted model that exposes ``best_params_``
            (e.g. a ``RandomizedSearchCV``), whatever this flag says.
    """
    if models is None:
        models = [RandomForestClassifier(max_depth=2, random_state=0)]

    X_train, X_test, y_train, y_test = split_train_validation_test_data(drive_file="/ST12000NM0007_rawlast.csv", resample_data=True)

    print("Data done")
    for model in models:
        print("\n\n *", type(model).__name__)

        start = time()
        model.fit(X_train, y_train)
        end = time()
        print("\nTime to train:", str((end - start)/60), " mins")

        # Plain estimators have no best_params_; only search objects do.
        best_params = getattr(model, "best_params_", None)
        if best_params is not None:
            print(best_params)

        # Test set results
        print("\n- Results on test set: ")
        y_pred = model.predict(X_test)
        print("Accuracy: ", accuracy_score(y_test, y_pred))
        print("Scores:\n", classification_report(y_test, y_pred))


In [None]:
# Entry point: run the randomized random-forest hyper-parameter search.
# The guard must compare against "__main__" (double underscores); the previous
# "_main_" never matched, so this cell silently did nothing.
if __name__ == "__main__":
    models_list = []
    rf = random_tune_random_forest()
    models_list.append(rf)
    run(models_list, tune_model=True)

In [None]:
def run_random_forest_10(file_path="/ST12000NM0007_rawlast.csv"):
    """Train one random forest with fixed hyper-parameters and print test metrics.

    Args:
        file_path: CSV path suffix passed as ``drive_file`` to
            ``split_train_validation_test_data`` (called with
            ``resample_data=True``).
    """
    model = RandomForestClassifier(
        n_estimators=2000,
        min_samples_split=5,
        min_samples_leaf=4,
        # 'auto' meant 'sqrt' for classifiers and was removed in
        # scikit-learn 1.3; 'sqrt' gives the same model on every version.
        max_features='sqrt',
        max_depth=40,
        criterion='entropy',
        bootstrap=True,
        # Fixed seed so repeated runs give the same forest and metrics.
        random_state=1
    )
    X_train, X_test, y_train, y_test = split_train_validation_test_data(drive_file=file_path, resample_data=True)

    print("Data done")
    print("\n\n *", type(model).__name__)

    start = time()
    model.fit(X_train, y_train)
    end = time()
    print("\nTime to train:", str((end - start)/60), " mins")

    # Test set results
    print("\n- Results on test set: ")
    y_pred = model.predict(X_test)
    print("Accuracy: ", accuracy_score(y_test, y_pred))
    print("Scores:\n", classification_report(y_test, y_pred))
    

In [None]:
# Train and evaluate the fixed-parameter forest on the "lastlast" CSV.
run_random_forest_10(file_path="/ST12000NM0007_lastlast.csv")