### Objective
To provide the user the functionality to use/evaluate different models.

### In-Scope
Making a highly parameterized function to test out three classification and three regression models

### Future-Scope
+ Making a GUI on top of the function
+ Add more scoring for mulit-class classifications
+ Evaluation on testing
+ Add more edge-cases for unit testing function
+ Add more models
+ Add charts 

### Phase A : Import required libraries

In [1]:
#Import required libraries
import pandas as pd
import numpy as np

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

### Phase B : Import required datasets

#### Part I : Transform dataset as per requirement

In [3]:
#Read iris pickle.
#df = pd.read_pickle('F:\model_automator-master\data\raw\iris.pickle')
df = pd.read_pickle(r'C:\Users\USER\Desktop\Model Automator\iris.pickle')

In [4]:
df.head(20)

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [5]:
#Class is in text. So we label encode.
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

#### Part II : Make train and test datasets for simulation

In [6]:
#Specify target and independent variables.
X = df.copy().drop(['class'], axis=1)
y = df['class']

In [7]:
#Train test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
#Make a copy of the dataframes as csvs for testing with GUI later on
X_train.to_csv('../data/raw/X_train.csv')
X_test.to_csv('../data/raw/X_test.csv')
y_train.to_csv('../data/raw/y_train.csv')
y_test.to_csv('../data/raw/y_test.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/X_train.csv'

In [9]:
y_train = np.where(y_train==2, 1 ,0)

### Phase C : Make user defined function

In [14]:
#Function to evaluate different models
def model_automator(x_train, x_test, y_train, y_test, task, kfold=3, nruns=5):
    
    #Imports here as this will be packaged as a GUI later on and only this will be the source code.
    import warnings
    from collections import OrderedDict
    from time import gmtime, strftime
    from IPython.display import display
    from sklearn.model_selection import KFold, cross_val_score
    from sklearn.model_selection import GridSearchCV
    
    #Warning from scipy LAPACK to be ignored as it does not affect results.
    warnings.filterwarnings(action='ignore', module='scipy', message='^internal gelsd')
    
    #Lists to record model related metrics to be concatenated into a dataframe later on.
    record_scorer = []
    iter_scorer = []
    model_name = []
    model_accuracy = []
    model_accuracy_std = []
    
    #For classification tasks, need classification models as imports. 
    #Also for multiclass problems set the scoring metric as accuracy.
    if task == 'class':
        
        from sklearn.linear_model import LogisticRegression
        from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
        
        #Currently testing on three models. More models will be added in future versions.
        estimators = [('log', LogisticRegression()), 
                      ('rfc', RandomForestClassifier()), 
                      ('gbm', GradientBoostingClassifier())]
        
        #Check if it is a multiclass classification problem or not.
        if len(np.unique(y_train))>2:
            scoring = ['accuracy']
            
        else:
            scoring = ['accuracy', 'precision', 'recall', 'roc_auc']

    #For regression tasks, need regression models as imports. 
    elif task == 'reg':
        
        from sklearn.linear_model import LinearRegression
        from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
        
        #Currently testing on three models. More models will be added in future versions.
        estimators = [('lin', LinearRegression()), 
                      ('rfc', RandomForestRegressor()), 
                      ('gbm', GradientBoostingRegressor())]
        scoring = ['explained_variance', 'r2']
    
    #Validation check for wrong option selected.
    else : 
        print('Wrong option')
    
    #Start the process and record the time started.
    print('Process started at %s\n' % (strftime('%Y-%m-%d %H:%M:%S', gmtime())))
    
    #Iterate through scoring metrics.
    for scorer in scoring:
        
        #Iterate through the number of runs. Default is 5.
        for run in range(nruns):
            
            print('Running iteration %s with %s as scoring metric' % ((run + 1), scorer))
            
            for name, estimator in estimators:
                
                #Iterate through differnt models and get cross val score.
                cv_results = cross_val_score(estimator, x_train, y_train, cv=kfold, scoring=scorer)
                
                #Append all results in list form which will be made into a dataframe at the end.
                iter_scorer.append((run + 1))
                record_scorer.append(scorer)
                model_name.append(name)
                model_accuracy.append(cv_results.mean())
                model_accuracy_std.append(cv_results.std())
                
        print()
            
            
    #Process ends here. Record the time. 
    print('\nProcess ended at ', strftime('%Y-%m-%d %H:%M:%S', gmtime()))
    
    #Use ordered dictionary to set the dataframe in the exact order of columns declared.
    results = pd.DataFrame(OrderedDict({'Iteration' : iter_scorer, 
                                        'Scoring Metric' : record_scorer, 
                                        'Model' : model_name, 
                                        'Model Accuracy' : model_accuracy, 
                                        'Model Accuracy Std' : model_accuracy_std}))
    
    #Pivot to view results in a more aesthetic form
    results_pivot = results.pivot_table(index=['Iteration', 'Scoring Metric'], columns=['Model'])
    
    #Display the results
    print('\nFinal results : ')
    display(results_pivot)
    
    #Grid Search Parameter tuning
         
    estimators_Grid = [estimator for name, estimator in estimators]
    
    param_grid1= {'penalty': ['l1', 'l2'],'C': np.logspace(0, 4, 10)}

    param_grid2 = {
        'n_estimators': [200, 700],
        'max_features': ['auto', 'sqrt', 'log2']
    }
    param_grid3 = {
        'n_estimators': [200, 700],
        'max_features': ['auto', 'sqrt', 'log2']
    }

    param_list = [param_grid1,param_grid2,param_grid3]
    


    for i in range(0,len(estimators_Grid)):
    
        Grid_clf = GridSearchCV(estimator=estimators_Grid[i], param_grid=param_list[i], cv= 5)
    
        #print (CV_rfc.best_estimator_)
    
        best_model = Grid_clf.fit(X_train,y_train)
        print(best_model.best_estimator_)
            
        for scorer in scoring:
            cv_results_Grid = cross_val_score(best_model, x_train, y_train, cv=kfold, scoring=scorer)

            print(scorer,": ",cv_results_Grid.mean())
            print(scorer,"_Std: ",cv_results_Grid.std())

    #Display the results
    print('\nFinal results : ')
    #Return the pivot
    return(results_pivot, estimators)

### Phase D : Testing

#### Part I : Test Classification

In [15]:
results = model_automator(x_train=X_train, y_train=y_train, x_test=X_test, y_test=y_test, task='class')

Process started at 2018-03-24 15:57:36

Running iteration 1 with accuracy as scoring metric
Running iteration 2 with accuracy as scoring metric
Running iteration 3 with accuracy as scoring metric
Running iteration 4 with accuracy as scoring metric
Running iteration 5 with accuracy as scoring metric

Running iteration 1 with precision as scoring metric
Running iteration 2 with precision as scoring metric
Running iteration 3 with precision as scoring metric
Running iteration 4 with precision as scoring metric
Running iteration 5 with precision as scoring metric

Running iteration 1 with recall as scoring metric
Running iteration 2 with recall as scoring metric
Running iteration 3 with recall as scoring metric
Running iteration 4 with recall as scoring metric
Running iteration 5 with recall as scoring metric

Running iteration 1 with roc_auc as scoring metric
Running iteration 2 with roc_auc as scoring metric
Running iteration 3 with roc_auc as scoring metric
Running iteration 4 with roc_

Unnamed: 0_level_0,Unnamed: 1_level_0,Model Accuracy,Model Accuracy,Model Accuracy,Model Accuracy Std,Model Accuracy Std,Model Accuracy Std
Unnamed: 0_level_1,Model,gbm,log,rfc,gbm,log,rfc
Iteration,Scoring Metric,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
1,accuracy,0.900178,0.969697,0.919786,0.027768,0.042855,0.028999
1,precision,0.867521,0.928571,0.923077,0.097263,0.101015,0.108786
1,recall,0.853535,1.0,0.881313,0.039768,0.0,0.044748
1,roc_auc,0.953627,0.99449,0.964876,0.025037,0.007792,0.025192
2,accuracy,0.900178,0.969697,0.959596,0.027768,0.042855,0.037795
2,precision,0.867521,0.928571,0.923077,0.097263,0.101015,0.108786
2,recall,0.853535,1.0,0.853535,0.039768,0.0,0.039768
2,roc_auc,0.953627,0.99449,0.960285,0.025037,0.007792,0.022975
3,accuracy,0.900178,0.969697,0.929887,0.027768,0.042855,0.037962
3,precision,0.867521,0.928571,0.923077,0.097263,0.101015,0.108786


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
accuracy :  0.9696969696969697
accuracy _Std:  0.04285495643554835
precision :  0.9285714285714285
precision _Std:  0.10101525445522108
recall :  1.0
recall _Std:  0.0
roc_auc :  0.9944903581267218
roc_auc _Std:  0.007791810261008762
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
accuracy :  0.9298871063576946
accuracy _Std:  0.03796228840157645
precision : 

#### Part II : Test Regression

In [None]:
results = model_automator(x_train=X_train, y_train=y_train, x_test=X_test, y_test=y_test, task='reg')

In [75]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression

In [76]:
logmod = LogisticRegression()

In [79]:
cv_results = cross_val_score(logmod, X_train, y_train, cv=3, scoring='accuracy')

In [81]:
cv_results.bes

array([ 0.94285714,  0.87878788,  1.        ])

In [46]:
def plotROCByModel(data,modlist):
    from sklearn.metrics import roc_curve, auc
    import matplotlib.pyplot as plt
    plt.figure(3)
    plt.plot([0, 1], [0, 1], 'k--')

    for modname in modlist:
        fpr_, tpr_,th_ = roc_curve(data['class'],  data[modname])
        plt.plot(fpr_, tpr_, label=modname + ' (AUC = %0.2f)'%(auc(fpr_, tpr_)))

    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc='best')
    plt.show()

In [59]:
plotROCByModel(X_train, ['log'])

KeyError: 'class'

In [30]:
## from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Set random seed
#np.random.seed(0)

# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])
#clf_arr = [LogisticRegression(),RandomForestClassifier()]

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression()],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}]


# Create grid search 
clf = GridSearchCV(clf1,search_space, cv=5, verbose=0)
# Fit grid search
best_model = clf.fit(X_train,y_train)
# View best model
best_model.best_estimator_.get_params()['classifier']

ValueError: Invalid parameter classifier for estimator LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False). Check the list of available parameters with `estimator.get_params().keys()`.

In [51]:
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier

estimators = [LogisticRegression(),RandomForestClassifier(),GradientBoostingClassifier()]
param_grid1= {'penalty': ['l1', 'l2'],'C': np.logspace(0, 4, 10)}

param_grid2 = {
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}
param_grid3 = {
    'n_estimators': [200, 700],
    'max_features': ['auto', 'sqrt', 'log2']
}

param_list = [param_grid1,param_grid2,param_grid3]

for i in range(0,len(estimators)):
    
    clf = GridSearchCV(estimator=estimators[i], param_grid=param_list[i], cv= 5)
    
    #print (CV_rfc.best_estimator_)
    
    best_model = clf.fit(X_train,y_train)
    print(best_model.best_estimator_)
    

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
         

In [None]:

    clf = GridSearchCV(estimator=estimators[i], param_grid=param_list[i], cv= 5)
    
    #print (CV_rfc.best_estimator_)
    
    best_model = clf.fit(X_train,y_train)

In [37]:
# Load libraries
import numpy as np
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Set random seed
np.random.seed(

# Create a pipeline
pipe = Pipeline([('classifier', GradientBoostingRegressor())])

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [LogisticRegression()],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(0, 4, 10)},
                {'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]},
                {'classifier': [GradientBoostingRegressor()],
                 'classifier__n_estimators': [10, 100, 1000],
                 'classifier__max_features': [1, 2, 3]}
               ]

# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)

#Conduct Model Selection Using Grid Search

# Fit grid search
best_model = clf.fit(X_train, y_train)

# View best model
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=1, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)