# Supervised Machine Learning Workflow

## Content:

- [Model selection](#Model-selection)
- [Importing libraries](#Importing-libraries)
- [Data Cleaning](#Data-Cleaning)
- [Data preprocessing](#Data-preprocessing)
- [Text preprocessing](#Text-preprocessing)
- [Resampling](#Resampling)
- [Models Regressions](#Models-Regressions)
	- [Baseline Model](#Baseline-Model)
	- [Linear Regression](#Linear-Regression)
	- [Lasso](#Lasso)
	- [Ridge Regression](#Ridge-Regression)
	- [ElasticNet Regression](#ElasticNet-Regression)
	- [Metrics regression](#Metrics-regression)
- [Models Classifications](#Models-Classifications)
	- [Baseline model](#Baseline-model)
	- [Classifier](#Classifier)
	- [Classifier for text](#Classifier-for-text)
	- [Classifiers evaluation function](#Classifiers-evaluation-function)
	- [Feature Importance](#Feature-Importance)
	- [Metrics Classifiers](#Metrics-Classifiers)
- [Pipeline and GridSearch](#Pipeline-and-GridSearch)

## Model selection

<img src="https://scikit-learn.org/stable/_static/ml_map.png" >

## Importing libraries

In [0]:
# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt

# preprocessing data
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score, train_test_split

# preprocessing text
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim.parsing import preprocessing
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# regressors
# `LinearRegressionRidge` is not a scikit-learn class; the estimator used
# later in the notebook is `LinearRegression` (RidgeCV is imported below).
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.linear_model import ElasticNet, ElasticNetCV
# metrics
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

# classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier 
from sklearn.ensemble import VotingClassifier, AdaBoostClassifier, BaggingClassifier
# metrics
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,roc_auc_score

# other
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV

# resampling
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek

In [0]:
# Load the dataset; the empty string is a placeholder for the CSV path
df = pd.read_csv("")
# Number of missing values in every column
df.isnull().sum()

In [0]:
# Data type of every column
df.dtypes

## Data Cleaning

In [0]:
# Imputation of medians
from sklearn.impute import SimpleImputer

# Fit an imputer that uses the per-column median as the fill value.
# NOTE(review): `df_features` is not defined anywhere in this notebook -
# presumably the numeric feature columns of `df`; confirm before running.
imputer = SimpleImputer(strategy="median")
imputer.fit(df_features)

In [0]:
# Creating polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Expand X with polynomial terms using the transformer's default settings;
# the transformed matrix is only displayed, not stored.
poly = PolynomialFeatures()
poly.fit_transform(X)

## Data preprocessing

In [0]:
# One hot encoding categorical variables
from sklearn.preprocessing import OneHotEncoder

# select_dtypes() requires an explicit include/exclude and returns a
# DataFrame, so ask for the string/categorical columns and keep their labels.
categorical_feature = df.select_dtypes(include=['object', 'category']).columns
cat_encoder = OneHotEncoder()
cat_encoder.fit_transform(df[categorical_feature])

# Get dummies (pandas alternative; 'column1' is a placeholder column name)

df = pd.get_dummies(data=df,columns=['column1'])

In [0]:
# Building the feature matrix X and target vector y for training,
# plus the matching feature matrix for the test dataset
features = [name for name in df_train.columns if name != 'target']
y = df_train['target']
X = df_train[features]
X_test = df_test[features]

In [0]:
# Standardizing train and test features
from sklearn.preprocessing import StandardScaler

# The scaler learns its statistics from the train features only and the
# same fitted transformation is then applied to both train and test
ss = StandardScaler().fit(X)
X_train_scaled = ss.transform(X)
X_test_scaled = ss.transform(X_test)

In [0]:
# Splitting scaled train dataset for further verification of model
# (train_test_split lives in sklearn.model_selection, not sklearn.preprocessing)
from sklearn.model_selection import train_test_split

# From here on X_train / X_test are the scaled splits of the train data
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled,y)

## Text preprocessing

In [0]:
# Creating a function for text preprocessing
def text_preprocessing(df,columns_list,is_lem=True,is_stem=True):
    '''
    Clean raw text columns and rebuild each entry as one space-joined string.

    For every column in columns_list the text is stripped of HTML tags,
    every non-letter character is replaced by a space, the result is
    lower-cased and tokenized, the tokens are optionally lemmatized and/or
    stemmed, and finally joined back together with single spaces.

    Takes:
        df - DataFrame (the listed columns are overwritten in place)
        columns_list - (list of str) - names of the text columns to process
        is_lem=True - (bool) - activate WordNetLemmatizer
        is_stem=True - (bool) - activate PorterStemmer

    Returns:
        The same DataFrame with the processed columns
    '''
    # Imported here because the notebook's import cell does not provide them
    import re
    from bs4 import BeautifulSoup

    lemmatizer = WordNetLemmatizer()
    p_stemmer = PorterStemmer()
    # Built once and reused for every column: tokenizer keeps word characters only
    tokenizer = RegexpTokenizer(r'\w+')
    non_letters = re.compile("[^a-zA-Z]")

    def clean(raw_text):
        # removing tags
        cleaned = BeautifulSoup(raw_text, "lxml").text
        # removing non-letters
        cleaned = non_letters.sub(" ", cleaned)
        words = tokenizer.tokenize(cleaned.lower())
        if is_lem:
            words = [lemmatizer.lemmatize(word) for word in words]
        if is_stem:
            words = [p_stemmer.stem(word) for word in words]
        return ' '.join(words)

    for column in columns_list:
        df[column] = df[column].map(clean)
    return df

In [0]:
# Cleaning the 'text' column and joining its tokens back into one string per row.
# Lemmatizer and PorterStemmer are disabled for better readability of words;
# number of words in posts is not high so Lemmatizer and PorterStemmer are not necessary
df = text_preprocessing(df,['text'],is_lem=False,is_stem=False)

In [0]:
# Mapping binary target: 'yes' -> 1, 'no' -> 0
# (values outside the mapping are not covered by the dictionary)
df['target'] = df['target'].map({'yes':1,'no':0})

In [0]:
# Custom stop words: NLTK English list + gensim list + URL fragments.
# Kept as a (de-duplicated, sorted) list because the scikit-learn vectorizers
# expect `stop_words` to be a list rather than a set.
all_stop_words = sorted(set(stopwords.words('english'))
                        | set(preprocessing.STOPWORDS)
                        | {'https', 'www'})

In [0]:
# Initializing the CountVectorizer
# maximum number of words is 5000, used custom stopwords
cvec = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = all_stop_words,
                             max_features = 5000)
# CountVectorizer: learn the train vocabulary and build the count matrix
train_features_cvec = cvec.fit_transform(X_train)
print(train_features_cvec.shape)

In [0]:
# Vectorizing words for train dataset: fit the model and learn train vocabulary
# and transforming strings into feature vectors
tvec = TfidfVectorizer(stop_words = all_stop_words,
                             max_features = 5000)
# TfidfVectorizer: learn the train vocabulary and build the TF-IDF matrix
train_features_tvec = tvec.fit_transform(X_train)
print(train_features_tvec.shape)

## Resampling

In [0]:
# Creating function for resampling
def resampling_dataset(how,X_train,y_train,on= True):
    """
    Resamples an imbalanced binary dataset.

    Takes:
    how (str) : 'under' - randomly downsample the rows with target 0 to the
                          number of rows with target 1 (fixed random_state=42);
                'over'  - SMOTE oversampling of the minority class;
                any other value (e.g. 'smote') - SMOTETomek combined resampling
    X_train - DataFrame with features
    y_train - Series with the binary target (0 / 1)
    on - on/off switch; when False the inputs are returned unchanged

    Returns:
    X, y - resampled features and target
    """
    if not on:
        # Keep the (X, y) return shape so callers can always unpack the result
        return X_train, y_train

    if how == 'under':
        # Use the target's own name instead of a hard-coded column label
        target = y_train.name if y_train.name is not None else 'target'
        data = pd.concat([X_train, y_train.rename(target)],axis=1)
        minority = data[data[target]==1]
        majority = data[data[target]==0]
        majority_downsampled = resample(majority,
                                        replace = False, # sample without replacement
                                        n_samples = len(minority), # match minority n
                                        random_state = 42) # reproducible results

        # combine minority and downsampled majority
        downsampled = pd.concat([majority_downsampled, minority])
        return downsampled.drop(columns=target), downsampled[target]

    if how == 'over':
        sampler = SMOTE(sampling_strategy='minority',random_state=42)
    else:
        sampler = SMOTETomek(sampling_strategy='all')
    # fit_resample is the current imbalanced-learn API (fit_sample was removed)
    X_res, y_res = sampler.fit_resample(X_train, y_train)
    return X_res, y_res

In [0]:
# Example call: SMOTE oversampling of X, y (the result is only displayed, not stored)
resampling_dataset('over',X,y,on=True)

## Models Regressions

### Baseline Model

In [0]:
# Baseline model for Regression - mean
from sklearn.metrics import r2_score, mean_squared_error

# Predict the mean of y for every observation; the mean is computed once
# instead of once per row
yhat = np.full(len(y), np.mean(y))
test_rmse = np.sqrt(mean_squared_error(y_true=y, y_pred=yhat))
test_r2 = r2_score(y_true=y, y_pred=yhat)
print('--- Baseline model scores ---')
print('Root mean squared error RMSE:', test_rmse)
print('R2:', test_r2)

### Linear Regression

In [0]:
from sklearn.linear_model import LinearRegression

# X_train / X_test are the scaled splits created by train_test_split in the
# "Data preprocessing" section (no `X_train_sc` / `X_test_sc` exist in this notebook)
lr = LinearRegression()
lr.fit(X_train,y_train)
print("Coefficients", lr.coef_)
predictions  =  lr.predict(X_test)
# score() of a regressor is the coefficient of determination R2
print('Score:',lr.score(X_test,y_test))

### Lasso

In [0]:
from sklearn.linear_model import Lasso, LassoCV

# Candidate regularization strengths searched with 5-fold cross-validation
lasso_alpha = np.arange(0.001,0.15,0.0025)
lasso_model = LassoCV(alphas=lasso_alpha,cv=5)
lasso_model.fit(X,y)
# Best alpha found by cross-validation, reused for the final model
opt_alpha = lasso_model.alpha_
lasso_optimal_model = Lasso(alpha=opt_alpha)
lasso_optimal_model.fit(X,y)
predictions = lasso_optimal_model.predict(X_test)
# Coefficients equal to 0 mark columns that Lasso treats as useless
lasso_optimal_model.coef_

### Ridge Regression

In [0]:
# NOTE(review): `alpha` is not used below and this first Ridge instance is
# overwritten by the RidgeCV two lines later
alpha = 10
ridge_model = Ridge(alpha = 10)
# 200 candidate alphas spaced logarithmically between 10**0 and 10**5
r_alpha = np.logspace(0,5,200)
# NOTE(review): `store_cv_values` is version-dependent in scikit-learn and the
# stored values are not read in this notebook - confirm it is still needed
ridge_model = RidgeCV(alphas = r_alpha,store_cv_values=True)
ridge_model.fit(X,y)
# Best alpha selected by RidgeCV, reused for the final model
ridge_optimal_alpha = ridge_model.alpha_
ridge_optimal = Ridge(alpha=ridge_optimal_alpha)
# Mean cross-validated score of the tuned model
print(cross_val_score(ridge_optimal,X,y).mean())
ridge_optimal.fit(X,y)
predictions = ridge_optimal.predict(X_test)
ridge_optimal.coef_

### ElasticNet Regression

In [0]:
from sklearn.linear_model import ElasticNet, ElasticNetCV

enet_alpha = np.arange(0.1,1,0.05)
ent_lratio = 0.5 # 50% of Lasso, 50% of Ridge
enet_model = ElasticNetCV(alphas=enet_alpha,l1_ratio=ent_lratio,cv=5)
# Fitted on the same X that the final model is trained on
# (`X_overfit` is not defined anywhere in this notebook)
enet_model.fit(X,y)
enet_optimal_alpha = enet_model.alpha_
# Final model: ElasticNet with the cross-validated alpha, rather than a Lasso
# refit with the alpha from the Lasso section
enet_optimal_model = ElasticNet(alpha=enet_optimal_alpha,l1_ratio=ent_lratio)
enet_optimal_model.fit(X,y)
predictions = enet_optimal_model.predict(X_test)

### Metrics regression

In [0]:
# Function for metrics
def metrics_function(y,predictions,num_features=len(features)):
    """
    Print regression metrics for a set of predictions.

    Takes:
    y - true target values
    predictions - predicted target values
    num_features - number of predictors used for the adjusted R2; the default
                   is len(features) as it was when this cell was executed

    Prints MAE, RSS, MSE, RMSE, R2 and adjusted R2.

    Returns:
    None
    """
    n_obs = len(y)
    # Mean Absolute Error
    mae = mean_absolute_error(y,predictions)
    # Residual Sum of Squares
    residuals = y-predictions
    rss = (residuals**2).sum()
    # Mean Squared Error and its square root
    mse = mean_squared_error(y,predictions)
    rmse = mse**0.5
    # Coefficient of Determination
    r = r2_score(y,predictions)
    # Adjusted R2 penalizes R2 for the number of predictors
    r_adj = 1 - (1-r)*(n_obs-1)/(n_obs-num_features-1)
    print(f'Mean Absolute Error: {mae}')
    print(f'Residual Sum of Squares: {rss}')
    print(f'Mean Squared Error: {mse}')
    print(f'Root Mean Squared Error: {rmse}')
    print(f'Coefficient of Determination R2: {r}')
    print(f'Adjusted R2: {r_adj}')

## Models Classifications

### Baseline model

In [0]:
# Baseline: share of each class in y_test
y_test.value_counts(normalize=True)

### Classifier

In [0]:
# Logistic Regression
# Instantiate the classifier here: at this point of the notebook `lr` would
# otherwise still be the LinearRegression from the regression section
lr = LogisticRegression()
cross_scores = cross_val_score(lr,X_train,y_train)
lr.fit(X_train,y_train)
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
# Hard class labels and class probabilities are kept in separate variables
predictions = lr.predict(X_test)
pred_proba = lr.predict_proba(X_test)

### Classifier for text

In [0]:
# TfidfVectorizer with MultinomialNB
tvec = TfidfVectorizer(stop_words = all_stop_words)
nb = MultinomialNB()
X_train_tv = tvec.fit_transform(X_train)
X_test_tv = tvec.transform(X_test)
# The sparse TF-IDF matrices are passed as they are: MultinomialNB accepts
# sparse input, and .todense() would build a large np.matrix
cross_scores = cross_val_score(nb,X_train_tv,y_train)
print(f'Cross_val_scores: {[round(i,3) for i in cross_scores]}')
nb.fit(X_train_tv,y_train)
train_score = nb.score(X_train_tv, y_train)
print(f'Train score: {round(train_score,3)}')
test_score = nb.score(X_test_tv, y_test)
print(f'Test score: {round(test_score,3)}')
predictions = nb.predict(X_test_tv)
# model_metrics is defined in the "Metrics Classifiers" section of this notebook
model_metrics(y_test,predictions)

In [0]:
# TfidfVectorizer with KNN
# NOTE(review): `best_param` is not defined in this notebook - presumably a
# dict of tuned TfidfVectorizer parameters (e.g. from a grid search); it is
# unpacked as keyword arguments because the vectorizer takes no positional
# parameters. Confirm it does not contain 'stop_words'.
tvec_knn = TfidfVectorizer(**best_param,stop_words = all_stop_words)
# The parallelism keyword of KNeighborsClassifier is `n_jobs`
knn_model = KNeighborsClassifier(n_neighbors=10,n_jobs=-1)
X_train_tv_knn = tvec_knn.fit_transform(X_train)
X_test_tv_knn = tvec_knn.transform(X_test)
cross_scores = cross_val_score(knn_model,X_train_tv_knn,y_train)
print(f'Cross_val_scores: {[round(i,3) for i in cross_scores]}')
knn_model.fit(X_train_tv_knn,y_train)
train_score = knn_model.score(X_train_tv_knn, y_train)
print(f'Train score: {round(train_score,3)}')
test_score = knn_model.score(X_test_tv_knn, y_test)
print(f'Test score: {round(test_score,3)}')
predictions = knn_model.predict(X_test_tv_knn)
# model_metrics is defined in the "Metrics Classifiers" section of this notebook
model_metrics(y_test,predictions)

In [0]:
# Creating a dataframe with list of true values and predicted probabilities based on our model.
# `nb` is the classifier that was fitted on the TF-IDF features (X_train_tv),
# so it is the model that can score X_test_tv; column 1 is P(class == 1)
pred_proba = nb.predict_proba(X_test_tv)[:, 1]
pred_df = pd.DataFrame({'true_values': y_test,
                        'pred_probs':pred_proba})
pred_df.head()

### Classifiers evaluation function

In [0]:
# Creating function for simple models evaluation
def sample_evaluation(X,y,models,names_samples,X_test=X_test,y_test=y_test,on= True):
    """
    Fit every model on every resampled train set and print its scores.

    Takes:
    X - list of resampled train feature sets
    y - list of matching train targets
    models - list of (unfitted) classifiers; each is refitted per sample
    names_samples - list of str - name of each resampling technique
    X_test, y_test - hold-out data; the defaults are the notebook variables
                     as they were when this cell was executed
    on - on/off switch; nothing is done when False

    Prints cross-validation, train and test scores, ROC AUC (when the model
    can provide probabilities), the confusion matrix counts and
    accuracy / precision / recall / f1.

    Returns:
    None
    """
    if not on:
        return
    for i, X_sample in enumerate(X):
        y_sample = y[i]
        # Scaler is fitted on the resampled train set only
        ss = StandardScaler()
        X_train_scaled = ss.fit_transform(X_sample)
        X_test_scaled = ss.transform(X_test)
        for model in models:
            cv_scores = cross_val_score(model,X_train_scaled,y_sample)
            model.fit(X_train_scaled,y_sample)
            train_score = model.score(X_train_scaled,y_sample)
            test_score = model.score(X_test_scaled,y_test)
            print(names_samples[i])
            print(str(model).split('(')[0])
            print('CV',cv_scores)
            print('train',train_score)
            print('test',test_score)
            # set of predicted labels match the corresponding set of true labels
            y_predicted = model.predict(X_test_scaled)
            try:
                # probability of the positive class (column 1)
                pred_proba = model.predict_proba(X_test_scaled)[:, 1]
                print(f'ROC score {round(roc_auc_score(y_test, pred_proba),3)}')
            except (AttributeError, IndexError, ValueError) as err:
                # ROC AUC is optional: report why it was skipped instead of
                # silently swallowing every possible error
                print(f'ROC score not available: {err}')
            accuracy = accuracy_score(y_test, y_predicted)
            # ratio tp / (tp + fp)
            precision = precision_score(y_test, y_predicted)
            # ratio tp / (tp + fn)
            recall = recall_score(y_test, y_predicted)
            # weighted average of the precision and recall
            # F1 = 2 * (precision * recall) / (precision + recall)
            f1 = f1_score(y_test, y_predicted)
            tn, fp, fn, tp = confusion_matrix(y_test, y_predicted).ravel()
            print("True Negatives: %s" % tn)
            print("False Positives: %s" % fp)
            print("False Negatives: %s" % fn)
            print("True Positives: %s" % tp)
            print(f'accuracy {round(accuracy,3)}, precision {round(precision,3)},recall {round(recall,3)}, f1 {round(f1,3)}')
            print('==================')

In [0]:
# Change on to True to run a function
# NOTE(review): X_unders / X_train_sm / X_smt and their targets are not
# assigned at notebook level in this file - presumably the outputs of
# resampling_dataset('under' / 'over' / 'smote', ...); they must exist
# before this call even with on=False, because arguments are evaluated first
lr = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()
dtree = DecisionTreeClassifier()
randtree = RandomForestClassifier()
sample_evaluation([X_unders, X_train_sm, X_smt],[y_unders, y_train_sm,y_smt],
                  models=[lr,knn,svm,dtree,randtree,],
                  names_samples=['undersampling','oversampling','over-undersampling'],
                 on= False)

In [0]:
# Creating function for models evaluation with gridsearch parameters
def sample_evaluation_grid(X,y,models,params,names_samples,X_test=X_test,y_test=y_test,on=True):
    """
    Use different resamples and different ML models to evaluate performance with gridsearch

    Takes:
    X - list of resampled X
    y - list of resampled y
    models - list of (unfitted) classifiers
    params - list of hyperparameter grids for gridsearch, one per model
    names_samples - list of str - name of resampling technique
    X_test, y_test - hold-out data; the defaults are the notebook variables
                     as they were when this cell was executed
    on - on/off switch; nothing is done when False

    Prints the best parameters, cross-validation, train and test scores,
    ROC AUC (when the model can provide probabilities), the confusion matrix
    counts and accuracy / precision / recall / f1.

    Returns:
    None
    """
    if not on:
        return
    for i, X_train in enumerate(X):
        y_train = y[i]
        # Scaler is fitted on the resampled train set only
        ss = StandardScaler()
        X_train_scaled = ss.fit_transform(X_train)
        X_test_scaled = ss.transform(X_test)
        for j, model in enumerate(models):
            grid = GridSearchCV(model,param_grid=params[j])
            # cross-validating the whole search (nested cross-validation)
            cv_scores = cross_val_score(grid,X_train_scaled,y_train)
            grid.fit(X_train_scaled,y_train)
            # With the default refit=True the best estimator is already
            # refitted on the full train set, so no extra fit is needed
            best_model = grid.best_estimator_
            train_score = best_model.score(X_train_scaled,y_train)
            test_score = best_model.score(X_test_scaled,y_test)
            print(names_samples[i])
            print(str(model).split('(')[0])
            print(grid.best_params_)
            y_predicted = best_model.predict(X_test_scaled)
            try:
                # probability of the positive class (column 1)
                pred_proba = best_model.predict_proba(X_test_scaled)[:, 1]
                print(f'ROC score {round(roc_auc_score(y_test, pred_proba),3)}')
            except (AttributeError, IndexError, ValueError) as err:
                # ROC AUC is optional: report why it was skipped instead of
                # silently swallowing every possible error
                print(f'ROC score not available: {err}')
            print('CV',cv_scores)
            print('train',train_score)
            print('test',test_score)
            # set of predicted labels match the corresponding set of true labels
            accuracy = accuracy_score(y_test, y_predicted)
            # ratio tp / (tp + fp)
            precision = precision_score(y_test, y_predicted)
            # ratio tp / (tp + fn)
            recall = recall_score(y_test, y_predicted)
            # weighted average of the precision and recall
            # F1 = 2 * (precision * recall) / (precision + recall)
            f1 = f1_score(y_test, y_predicted)
            tn, fp, fn, tp = confusion_matrix(y_test, y_predicted).ravel()
            print("True Negatives: %s" % tn)
            print("False Positives: %s" % fp)
            print("False Negatives: %s" % fn)
            print("True Positives: %s" % tp)
            print(f'accuracy {round(accuracy,3)}, precision {round(precision,3)},recall {round(recall,3)}, f1 {round(f1,3)}')
            print('==================')

In [0]:
# Base classifiers to tune; n_jobs=-1 is set where the estimator accepts it
lr = LogisticRegression(n_jobs=-1)
knn = KNeighborsClassifier(n_jobs=-1)
svm = SVC()
dtree = DecisionTreeClassifier()
randtree = RandomForestClassifier(n_jobs=-1)

# Set on to False to skip the (slow) grid search.
# The grids in `params` are matched to `models` by position.
# NOTE(review): the string 'none' for `penalty` is version-dependent in
# scikit-learn - confirm against the installed release.
# NOTE(review): the SVC grid varies `degree` while SVC() keeps its default
# kernel - confirm that `degree` has an effect for that kernel.
sample_evaluation_grid([X],[y],
models=[lr,knn,svm,dtree,randtree,],names_samples=['oversampling'],
params = [{'penalty':['l2','none'],'tol':[0.0001,0.001],'max_iter':[500,700]},
         {'n_neighbors':[3,5,7], 'weights':['uniform','distance']},
         {'degree':[1,3,6], 'C':[0.1,0.3,1]},
        {'max_depth':[None,2,4],'min_samples_leaf':[1,2,3]},
        {'n_estimators':[80,100,120],'max_depth':[None,2,4],'min_samples_leaf':[1,2,3]}],
                       on= True)

In [0]:
# Ensemble models: gradient boosting is evaluated with default settings,
# bagging and AdaBoost are grid-searched over their base estimator.
# NOTE(review): the `base_estimator` parameter name is version-dependent in
# scikit-learn (renamed to `estimator` in newer releases) - confirm.
gradb = GradientBoostingClassifier()
bag = BaggingClassifier()
ada = AdaBoostClassifier(RandomForestClassifier(max_depth=None,min_samples_leaf=1))
sample_evaluation([X],[y],models=[gradb],names_samples=['oversampling'],on=True)
sample_evaluation_grid([X],[y],models=[bag,ada],names_samples=['oversampling'],params = [
             {'base_estimator':[KNeighborsClassifier(n_jobs=-1,n_neighbors=7,weights='distance'),
                                RandomForestClassifier(max_depth=None,min_samples_leaf=1)]},
             {'base_estimator':[DecisionTreeClassifier(),
                                RandomForestClassifier(max_depth=None,min_samples_leaf=1)]}
             ],on= True)

In [0]:
# Voting Classifier
# Evaluation of ensembles of models
# X, y are replaced by their oversampled versions
X,y = resampling_dataset('over',X,y,on= True)

# Creating a soft-voting classifier from the best models
# NOTE(review): the `base_estimator` parameter name is version-dependent in
# scikit-learn (renamed to `estimator` in newer releases) - confirm.
vc = VotingClassifier(estimators=[('knn',KNeighborsClassifier(weights='distance')),
                                  ('ada',AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
                                  ('bagg',BaggingClassifier(base_estimator=RandomForestClassifier(min_samples_leaf=2))),
                                  ('rfor',RandomForestClassifier()),('grad',GradientBoostingClassifier())
                                 ],voting='soft')
# Switch for the (slow) fit and evaluation below; no grid search is run here
on= True
if on:
    X_train = X
    y_train = y
    # Scaler is fitted on the train data and reused for the test data
    ss = StandardScaler()
    X_train_scaled = ss.fit_transform(X_train)
    X_test_scaled = ss.transform(X_test)
    
    vc.fit(X_train_scaled,y_train)
    # Accuracy on train and on test
    print(vc.score(X_train_scaled,y_train))
    print(vc.score(X_test_scaled,y_test))
    # Evaluation
    # set of predicted labels match the corresponding set of true labels
    y_predicted = vc.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_predicted)
    # ratio tp / (tp + fp)
    precision = precision_score(y_test, y_predicted)             
    # ratio tp / (tp + fn)
    recall = recall_score(y_test, y_predicted)
    # weighted average of the precision and recall
    # F1 = 2 * (precision * recall) / (precision + recall)
    f1 = f1_score(y_test, y_predicted)
    tn, fp, fn, tp = confusion_matrix(y_test, y_predicted).ravel()
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print(f'accuracy {round(accuracy,3)}, precision {round(precision,3)},recall {round(recall,3)}, f1 {round(f1,3)}')

### Feature Importance

In [0]:
# One horizontal bar chart per tree-based model with its 10 largest importances
fig,ax = plt.subplots(2,1)
for i,model in enumerate([dtree,randtree]):
    # assumes X_test is a DataFrame whose columns match the fitted features - TODO confirm
    feat_importances = pd.Series(model.feature_importances_, index=X_test.columns)
    feat_importances.nlargest(10).plot(kind='barh', figsize=(15,15),ax=ax[i])
    model_name = str(model).split('(')[0]
    ax[i].set_title(f'10 Most important features of {model_name}',fontsize=18)
    # tick_params sets the label size without touching individual Tick objects
    ax[i].tick_params(axis='y', labelsize=15)

### Metrics Classifiers

In [0]:
def model_metrics(y_test, y_predicted):  
    """
    Calculates ROC AUC, accuracy, precision, recall, f1
    
    Takes:
    y_test - pandas Series - true labels
    y_predicted - pandas Series - predicted labels
    
    Prints ROC AUC, accuracy, precision, recall, f1 rounded to 3 decimals.
    The ROC AUC is computed from the predicted labels passed in, not from
    predicted probabilities.
    
    Returns:
    None
    """
    # set of predicted labels match the corresponding set of true labels
    accuracy = accuracy_score(y_test, y_predicted)
    # ratio tp / (tp + fp)
    precision = precision_score(y_test, y_predicted)
    # ratio tp / (tp + fn)
    recall = recall_score(y_test, y_predicted)
    # weighted average of the precision and recall
    # F1 = 2 * (precision * recall) / (precision + recall)
    f1 = f1_score(y_test, y_predicted)
    roc = roc_auc_score(y_test, y_predicted)
    # round(roc, 3): the 3 belongs inside round(), otherwise a tuple is printed
    print(f'ROC score {round(roc,3)}')
    print(f'accuracy {round(accuracy,3)}, precision {round(precision,3)},recall {round(recall,3)}, f1 {round(f1,3)}')

In [0]:
# Confusion matrix (rows normalized over the true labels)
# display_labels follow the sorted class order [0, 1], i.e. 'No' first, then 'Yes'
disp = plot_confusion_matrix(vc,X_test_scaled,y_test,normalize='true',
                             display_labels=['No','Yes'])
disp.ax_.set_title('Confusion Matrix');

In [0]:
# Plotting the distribution of predicted probabilities split by true class
def div_prob(pred_proba,y_test=y_test):
    """
    Draw overlaid histograms of predicted probabilities for each true class.

    Takes:
    pred_proba - predicted probabilities of the positive class
    y_test - true labels (0 / 1); the default is the notebook variable
             as it was when this cell was executed

    Returns:
    None
    """
    frame = pd.DataFrame({'true_values': y_test,
                          'pred_probs':pred_proba})
    plt.figure(figsize = (10,7))

    # One histogram per true class: (class value, bar color, legend label)
    histograms = ((0, '#65a8a7', 'WNVirus not present'),
                  (1, '#fcba03', 'WNVirus present'))
    for outcome, shade, caption in histograms:
        class_rows = frame[frame['true_values'] == outcome]
        plt.hist(class_rows['pred_probs'],
                 bins=25,
                 color=shade,
                 alpha = 0.5,
                 label=caption)

    # Vertical line at the 0.5 decision threshold
    plt.vlines(x=0.5,
               ymin = 0,
               ymax = 65,
               color='r',
               linestyle = '--')

    # Title and axis labels
    plt.title('Distribution of Probability', fontsize=22)
    plt.ylabel('Frequency', fontsize=18)
    plt.xlabel('Predicted Probability that Outcome = 1', fontsize=18)

    # Region annotations: (x position, y position, text, color)
    annotations = ((0.1, 40, 'True negative', 'blue'),
                   (0.25, 10, 'False negative', 'orange'),
                   (0.55, 10, 'False positive', 'blue'),
                   (0.75, 40, 'True positive', 'orange'))
    for x_pos, y_pos, caption, shade in annotations:
        plt.text(y = y_pos,x = x_pos,s = caption,color=shade)

    # Legend
    plt.legend(fontsize=20,loc='upper center');

In [0]:
# Evaluation: probability of class 1 predicted by the voting classifier,
# plotted as a distribution per true class
pred_proba = [i[1] for i in vc.predict_proba(X_test_scaled)]
div_prob(pred_proba)

In [0]:
# Creating Receiver Operating Characteristic (ROC) Curve
def roc_curve(pred_proba,y_test=y_test):
    """
    Plot the ROC curve of a binary classifier together with its AUC.

    An observation is classified as positive when its predicted probability
    is >= the threshold; the same rule is used for the true positive rate
    and the false positive rate.

    Takes:
    pred_proba - predicted probabilities of the positive class
    y_test - true labels (0 / 1); the default is the notebook variable
             as it was when this cell was executed

    Raises:
    ValueError - when y_test does not contain both classes

    Returns:
    None
    """
    pred_df = pd.DataFrame({'true_values': y_test,
                            'pred_probs':pred_proba})

    # Predicted probabilities split once by true class
    positive_probs = pred_df.loc[pred_df['true_values'] == 1, 'pred_probs'].to_numpy()
    negative_probs = pred_df.loc[pred_df['true_values'] == 0, 'pred_probs'].to_numpy()
    if len(positive_probs) == 0 or len(negative_probs) == 0:
        raise ValueError('ROC curve needs both classes (0 and 1) in y_test')

    plt.figure(figsize = (10,7))

    # Create threshold values.
    thresholds = np.linspace(0, 1, 200)

    # Sensitivity (true positive rate): share of positives predicted positive
    tpr_values = [(positive_probs >= threshold).mean() for threshold in thresholds]
    # 1 - specificity (false positive rate): share of negatives predicted positive
    fpr_values = [(negative_probs >= threshold).mean() for threshold in thresholds]

    # Plot ROC curve.
    plt.plot(fpr_values, # False Positive Rate on X-axis
             tpr_values, # True Positive Rate on Y-axis
             label='ROC Curve')

    # Plot baseline. (Perfect overlap between the two populations.)
    plt.plot(np.linspace(0, 1, 200),
             np.linspace(0, 1, 200),
             label='baseline',
             linestyle='--')

    # Label axes.
    plt.title(f'ROC Curve with AUC = {round(roc_auc_score(pred_df["true_values"], pred_df["pred_probs"]),3)}', fontsize=22)
    plt.ylabel('Recall', fontsize=18)
    plt.xlabel('1 - Specificity', fontsize=18)

    # Create legend.
    plt.legend(fontsize=16);

In [0]:
# ROC curve for the predicted probabilities computed above
roc_curve(pred_proba)

## Pipeline and GridSearch

In [0]:
from sklearn.model_selection import GridSearchCV
# RandomForestRegressor is not part of the notebook's import cell
from sklearn.ensemble import RandomForestRegressor

param_grid = [
    # try 6 (2×3) combinations of hyperparameters
    {'n_estimators': [5, 10], 'max_features': [6, 8, 10]}]

forest_reg = RandomForestRegressor(random_state=42)
# train across 3 folds, that's a total of 6*3=18 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=3,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)
# NOTE(review): `housing_prepared` / `housing_labels` are not defined in this
# notebook - presumably the prepared features and labels of another dataset
grid_search.fit(housing_prepared, housing_labels)