## DECISION TREE CLASSIFIER

In [14]:
def decisionTree_clf(feature_columns,target_column,data_frame,os_thres,*args,**kwargs):
    '''
    The decisionTree_clf function is for executing a decision tree classifier based on a certain amount of features
    of a dataset and reporting a complete performance analysis of the fitted model.
    The function prints the accuracy, Jaccard, precision and recall scores and the classification report, and plots
    the confusion matrix, the ROC-AUC curve, the cost complexity pruning path and the fitted tree graph.
    Nothing is returned: every result is printed or shown as a matplotlib figure (the ROC figure is also saved
    to the file 'Log_ROC').
    The scores and the ROC curve are computed with pos_label=1, so the target is expected to be binary with the
    positive class encoded as 1.
    The function parameters you should pass are:
    - feature_columns: list of str
        It is the list of features that the model will evaluate to predict the target or class
    - target_column: str
        It is the target variable or class to predict with the model
    - data_frame: pandas.DataFrame
        It is the dataframe that the feature_columns and the target_column belong to.
    - os_thres: float
        Represents the threshold up to which the oversampling won't be needed given the ratio between the minority
        and the majority class. The threshold is represented with a float between 0 and 1. When os_thres is
        greater than that ratio, the training split is oversampled with SMOTE.
    - *args, **kwargs:
        Extra arguments forwarded to sklearn.tree.DecisionTreeClassifier (random_state=0 is always set).
    '''

    # Importing libraries
    import pandas as pd                                  # for building and handling dataframe operations
    import matplotlib.pyplot as plt                      # for generating plots
    import seaborn as sns                                # for generating plots
    from sklearn import tree                             # for building the models and generating the tree graph
    from imblearn.over_sampling import SMOTE             # for generating an oversampling dataset
    from sklearn.model_selection import train_test_split # for slicing the dataframe in testing and training
    from sklearn import metrics                          # for returning the performance metrics of the model

    def title(symbol,string,width):
        # Print `string` with the same amount of `symbol` padding on both sides
        pad = symbol*int((width-len(string))/2)
        print(pad+string+pad)

    # Defining target and features variables
    X = data_frame[feature_columns]   # Features
    y = data_frame[target_column]     # Target

    # Measuring the class imbalance
    class_counts = y.value_counts()
    min_y = int(class_counts.min())   # Minority class
    max_y = int(class_counts.max())   # Majority class
    min_ratio = min_y/max_y           # ratio of the minority class by the majority class
    title('-','Target variable sampling ratio',124)
    print('Minority Class: '+str(min_y)+
          '\nMajority Class: '+str(max_y)+
          '\nMinority Proportion: '+str(round(min_ratio,2)))

    # Generating Oversampling
    if os_thres>min_ratio:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # 80% training and 20% test
        # Only the training split is resampled, the test split is left untouched
        smote = SMOTE(sampling_strategy='auto',random_state=0,k_neighbors=5)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        title('-','Oversampling generated with SMOTE technique',124)
        train_counts = y_train.value_counts()
        fig,ax=plt.subplots(1,2,figsize=(10,5))
        ax[0].bar(class_counts.index,class_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[0].set_ylabel('Count')
        ax[0].set_xlabel('Class')
        ax[0].set_title('Countplot of Each Class before oversampling\nwithin the Target Variable')
        ax[1].bar(train_counts.index,train_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[1].set_ylabel('Count')
        ax[1].set_xlabel('Class')
        ax[1].set_title('Countplot of Each Class after oversampling\nwithin the Target Variable')
        plt.show()
    else:
        title('-','No Oversampling generated',124)
        # The target must be passed as `x`: a positional argument is taken as `data` by seaborn
        sns.countplot(x=y)
        plt.ylabel('Count')
        plt.xlabel('Class')
        plt.title('Countplot of Each Class \nwithin the Target Variable')
        plt.show()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) # 70% training and 30% test

    # Creating Decision Tree Classifier
    model='Decision Tree Classifier'
    clf = tree.DecisionTreeClassifier(random_state=0,*args,**kwargs)
    ## Training Decision Tree Classifer
    clf = clf.fit(X_train,y_train)
    ## Predicting the response for test dataset
    y_pred = clf.predict(X_test)

    # Generating metrics for the executed model
    title('-','Model Performance Metrics',124)
    print('Accuracy: %0.2f'%round(metrics.accuracy_score(y_test, y_pred),2))
    print('Jaccard: %0.2f'%round(metrics.jaccard_score(y_test, y_pred,pos_label=1),2))
    print('Precision: %0.2f'%round(metrics.precision_score(y_test, y_pred,pos_label=1),2))
    print('Recall: %0.2f'%round(metrics.recall_score(y_test, y_pred,pos_label=1),2))
    print('\n')

    # Generating classification report
    title('-','Classification Report',124)
    print(metrics.classification_report(y_test, y_pred))
    print('\n')

    # Generating Confusion Matrix
    title('-','Confusion Matrix',124)
    cm_t = metrics.confusion_matrix(y_test, y_pred)
    with sns.axes_style("white"):
        fig, ax = plt.subplots(figsize=(4,4))
        ax = sns.heatmap(cm_t,
                        square=True,
                        annot=True,
                        fmt="d",
                        cbar=False)
        ax.xaxis.set_ticklabels(['Negative', 'Positive'])
        ax.yaxis.set_ticklabels(['Negative', 'Positive'])
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
    plt.title('Confusion Matrix of the {m}'.format(m=model))
    plt.show()
    print('\n')

    # Generating and plotting ROC-AUC Curve
    title('-','ROC-AUC Curve',124)
    # The ROC curve needs a continuous score: hard 0/1 predictions collapse it to a single
    # operating point, so the predicted probability of the positive class (1) is used instead
    pos_idx = list(clf.classes_).index(1)
    y_score = clf.predict_proba(X_test)[:,pos_idx]
    logit_roc_auc = metrics.roc_auc_score(y_test, y_score)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score,pos_label=1)
    plt.figure()
    plt.plot(fpr, tpr, label='{m} (Area = %0.2f)'.format(m=model)%logit_roc_auc)
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    print('\n')

    # Calculating and plotting Cost Complexity Pruning Path
    title('-','Cost Complexity Pruning Path',124)
    path=clf.cost_complexity_pruning_path(X_train,y_train)
    ccp_alphas,impurities=path.ccp_alphas,path.impurities
    fig,ax=plt.subplots()
    # The last alpha is left out of the plot, as is done for the other pruning plots below
    ax.plot(ccp_alphas[:-1],impurities[:-1],marker='o',drawstyle='steps-post')
    plt.xticks(rotation=45, horizontalalignment='right')
    ax.set_xlabel('Effective Alpha')
    ax.set_ylabel('Total Impurity of Leaves')
    ax.set_title('Total Impurity vs Effective Alphas \n(calculated for the training set)')
    plt.show()
    print('\n')

    # Fitting one tree per effective alpha
    mods=[tree.DecisionTreeClassifier(random_state=0,ccp_alpha=ccp_alpha).fit(X_train,y_train)
          for ccp_alpha in ccp_alphas]
    print('The number of nodes in the last tree is: \033[1m{}\033[0m with ccp_alpha = \033[1m{}\033[0m'
          .format(mods[-1].tree_.node_count,ccp_alphas[-1]))
    mods=mods[:-1]
    ccp_alphas=ccp_alphas[:-1]
    node_counts=[mod.tree_.node_count for mod in mods]
    depth=[mod.tree_.max_depth for mod in mods]
    fig,ax=plt.subplots(2,1)
    ax[0].plot(ccp_alphas,node_counts,marker='o',drawstyle='steps-post')
    ax[0].set_xlabel('Alpha')
    ax[0].set_ylabel('Number of Nodes')
    ax[0].set_title('Number of Nodes vs Alpha')
    ax[1].plot(ccp_alphas,depth,marker='o',drawstyle='steps-post')
    ax[1].set_xlabel('Alpha')
    ax[1].set_ylabel('Depth of Tree')
    ax[1].set_title('Depth vs Alpha')
    fig.tight_layout()
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.show()

    # Accuracy vs alpha for training and testing sets
    train_scores=[mod.score(X_train,y_train) for mod in mods]
    test_scores=[mod.score(X_test,y_test) for mod in mods]
    fig,ax=plt.subplots()
    ax.set_xlabel('Alpha')
    ax.set_ylabel('Accuracy')
    ax.set_title('Accuracy vs Alpha \ncalculated for training and testing sets')
    ax.plot(ccp_alphas,train_scores,marker='o',label='Training set',drawstyle='steps-post')
    ax.plot(ccp_alphas,test_scores,marker='o',label='Testing set',drawstyle='steps-post')
    ax.legend()
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.show()

    d={'Training Scores':train_scores, 'Testing Scores':test_scores,'Alphas':ccp_alphas}
    a=pd.DataFrame(d)
    print(a)
    print('\n')

    # Generating Decision Tree graph
    title('-','Decision Tree graph',124)
    with sns.axes_style():
        fig,ax=plt.subplots(figsize=(20,15))
        # class_names must list one name per class; the target column name alone is a bare string
        tree.plot_tree(decision_tree=clf,
                        feature_names=feature_columns,
                        class_names=[str(c) for c in clf.classes_],
                        fontsize=10,
                        impurity=True,
                        filled=True,
                        label='all')
        plt.title('{m} - Tree Graph'.format(m=model))
        plt.show()

## DECISION TREE REGRESSOR

In [1]:
def decisionTree_reg(feature_columns,target_column,data_frame,os_thres,*args,**kwargs):
    '''
    The decisionTree_reg function is for executing a decision tree regressor based on a certain amount of features
    of a dataset and reporting a complete performance analysis of the fitted model.
    The function prints the R2 score, mean squared error, root mean squared error, mean absolute error and maximum
    error, and plots the cost complexity pruning path and the fitted tree graph. When the target has exactly two
    distinct values the ROC-AUC curve is plotted as well (and saved to the file 'Log_ROC'), using the regressor
    output as the score; for any other target the ROC section is skipped because the curve is not defined.
    Nothing is returned: every result is printed or shown as a matplotlib figure.
    The function parameters you should pass are:
    - feature_columns: list of str
        It is the list of features that the model will evaluate to predict the target
    - target_column: str
        It is the target variable to predict with the model
    - data_frame: pandas.DataFrame
        It is the dataframe that the feature_columns and the target_column belong to.
    - os_thres: float
        Represents the threshold up to which the oversampling won't be needed given the ratio between the least
        and the most frequent target value. The threshold is represented with a float between 0 and 1. When
        os_thres is greater than that ratio, the training split is oversampled with SMOTE.
    - *args, **kwargs:
        Extra arguments forwarded to sklearn.tree.DecisionTreeRegressor (random_state=0 is always set).
    '''

    # Importing libraries
    import numpy as np                                   # for handling mathematical operations
    import pandas as pd                                  # for building and handling dataframe operations
    import matplotlib.pyplot as plt                      # for generating plots
    import seaborn as sns                                # for generating plots
    from sklearn import tree                             # for building the models and generating the tree graph
    from imblearn.over_sampling import SMOTE             # for generating an oversampling dataset
    from sklearn.model_selection import train_test_split # for slicing the dataframe in testing and training
    from sklearn import metrics                          # for returning the performance metrics of the model

    def title(symbol,string,width):
        # Print `string` with the same amount of `symbol` padding on both sides
        pad = symbol*int((width-len(string))/2)
        print(pad+string+pad)

    # Defining target and features variables
    X = data_frame[feature_columns]   # Features
    y = data_frame[target_column]     # Target

    # Measuring the imbalance between target values
    value_counts = y.value_counts()
    min_y = int(value_counts.min())   # Least frequent value
    max_y = int(value_counts.max())   # Most frequent value
    min_ratio = min_y/max_y           # ratio of the least frequent by the most frequent value
    title('-','Target variable sampling ratio',124)
    print('Minority Class: '+str(min_y)+
          '\nMajority Class: '+str(max_y)+
          '\nMinority Proportion: '+str(round(min_ratio,2)))

    # Generating Oversampling
    if os_thres>min_ratio:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # 80% training and 20% test
        # Only the training split is resampled, the test split is left untouched
        smote = SMOTE(sampling_strategy='auto',random_state=0,k_neighbors=5)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        title('-','Oversampling generated with SMOTE technique',124)
        train_counts = y_train.value_counts()
        fig,ax=plt.subplots(1,2,figsize=(10,5))
        ax[0].bar(value_counts.index,value_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[0].set_ylabel('Count')
        ax[0].set_xlabel('Class')
        ax[0].set_title('Countplot of Each Class before oversampling\nwithin the Target Variable')
        ax[1].bar(train_counts.index,train_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[1].set_ylabel('Count')
        ax[1].set_xlabel('Class')
        ax[1].set_title('Countplot of Each Class after oversampling\nwithin the Target Variable')
        plt.show()
    else:
        title('-','No Oversampling generated',124)
        # The target must be passed as `x`: a positional argument is taken as `data` by seaborn
        sns.countplot(x=y)
        plt.ylabel('Count')
        plt.xlabel('Class')
        plt.title('Countplot of Each Class \nwithin the Target Variable')
        plt.show()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) # 70% training and 30% test

    ## Creating Decision Tree Regressor
    model = 'Decision Tree Regressor'
    reg = tree.DecisionTreeRegressor(random_state=0,*args,**kwargs)
    ## Training Decision Tree Regressor
    reg = reg.fit(X_train,y_train)
    ## Predicting the response for test dataset
    y_pred = reg.predict(X_test)

    # Generating metrics for the executed model
    title('-','Model Performance Metrics',124)
    mse = metrics.mean_squared_error(y_test, y_pred)
    print('R2: %0.2f'%round(metrics.r2_score(y_test, y_pred),2))
    print('Mean Squared Error: %0.2f'%round(mse,2))
    print('Root Mean Squared Error: %0.2f'%round(np.sqrt(mse),2))
    print('Mean Absolute Error: %0.2f'%round(metrics.mean_absolute_error(y_test, y_pred),2))
    print('Maximum Error: %0.2f'%round(metrics.max_error(y_test, y_pred),2))
    print('\n')

    # Generating and plotting ROC-AUC Curve
    title('-','ROC-AUC Curve',124)
    if y.nunique()==2:
        # With a two-valued target the regressor output works as a ranking score for the ROC curve
        logit_roc_auc = metrics.roc_auc_score(y_test, y_pred)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred,pos_label=1)
        plt.figure()
        plt.plot(fpr, tpr, label='{m} (Area = %0.2f)'.format(m=model)%logit_roc_auc)
        plt.plot([0,1],[0,1],'r--')
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver operating characteristic')
        plt.legend(loc="lower right")
        plt.savefig('Log_ROC')
        plt.show()
    else:
        # roc_auc_score / roc_curve raise on a non-binary target, so the section is skipped
        print('ROC-AUC curve skipped: the target variable is not binary')
    print('\n')

    # Calculating and plotting Cost Complexity Pruning Path
    title('-','Cost Complexity Pruning Path',124)
    path=reg.cost_complexity_pruning_path(X_train,y_train)
    ccp_alphas,impurities=path.ccp_alphas,path.impurities
    fig,ax=plt.subplots()
    # The last alpha is left out of the plot, as is done for the other pruning plots below
    ax.plot(ccp_alphas[:-1],impurities[:-1],marker='o',drawstyle='steps-post')
    plt.xticks(rotation=45, horizontalalignment='right')
    ax.set_xlabel('Effective Alpha')
    ax.set_ylabel('Total Impurity of Leaves')
    ax.set_title('Total Impurity vs Effective Alphas \n(calculated for the training set)')
    plt.show()
    print('\n')

    # Fitting one tree per effective alpha
    mods=[tree.DecisionTreeRegressor(random_state=0,ccp_alpha=ccp_alpha).fit(X_train,y_train)
          for ccp_alpha in ccp_alphas]
    print('The number of nodes in the last tree is: \033[1m{}\033[0m with ccp_alpha = \033[1m{}\033[0m'
          .format(mods[-1].tree_.node_count,ccp_alphas[-1]))
    mods=mods[:-1]
    ccp_alphas=ccp_alphas[:-1]
    node_counts=[mod.tree_.node_count for mod in mods]
    depth=[mod.tree_.max_depth for mod in mods]
    fig,ax=plt.subplots(2,1)
    ax[0].plot(ccp_alphas,node_counts,marker='o',drawstyle='steps-post')
    ax[0].set_xlabel('Alpha')
    ax[0].set_ylabel('Number of Nodes')
    ax[0].set_title('Number of Nodes vs Alpha')
    ax[1].plot(ccp_alphas,depth,marker='o',drawstyle='steps-post')
    ax[1].set_xlabel('Alpha')
    ax[1].set_ylabel('Depth of Tree')
    ax[1].set_title('Depth vs Alpha')
    fig.tight_layout()
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.show()

    # R2 score vs alpha for training and testing sets
    # (DecisionTreeRegressor.score returns the R2 coefficient, not an accuracy)
    train_scores=[mod.score(X_train,y_train) for mod in mods]
    test_scores=[mod.score(X_test,y_test) for mod in mods]
    fig,ax=plt.subplots()
    ax.set_xlabel('Alpha')
    ax.set_ylabel('R2 Score')
    ax.set_title('R2 Score vs Alpha \ncalculated for training and testing sets')
    ax.plot(ccp_alphas,train_scores,marker='o',label='Training set',drawstyle='steps-post')
    ax.plot(ccp_alphas,test_scores,marker='o',label='Testing set',drawstyle='steps-post')
    ax.legend()
    plt.xticks(rotation=45, horizontalalignment='right')
    plt.show()

    d={'Training Scores':train_scores, 'Testing Scores':test_scores,'Alphas':ccp_alphas}
    a=pd.DataFrame(d)
    print(a)
    print('\n')

    # Generating Decision Tree graph
    title('-','Decision Tree graph',124)
    with sns.axes_style():
        fig,ax=plt.subplots(figsize=(20,15))
        # class_names is not passed: a regression tree has no classes to name
        tree.plot_tree(decision_tree=reg,
                        feature_names=feature_columns,
                        fontsize=10,
                        impurity=True,
                        filled=True,
                        label='all')
        plt.title('{m} - Tree Graph'.format(m=model))
        plt.show()

## LOGISTIC REGRESSION

In [9]:
def logReg (feature_columns,target_column,data_frame,os_thres,n,*args,**kwargs):
    '''
    The logReg function is for executing a logistic regression model on the n features selected by recursive
    feature elimination (RFE) and reporting a complete performance analysis of the fitted model.
    The function prints the accuracy, Jaccard, precision and recall scores and the classification report, and plots
    the confusion matrix and the ROC-AUC curve (the ROC figure is also saved to the file 'Log_ROC').
    Nothing is returned: every result is printed or shown as a matplotlib figure.
    The scores and the ROC curve are computed with pos_label=1, so the target is expected to be binary with the
    positive class encoded as 1.
    The function parameters you should pass are:
    - feature_columns: list of str
        It is the list of candidate features; RFE keeps n of them to train the model
    - target_column: str
        It is the target variable or class to predict with the model
    - data_frame: pandas.DataFrame
        It is the dataframe that the feature_columns and the target_column belong to.
    - os_thres: float
        Represents the threshold up to which the oversampling won't be needed given the ratio between the minority
        and the majority class. The threshold is represented with a float between 0 and 1. When os_thres is
        greater than that ratio, the training split is oversampled with SMOTE.
    - n: int
        Number of features that RFE selects (passed as n_features_to_select).
    - *args, **kwargs:
        Extra arguments forwarded to sklearn.linear_model.LogisticRegression, both for the RFE estimator and for
        the final model (random_state=0 is always set).
    '''

    # Importing libraries
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from imblearn.over_sampling import SMOTE
    from sklearn import metrics
    from sklearn.feature_selection import RFE
    import matplotlib.pyplot as plt
    import seaborn as sns

    def title(symbol,string,width):
        # Print `string` with the same amount of `symbol` padding on both sides
        pad = symbol*int((width-len(string))/2)
        print(pad+string+pad)

    # Defining target and features variables
    X = data_frame[feature_columns]   # Features
    y = data_frame[target_column]     # Target

    # Checking features to run the model
    lr=LogisticRegression(max_iter=10000,random_state=0,*args,**kwargs)
    rfe=RFE(lr,n_features_to_select=n)
    rfe=rfe.fit(X,y)
    # Features ranked 1 by RFE are the selected ones
    features=[column for column,rank in zip(feature_columns,rfe.ranking_) if rank==1]
    X = data_frame[features]

    # Measuring the class imbalance
    class_counts = y.value_counts()
    min_y = int(class_counts.min())   # Minority class
    max_y = int(class_counts.max())   # Majority class
    min_ratio = min_y/max_y           # ratio of the minority class by the majority class
    title('-','Target variable sampling ratio',124)
    print('Minority Class: '+str(min_y)+
          '\nMajority Class: '+str(max_y)+
          '\nMinority Proportion: '+str(round(min_ratio,2)))

    # Generating Oversampling
    if os_thres>min_ratio:
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20) # 80% training and 20% test
        # Only the training split is resampled, the test split is left untouched
        smote = SMOTE(sampling_strategy='auto',random_state=0,k_neighbors=5)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        title('-','Oversampling generated with SMOTE technique',124)
        train_counts = y_train.value_counts()
        fig,ax=plt.subplots(1,2,figsize=(10,5))
        ax[0].bar(class_counts.index,class_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[0].set_ylabel('Count')
        ax[0].set_xlabel('Class')
        ax[0].set_title('Countplot of Each Class before oversampling\nwithin the Target Variable')
        ax[1].bar(train_counts.index,train_counts.values,color=['darkorange','blue'],alpha=0.8)
        ax[1].set_ylabel('Count')
        ax[1].set_xlabel('Class')
        ax[1].set_title('Countplot of Each Class after oversampling\nwithin the Target Variable')
        plt.show()
    else:
        title('-','No Oversampling generated',124)
        # The target must be passed as `x`: a positional argument is taken as `data` by seaborn
        sns.countplot(x=y)
        plt.ylabel('Count')
        plt.xlabel('Class')
        plt.title('Countplot of Each Class \nwithin the Target Variable')
        plt.show()
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) # 70% training and 30% test

    # Generating the Logistic Regression model
    model = 'Logistic Regression'
    sc = StandardScaler()
    X_train=sc.fit_transform(X_train)
    # The test split is scaled with the mean/std learned on the training split;
    # re-fitting the scaler here would put both splits on different scales
    X_test=sc.transform(X_test)
    clf = LogisticRegression(random_state=0,*args,**kwargs)
    clf = clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)

    # Generating metrics for the executed model
    title('-','Model Performance Metrics',124)
    print('Accuracy: %0.2f'%round(metrics.accuracy_score(y_test, y_pred),2))
    print('Jaccard: %0.2f'%round(metrics.jaccard_score(y_test, y_pred,pos_label=1),2))
    print('Precision: %0.2f'%round(metrics.precision_score(y_test, y_pred,pos_label=1),2))
    print('Recall: %0.2f'%round(metrics.recall_score(y_test, y_pred,pos_label=1),2))
    print('\n')

    # Generating classification report
    title('-','Classification Report',124)
    print(metrics.classification_report(y_test, y_pred))
    print('\n')

    # Generating Confusion Matrix
    title('-','Confusion Matrix',124)
    cm_t = metrics.confusion_matrix(y_test, y_pred)
    with sns.axes_style("white"):
        fig, ax = plt.subplots(figsize=(4,4))
        ax = sns.heatmap(cm_t,
                        square=True,
                        annot=True,
                        fmt="d",
                        cbar=False)
        ax.xaxis.set_ticklabels(['Negative', 'Positive'])
        ax.yaxis.set_ticklabels(['Negative', 'Positive'])
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
    plt.title('Confusion Matrix of the {m}'.format(m=model))
    plt.show()
    print('\n')

    # Generating and plotting ROC-AUC Curve
    title('-','ROC-AUC Curve',124)
    # The ROC curve needs a continuous score: hard 0/1 predictions collapse it to a single
    # operating point, so the predicted probability of the positive class (1) is used instead
    pos_idx = list(clf.classes_).index(1)
    y_score = clf.predict_proba(X_test)[:,pos_idx]
    logit_roc_auc = metrics.roc_auc_score(y_test, y_score)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score,pos_label=1)
    plt.figure()
    plt.plot(fpr, tpr, label='{m} (Area = %0.2f)'.format(m=model)%logit_roc_auc)
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.savefig('Log_ROC')
    plt.show()
    print('\n')

## RANDOM FOREST CLASSIFIER

In [11]:
def randomForest_clf(feature_columns, target_columns, data_frame,maxFeat,numEst,crit,os_thres=0.0):
    '''
    The randomForest_clf function is for executing a random forest classifier based on a certain amount of features
    of a dataset and reporting a complete performance analysis of the fitted model.
    The function prints the feature importances, the accuracy, Jaccard, precision and recall scores and the
    classification report, and plots the feature importances, the confusion matrix and the ROC-AUC curve.
    Nothing is returned: every result is printed or shown as a matplotlib figure.
    The scores and the ROC curve are computed with pos_label=1, so the target is expected to be binary with the
    positive class encoded as 1.
    The function parameters you should pass are:
    - feature_columns: list of str
        It is the list of features that the model will evaluate to predict the target or class
    - target_columns: str
        It is the target variable or class to predict with the model
    - data_frame: pandas.DataFrame
        It is the dataframe that the feature_columns and the target_columns belong to.
    - maxFeat:
        Passed to RandomForestClassifier as max_features
    - numEst: int
        Passed to RandomForestClassifier as n_estimators
    - crit: str
        Passed to RandomForestClassifier as criterion
    - os_thres: float, default=0.0
        Represents the threshold up to which the oversampling won't be needed given the ratio between the minority
        and the majority class. When os_thres is greater than that ratio, the training split is oversampled with
        SMOTE; the default 0.0 never triggers the oversampling.
    '''

    # Importing libraries
    import pandas as pd
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from imblearn.over_sampling import SMOTE
    from sklearn import metrics
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Name used in the plot titles and legends
    model = 'Random Forest Classifier'

    # Defining target and features variables
    X = data_frame[feature_columns]   # Features
    y = data_frame[target_columns]    # Target

    # Measuring the class imbalance
    class_counts = y.value_counts()
    min_y = int(class_counts.min())   # Minority class
    max_y = int(class_counts.max())   # Majority class
    min_ratio = min_y/max_y           # ratio of the minority class by the majority class
    print('----------Target variable sampling ratio----------')
    # The target must be passed as `x`: a positional argument is taken as `data` by seaborn
    sns.countplot(x=y)
    plt.ylabel('Count')
    plt.xlabel('Class')
    plt.title('Countplot of Each Class \nwithin the Target Variable')
    plt.show()
    print('Minority Class: '+str(min_y)+
          '\nMajority Class: '+str(max_y)+
          '\nMinority Proportion: '+str(round(min_ratio,2)))

    # Generating Oversampling
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30) # 70% training and 30% test
    if os_thres>min_ratio:
        # Only the training split is resampled, the test split is left untouched
        smote = SMOTE(sampling_strategy='auto',random_state=0,k_neighbors=5)
        X_train, y_train = smote.fit_resample(X_train, y_train)
        print('----------Oversampling generated with SMOTE technique----------\n')
    else:
        print('----------No Oversampling generated----------\n')

    # Create a Random Forest Classifier with the hyper-parameters received from the caller
    clf=RandomForestClassifier(random_state=0,max_features=maxFeat,n_estimators=numEst,criterion=crit)
    ## Train the model using the training set and predict the test set
    clf.fit(X_train,y_train)
    y_pred=clf.predict(X_test)

    # Feature importances as percentages, most important first
    feature_imp = pd.Series(clf.feature_importances_*100,index=feature_columns).sort_values(ascending=False)
    print(feature_imp)

    # Creating a bar plot
    sns.barplot(x=feature_imp, y=feature_imp.index)
    # Add labels to your graph
    plt.xlabel('Feature Importance Score')
    plt.ylabel('Features')
    plt.title("Visualizing Important Features")
    plt.show()

    # Generating metrics for the executed model
    print('----------Model Performance Metrics----------')
    print('Accuracy: %0.2f'%round(metrics.accuracy_score(y_test, y_pred),2))
    print('Jaccard: %0.2f'%round(metrics.jaccard_score(y_test, y_pred,pos_label=1),2))
    print('Precision: %0.2f'%round(metrics.precision_score(y_test, y_pred,pos_label=1),2))
    print('Recall: %0.2f'%round(metrics.recall_score(y_test, y_pred,pos_label=1),2))
    print('\n')

    # Generating classification report
    print('----------Classification Report----------')
    print(metrics.classification_report(y_test, y_pred))
    print('\n')

    # Generating Confusion Matrix
    print('----------Confusion Matrix----------')
    cm_t = metrics.confusion_matrix(y_test, y_pred)
    with sns.axes_style("white"):
        fig, ax = plt.subplots(figsize=(4,4))
        ax = sns.heatmap(cm_t,
                        square=True,
                        annot=True,
                        fmt="d",
                        cbar=False)
        ax.xaxis.set_ticklabels(['Negative', 'Positive'])
        ax.yaxis.set_ticklabels(['Negative', 'Positive'])
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
    plt.title('Confusion Matrix of the {m}'.format(m=model))
    plt.show()
    print('\n')

    # Generating and plotting ROC-AUC Curve
    print('----------ROC-AUC Curve----------')
    # The ROC curve needs a continuous score: hard 0/1 predictions collapse it to a single
    # operating point, so the predicted probability of the positive class (1) is used instead
    pos_idx = list(clf.classes_).index(1)
    y_score = clf.predict_proba(X_test)[:,pos_idx]
    logit_roc_auc = metrics.roc_auc_score(y_test, y_score)
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score,pos_label=1)
    plt.figure()
    plt.plot(fpr, tpr, label='{m} (Area = %0.2f)'.format(m=model)%logit_roc_auc)
    plt.plot([0,1],[0,1],'r--')
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()
    print('\n')

## SUPPORT VECTOR MACHINES

In [None]:
def svm(feature_columns,target_column,data_frame,C=1.0,gamma=0.01,kernel='rbf'):
    '''
    The svm function fits a support vector classifier on two features of a dataset and plots its decision
    regions together with the observations.
    Nothing is returned: the result is shown as a matplotlib figure.
    The function parameters you should pass are:
    - feature_columns: list of str
        The two features used to fit the classifier; the first one is drawn on the horizontal axis and the
        second one on the vertical axis. A ValueError is raised when the list does not hold exactly two names.
    - target_column: str
        It is the class to predict with the model. Its values are also used as colour values in the plot,
        so they are assumed to be numeric.
    - data_frame: pandas.DataFrame
        It is the dataframe that the feature_columns and the target_column belong to.
    - C: float, default=1.0
        Passed to sklearn.svm.SVC as C
    - gamma: float, default=0.01
        Passed to sklearn.svm.SVC as gamma
    - kernel: str, default='rbf'
        Passed to sklearn.svm.SVC as kernel
    '''

    # Importing libraries
    import numpy as np
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt

    # The decision regions are drawn on a plane, so exactly two features are needed
    if len(feature_columns)!=2:
        raise ValueError('svm needs exactly two feature columns to plot the decision regions')

    # Defining target and features variables as numpy arrays (positional indexing is used below)
    x = data_frame[feature_columns].to_numpy()   # Features
    y = data_frame[target_column].to_numpy()     # Target

    # Bounds of the plotting grid, with a margin of one unit around the data
    x_min,x_max=x[:,0].min()-1,x[:,0].max()+1
    y_min,y_max=x[:,1].min()-1,x[:,1].max()+1
    # Grid step: one hundredth of the horizontal range
    h=(x_max-x_min)/100

    xx,yy=np.meshgrid(np.arange(x_min,x_max,h),np.arange(y_min,y_max,h))

    # Every grid node as a (feature 1, feature 2) row to be classified
    x_plot=np.c_[xx.ravel(),yy.ravel()]

    svc=SVC(kernel=kernel,C=C,gamma=gamma,decision_function_shape='ovr').fit(x,y)
    y_pred=svc.predict(x_plot)
    y_pred=y_pred.reshape(xx.shape)
    plt.figure(figsize=(16,9))
    plt.contourf(xx,yy,y_pred,cmap=plt.cm.tab10,alpha=0.3)
    plt.scatter(x[:,0],x[:,1],c=y,cmap=plt.cm.tab10)
    # Axes are labelled with the features actually plotted
    plt.xlabel(str(feature_columns[0]))
    plt.ylabel(str(feature_columns[1]))
    plt.xlim(xx.min(),xx.max())
    plt.title('SVC para las flores de Iris con Kernel '+kernel)
    plt.show()

# Builds an interactive widget that calls the plotting function with the C, gamma and kernel
# values chosen from the lists below.
# NOTE(review): `svm_iris` is not defined in the visible part of this file; the function defined
# above is named `svm` and also requires feature_columns, target_column and data_frame.
# Confirm which callable is intended. `fixed` is imported but not used in this call.
from ipywidgets import interact, fixed
interact(svm_iris,C=[0.01,0.1,1,10,100,1000,1e6,1e10],
         gamma=[1e-4,1e-3,1e-2,0.1,0.2,0.5,0.99],kernel=['rbf','linear','poly','sigmoid'])

## CONVOLUTIONAL NEURAL NETWORK

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

import numpy as np
import datetime
import tensorflow as tf
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Input, Activation, BatchNormalization
from tensorflow.keras.optimizers import SGD, Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import categorical_crossentropy
import matplotlib.pyplot as plt

# Limpiamos el directorio de logs
!rm -rf ./logs/ 

# Hyperparameters
batch_size = 100
num_classes = 10
epochs = 20

# Load the dataset, already split into training data and testing data
(X_train, y_train), (X_test, y_test) = cifar10.load_data()

# Rows, columns and channels of the CIFAR10 images, read from the shape of the test array
_ ,filas, columnas, canales = X_test.shape
print(X_test.shape)

# The pixel values are converted to float32 for training
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Normalize the pixel values to the 0..1 range
X_train = X_train/255
X_test = X_test/255

# The labels are converted to one-hot categorical vectors: an integer class index cannot be
# compared with the softmax output, so each label becomes an array of num_classes positions
# holding a single 1. Example: label 2 -----> [0 0 1 0 0 0 0 0 0 0]
y_train = to_categorical(y_train, num_classes)
y_test = to_categorical(y_test, num_classes)

# Architecture selector: 1 = small two-convolution network,
# 2 = deeper network with batch normalisation and dropout
opcion = 2

if opcion == 1:
    Entradas = Input(shape=(filas, columnas, canales))
    x = Conv2D(64, kernel_size=(3, 3), activation='relu')(Entradas)
    x = Conv2D(128, kernel_size=(3, 3), activation='relu')(x)
    x = MaxPooling2D(pool_size=(2, 2))(x)

    x = Flatten()(x)
    x = Dense(10, activation='relu')(x)
    x = Dense(num_classes, activation='softmax')(x)

elif opcion == 2:
    Entradas = Input(shape=(filas, columnas, canales))

    # Block 1: two 64-filter convolutions, then pooling and dropout
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(Entradas)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
    x = Dropout(0.25)(x)

    # Block 2: two 128-filter convolutions with batch normalisation
    # NOTE(review): the first convolution of blocks 2 and 3 already applies
    # ReLU and is then followed by BatchNormalization + Activation('relu');
    # confirm that the double activation is intended.
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(128, kernel_size=(3, 3))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
    x = Dropout(0.25)(x)

    # Block 3: two 256-filter convolutions with batch normalisation
    x = Conv2D(256, (3, 3), activation='relu', padding='same')(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv2D(256, kernel_size=(3, 3))(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = MaxPooling2D((2, 2), strides=(2, 2))(x)
    x = Dropout(0.25)(x)

    # Classifier head
    x = Flatten()(x)
    x = Dense(10, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(num_classes, activation='softmax')(x)

else:
    # Fail early with a clear message instead of a NameError on Entradas/x below
    raise ValueError("opcion must be 1 or 2, got %r" % (opcion,))

modelo = Model(inputs=Entradas, outputs=x)
modelo.summary()
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases.
# NOTE(review): beta_2=0.9 differs from Adam's usual 0.999 default; confirm it is deliberate.
optimizador = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.9)
modelo.compile(loss=categorical_crossentropy, optimizer=optimizador, metrics=['categorical_accuracy'])

# Set up the TensorBoard callback so the run can be inspected in TensorBoard;
# each run logs into its own timestamped directory under logs/fit/
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir = log_dir, histogram_freq=1)

# Train the model; the test split is also passed as the validation data
modelo.fit(X_train,
           y_train, 
           batch_size = batch_size, 
           epochs = epochs, 
           verbose=1, 
           validation_data = (X_test, y_test), 
           callbacks=[tensorboard_callback])

# Evaluate on the test split and print whatever evaluate() returns
puntuacion = modelo.evaluate(X_test, y_test, verbose=1)
print(puntuacion)

%tensorboard --logdir logs/fit
# https://www.tensorflow.org/tensorboard/get_started

## AUTOENCODER

In [None]:
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")

# import the necessary packages
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.datasets import mnist
from tensorflow.keras import backend as K
import numpy as np
import matplotlib.pyplot as plt
import cv2



class ConvAutoencoder:
    """Factory for a convolutional autoencoder split into encoder and decoder."""

    @staticmethod
    def build(width, height, depth, filters=(32, 64), latentDim=16):
        """Assemble the encoder, the decoder and the chained autoencoder.

        Args:
            width: Width of the input images.
            height: Height of the input images.
            depth: Number of channels of the input images.
            filters: Filter count for each stride-2 convolution of the encoder;
                the decoder walks the same sequence in reverse order.
            latentDim: Number of units of the dense latent vector.

        Returns:
            A 3-tuple ``(encoder, decoder, autoencoder)`` of Keras models.
        """
        # "Channels last" layout: the channel axis is the final one
        channel_axis = -1

        # ------------------------- ENCODER + LATENT SPACE -------------------------
        encoder_input = Input(shape=(height, width, depth))
        features = encoder_input

        # One CONV => LeakyReLU => BN stage per entry in `filters`
        for n_filters in filters:
            features = Conv2D(n_filters, (3, 3), strides=2, padding="same")(features)
            features = LeakyReLU(alpha=0.2)(features)
            features = BatchNormalization(axis=channel_axis)(features)

        # Keep the shape of the last conv volume so the decoder can rebuild it,
        # then flatten and project onto the latent vector
        conv_shape = K.int_shape(features)
        features = Flatten()(features)
        latent_code = Dense(latentDim)(features)

        encoder = Model(encoder_input, latent_code, name="encoder")

        # ------------------------- LATENT SPACE + DECODER -------------------------
        # The decoder takes a latent vector and first restores the conv volume
        decoder_input = Input(shape=(latentDim,))
        volume = Dense(np.prod(conv_shape[1:]))(decoder_input)
        volume = Reshape((conv_shape[1], conv_shape[2], conv_shape[3]))(volume)

        # One CONV_TRANSPOSE => LeakyReLU => BN stage per filter, in reverse order
        for n_filters in reversed(filters):
            volume = Conv2DTranspose(n_filters, (3, 3), strides=2, padding="same")(volume)
            volume = LeakyReLU(alpha=0.2)(volume)
            volume = BatchNormalization(axis=channel_axis)(volume)

        # A final transposed convolution brings the channel count back to `depth`
        volume = Conv2DTranspose(depth, (3, 3), padding="same")(volume)
        reconstruction = Activation("sigmoid")(volume)

        decoder = Model(decoder_input, reconstruction, name="decoder")

        # The autoencoder is simply the decoder applied to the encoder output
        autoencoder = Model(
            encoder_input,
            decoder(encoder(encoder_input)),
            name="autoencoder",
        )

        return (encoder, decoder, autoencoder)
    
    

# number of epochs to train for and mini-batch size
EPOCHS = 15
BS = 32

# load the MNIST dataset (the labels are not needed for an autoencoder)
print("[INFO] loading MNIST dataset...")
((trainX, _), (testX, _)) = mnist.load_data()

# add a channel dimension to every image in the dataset, then scale
# the pixel intensities to the range [0, 1]
trainX = np.expand_dims(trainX, axis=-1)
testX = np.expand_dims(testX, axis=-1)

trainX = trainX.astype("float32") / 255.0
testX = testX.astype("float32") / 255.0

# construct our convolutional autoencoder
print("[INFO] building autoencoder...")
(encoder, decoder, autoencoder) = ConvAutoencoder.build(28, 28, 1)
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by recent Keras releases
opt = Adam(learning_rate=1e-3)
autoencoder.compile(loss="mse", optimizer=opt)

# train the convolutional autoencoder to reproduce its own input
H = autoencoder.fit(trainX, trainX, validation_data=(testX, testX), epochs=EPOCHS, batch_size=BS)

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" after each table
encoder.summary()
decoder.summary()
autoencoder.summary()





# plot the training and validation loss for every epoch and save the figure
N = np.arange(0, EPOCHS)
plt.style.use("ggplot")
plt.figure()
# (legend label, key in the Keras history) for each curve, in drawing order
for curve_label, history_key in (("train_loss", "loss"), ("val_loss", "val_loss")):
    plt.plot(N, H.history[history_key], label=curve_label)
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend(loc="lower left")
plt.savefig("plot.png")

from google.colab.patches import cv2_imshow

# run the test images through the trained autoencoder
print("[INFO] making predictions...")
decoded = autoencoder.predict(testX)
samples = 10

# for each of the first `samples` test images, convert the original and its
# reconstruction back to 8-bit pixels and place them side by side
rows = [
    np.hstack([
        (testX[i] * 255).astype("uint8"),
        (decoded[i] * 255).astype("uint8"),
    ])
    for i in range(0, samples)
]

# stack the side-by-side pairs vertically into a single montage
outputs = np.vstack(rows)

# save the montage to disk
cv2.imwrite("output.png", outputs)



## GAN

In [None]:
from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Input, Dense, Reshape, Flatten, Dropout
from tensorflow.keras.layers import BatchNormalization, Activation, ZeroPadding2D, LeakyReLU, UpSampling2D, Conv2D
#from tensorflow.keras.layers.advanced_activations import LeakyReLU
#from tensorflow.keras.layers.convolutional import UpSampling2D, Conv2D
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import sys
import numpy as np




class GAN():
    """Fully connected GAN for 28x28 single-channel images, trained on MNIST.

    The discriminator is compiled on its own. The generator is trained through
    ``combined``, a stacked generator -> discriminator model in which the
    discriminator is marked as non-trainable.
    """

    def __init__(self, sample_dir='/content/gdrive/My Drive/ganimages/'):
        """Build and compile the discriminator, the generator and the stack.

        Args:
            sample_dir: Directory where ``sample_images`` writes its PNG grids.
                Defaults to the location that was previously hard-coded.
        """
        self.img_rows = 28
        self.img_cols = 28
        self.channels = 1
        self.img_shape = (self.img_rows, self.img_cols, self.channels)
        self.latent_dim = 100
        self.sample_dir = sample_dir

        optimizer = Adam(0.0002, 0.5)

        # Build and compile the discriminator
        self.discriminator = self.build_discriminator()
        self.discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Build the generator
        self.generator = self.build_generator()

        # The generator takes noise as input and generates imgs
        z = Input(shape=(self.latent_dim,))
        img = self.generator(z)

        # For the combined model we will only train the generator
        self.discriminator.trainable = False

        # The discriminator takes generated images as input and determines validity
        validity = self.discriminator(img)

        # The combined model (stacked generator and discriminator)
        # trains the generator to fool the discriminator
        self.combined = Model(z, validity)
        self.combined.compile(loss='binary_crossentropy', optimizer=optimizer)

    def build_generator(self):
        """Return a model mapping a latent vector to an image of ``img_shape``.

        Three Dense => LeakyReLU => BatchNormalization stages (256, 512 and
        1024 units) feed a tanh output layer that is reshaped to the image.
        """
        model = Sequential()

        model.add(Dense(256, input_dim=self.latent_dim))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(1024))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(np.prod(self.img_shape), activation='tanh'))
        model.add(Reshape(self.img_shape))

        model.summary()

        noise = Input(shape=(self.latent_dim,))
        img = model(noise)

        return Model(noise, img)

    def build_discriminator(self):
        """Return a model mapping an image to a single sigmoid validity score."""
        model = Sequential()

        model.add(Flatten(input_shape=self.img_shape))
        model.add(Dense(512))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(256))
        model.add(LeakyReLU(alpha=0.2))
        model.add(Dense(1, activation='sigmoid'))
        model.summary()

        img = Input(shape=self.img_shape)
        validity = model(img)

        return Model(img, validity)

    def train(self, epochs, batch_size=128, sample_interval=50):
        """Alternate discriminator and generator updates on MNIST.

        Each "epoch" here is one training step on a single random mini-batch,
        not a full pass over the dataset.

        Args:
            epochs: Number of training steps to run.
            batch_size: Number of real (and generated) images per step.
            sample_interval: A grid of generated samples is saved whenever the
                step index is a multiple of this value.
        """
        # Load the dataset (only the training images are used)
        (X_train, _), (_, _) = mnist.load_data()

        # Rescale to [-1, 1] to match the generator's tanh output, then add
        # the channel axis
        X_train = X_train / 127.5 - 1.
        X_train = np.expand_dims(X_train, axis=3)

        # Adversarial ground truths
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        for epoch in range(epochs):

            # ---------------------
            #  Train Discriminator
            # ---------------------

            # Select a random batch of images
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            imgs = X_train[idx]

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # Generate a batch of new images; verbose=0 keeps predict() from
            # printing a progress bar on every training step
            gen_imgs = self.generator.predict(noise, verbose=0)

            # Train the discriminator on real and generated batches and
            # average the two [loss, accuracy] results
            d_loss_real = self.discriminator.train_on_batch(imgs, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_imgs, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # ---------------------
            #  Train Generator
            # ---------------------

            noise = np.random.normal(0, 1, (batch_size, self.latent_dim))

            # Train the generator (to have the discriminator label samples as valid)
            g_loss = self.combined.train_on_batch(noise, valid)

            # Report the progress
            print ("%d [D loss: %f, acc.: %.2f%%] [G loss: %f]" % (epoch, d_loss[0], 100*d_loss[1], g_loss))

            # If at save interval => save generated image samples
            if epoch % sample_interval == 0:
                self.sample_images(epoch)

    def sample_images(self, epoch):
        """Save a 5x5 grid of generated images as ``<sample_dir>/<epoch>.png``.

        Args:
            epoch: Step index, used as the file name.
        """
        # Local import: the cell's import list does not bring in `os`
        import os

        r, c = 5, 5
        noise = np.random.normal(0, 1, (r * c, self.latent_dim))
        gen_imgs = self.generator.predict(noise, verbose=0)

        # Rescale images from [-1, 1] to [0, 1]
        gen_imgs = 0.5 * gen_imgs + 0.5

        fig, axs = plt.subplots(r, c)
        # axs.flat walks the grid row by row, matching the order of gen_imgs
        for cnt, ax in enumerate(axs.flat):
            ax.imshow(gen_imgs[cnt, :, :, 0], cmap='gray')
            ax.axis('off')

        # Create the target directory on demand so savefig cannot fail on a
        # missing folder
        os.makedirs(self.sample_dir, exist_ok=True)
        fig.savefig(os.path.join(self.sample_dir, "%d.png" % epoch))
        # Close this specific figure so figures do not pile up across calls
        plt.close(fig)


# Entry point: build the GAN and run 10000 training steps with batches of 32,
# passing sample_interval=500 so a sample grid is saved every 500 steps
if __name__ == '__main__':
    gan = GAN()
    gan.train(epochs=10000, batch_size=32, sample_interval=500)

## RNN-LSTM

In [None]:
import numpy as np
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Input, Flatten, MaxPooling1D, Dense, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence,text
from tensorflow.keras import Model
import matplotlib.pylab as plt

# Input settings
batch_size = 64
epochs = 3 #30
# Embedding: a special layer (generally) used in text processing; it assigns each word a dense, unique vector
tamano_embedding = 32
# Maximum number of distinct words kept from the dataset
maximas_palabras = 5000
# Maximum length of the reviews in the dataset
maxima_longitud = 500

# Load the dataset
(xentrenamiento, yentrenamiento),(xtest, ytest) = imdb.load_data(num_words=maximas_palabras)

# Bring every sequence to length maxima_longitud, for both the training and the test split
xentrenamiento = sequence.pad_sequences(xentrenamiento, maxlen=maxima_longitud)
xtest = sequence.pad_sequences(xtest, maxlen=maxima_longitud)

# The data is already vectorised:
# the imdb dataset ships that way; otherwise the words would have to be vectorised with scikit-learn or another library
print(xentrenamiento[:10])
print(yentrenamiento[:10])



# Build the model: Embedding => LSTM => single sigmoid unit
entrada = Input(shape=(maxima_longitud, ))
x = Embedding(maximas_palabras, tamano_embedding)(entrada)
#x = Dropout(0.2)(x)
#x = LSTM(100, return_sequences=True, activation='relu')(x)
x = LSTM(100, dropout=0.2, recurrent_dropout=0.2)(x)
#x = Dropout(0.2)(x)
#x = Flatten()(x)
x = Dense(1, activation="sigmoid")(x)

# Compile for binary classification and print the layer summary
modelo = Model(inputs=entrada, outputs=x)
modelo.compile(loss='binary_crossentropy',
               optimizer='adam',
               metrics=['binary_accuracy'])
modelo.summary()

# Training
# The checkpoint callback writes the model to 'comentarios.h5', keeping only
# the best one seen across the epochs according to val_binary_accuracy
checkpoint = ModelCheckpoint('comentarios.h5',
                             monitor='val_binary_accuracy',
                             verbose=1,
                             save_best_only=True,
                             save_weights_only=False,
                             mode='auto')
# The test split is passed as the validation data
history=modelo.fit(xentrenamiento, 
                   yentrenamiento, 
                   batch_size=batch_size, 
                   epochs=epochs, 
                   callbacks=[checkpoint], 
                   validation_data=(xtest,ytest), 
                   verbose=1)

# Evaluate the model on the test split; scores[1] is the compiled metric (binary_accuracy)
scores = modelo.evaluate(xtest, ytest, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))


# Plot the results: figure 1 shows the losses and figure 2 the accuracies,
# each with the validation curve drawn first and the training curve second
for numero_figura, clave_validacion, clave_entrenamiento, titulo, etiqueta_y in (
    (1, 'val_loss', 'loss', 'Perdidas del Modelo', 'Perdidas'),
    (2, 'val_binary_accuracy', 'binary_accuracy', 'Precision del Modelo', 'Precision'),
):
    plt.figure(numero_figura)
    plt.plot(history.history[clave_validacion])
    plt.plot(history.history[clave_entrenamiento])
    plt.title(titulo)
    plt.ylabel(etiqueta_y)
    plt.xlabel('Epocas')
    plt.legend(['Test','Entrenamiento'], loc='upper left')



# Load the weights stored in the checkpoint file and score the first five
# training reviews
modelo.load_weights('comentarios.h5')
muestra_x = xentrenamiento[:5]
muestra_y = yentrenamiento[:5]
print(muestra_x)
print(muestra_y)
predicciones = modelo.predict(muestra_x, verbose=1)
print(predicciones)

# NEGATIVE review (defined as a score < 0.5)
mascara_negativos = predicciones < 0.5
res_pred = predicciones[mascara_negativos]
print(res_pred)


# POSITIVE review (defined as a score >= 0.5)
mascara_positivos = predicciones >= 0.5
res_pred = predicciones[mascara_positivos]
print(res_pred)

