In [1]:
# Retrieves the helper functions that report detailed information about the data
%run BasicDataExploration.ipynb

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay 

In [2]:
def initialCleanup(dataframe):
    '''
        Apply the shared first-pass cleanup to a dataframe.

        Two helpers are applied in order. Both are defined outside this cell
        (presumably brought in by the %run of BasicDataExploration.ipynb -- TODO confirm):
            cleanColumnHeaders -- per the original notes, removes the spaces from the column names
            fixColumnTypes     -- per the original notes, converts object columns to strings

        NOTE(review): the original notes also mention lower-casing and trimming the
        text values; whether these two helpers do that cannot be confirmed from here.

        Input:
            dataframe -- The dataframe to clean
        Output:
            The value returned by fixColumnTypes for the header-cleaned dataframe
    '''
    return fixColumnTypes(cleanColumnHeaders(dataframe))


In [3]:
def createDataframesForModel(dataframe, column_list):
    '''
        Split a dataframe into one frame per gender.

        Input:
            dataframe   -- The source dataframe; it is not modified (a deep copy is taken)
            column_list -- The columns to keep; the Gender column is needed for the split
        Output:
            A tuple (female rows, male rows).  Gender values are trimmed and
            lower-cased before comparing against 'f' and 'm'; rows with any other
            gender value end up in neither frame.
    '''
    selected = dataframe[column_list].copy(deep=True)
    normalized_gender = selected.Gender.str.strip().str.lower()
    selected.Gender = normalized_gender

    is_female = selected['Gender'] == 'f'
    is_male = selected['Gender'] == 'm'
    return (selected[is_female], selected[is_male])

In [4]:
def createTestTrainData(dataframe, test_size, random_state=None):
    '''
        Create the Train/Test dataset for one dataframe.
        The split is stratified on the SARCOPENIA target so that the train and
        test sets keep the same class proportions as the input.

        Inputs:
            dataframe    -- A dataframe with the features and the SARCOPENIA target column
            test_size    -- The fraction of rows to put in the test set ( 0 < x < 1 )
            random_state -- Optional seed forwarded to train_test_split so the split
                            can be reproduced; None keeps the split random
        Output
            The tuple (XTrain, XTest, yTrain, yTest)
        Raises
            ValueError (from train_test_split) when a class has too few rows to
            be represented in both the train and the test set
    '''
    target = dataframe['SARCOPENIA']
    features = dataframe.drop('SARCOPENIA', axis=1)

    # train_size is left to default to the complement of test_size; passing
    # 1 - test_size explicitly can lose a row to floating point rounding.
    (XTrain, XTest, yTrain, yTest) = train_test_split(
        features,
        target,
        test_size=test_size,
        stratify=target,
        random_state=random_state)

    return (XTrain, XTest, yTrain, yTest)

In [5]:
def showBalanceOfDataset(columns,*targetSeries):
    '''
        Plots, as a side effect, the number of rows having / not having the disease
        for each dataset.

        Inputs:
            columns -- A title for each dataset.  When None or empty the labels
                       "Train Male", "Test Male", "Train Female", "Test Female" are used
            targetSeries -- The collection of the target series, one for each item in
                       the column list.  The value 0 is counted as "No Disease" and
                       the value 1 as "Disease"

        Output:
            None

        Raises:
            ValueError -- When the number of titles differs from the number of series
    '''
    default_labels = ["Train Male", "Test Male", "Train Female", "Test Female"]
    if columns is None or len(columns) == 0:
        labels = default_labels
    elif isinstance(columns, str):
        # A bare string is one title, not a sequence of one-character titles
        labels = [columns]
    else:
        labels = list(columns)

    if len(labels) != len(targetSeries):
        raise ValueError(
            "Expected one title per dataset: got %d titles for %d datasets"
            % (len(labels), len(targetSeries)))

    disease_yes = []
    disease_no = []
    for series in targetSeries:
        counts = series.value_counts()
        # .get avoids a KeyError when a split happens to contain only one class
        disease_no.append(counts.get(0, 0))
        disease_yes.append(counts.get(1, 0))

    plt.figure(figsize=(10,5))
    X_axis = np.arange(len(labels))

    plt.bar(X_axis - 0.2, disease_no, 0.4, label = 'No Disease')
    plt.bar(X_axis + 0.2, disease_yes,0.4, label = 'Disease')
    plt.xticks(X_axis, labels)
    plt.ylabel("Counts")
    plt.xlabel("Data Set")
    plt.legend()

In [6]:
def cleanDataSet(dataframe):
    '''
        Trim and lower-case the text in every column selected by the
        'string[python]' dtype filter.

        Input:
            dataframe -- The dataframe to clean.  It is modified in place
        Output:
            The same dataframe object, with the selected columns cleaned
    '''
    for name in dataframe.select_dtypes(include='string[python]').columns:
        trimmed = dataframe[name].str.strip()
        dataframe[name] = trimmed.str.lower()
    return dataframe
    

In [7]:
def cleanEducation(dataframe, field):
    '''
        Collapses the many spellings found in the education column into a small
        set of labels so the column is easy to convert to categorical data.

        input
            dataframe -- The frame with the education field.  It is modified in place
            field -- The name of the education field

        output
            The same dataframe with a cleaned Education Field.  Null values and
            values that are not in the translation table are left untouched.
    '''
    # Condensed label -> every raw spelling that maps onto it.
    # The labels (including the spelling "elementry") are kept exactly as they were.
    spellings_by_label = {
        "illiterate": ("illiterate", "illeterate", "i̇lliterate", "ılliterate", "okur-yzar değil"),
        "university": ("university", "üniversite"),
        "high_school": ("high school", "secondary school", "high_school", "highschool", "lise"),
        "elementry": ("primary_school", "ilkokul"),
        "junior_high": ("middle school", "junior high", "junior_high", "ortaokul"),
    }

    # Invert into the raw spelling -> condensed label lookup used per cell
    translation_education = {
        spelling: label
        for label, spellings in spellings_by_label.items()
        for spelling in spellings
    }

    def fix(value):
        if pd.isnull(value):
            return value
        return translation_education.get(value, value)

    dataframe[field] = dataframe[field].apply(fix)
    return dataframe


In [8]:
def createGraph(rows,cols):
    '''
        Create a 15x5 figure holding a rows x cols grid of subplots.

        Input:
            rows -- Number of subplot rows
            cols -- Number of subplot columns
        Output:
            The figure and the subplot axes flattened with np.ravel
    '''
    figure, axes_grid = plt.subplots(rows, cols, figsize=(15,5))
    return figure, np.ravel(axes_grid)

In [9]:
def createHeatMap(dataframe, index, axes=None):
    '''
        Draw a correlation heat map of the numeric columns of a dataframe.

        Input:
            dataframe -- The dataframe to correlate; only its numeric columns are used
            index -- Position, within the collection of axes, of the subplot to draw on
            axes -- Optional collection of matplotlib axes.  When omitted the
                    notebook-level variable `ax` is used, as before
        Output:
            None
        Side Effect
            Draws the annotated heat map on axes[index].  The upper triangle
            (including the diagonal) is masked so each pair is shown once.
    '''
    if axes is None:
        # Backward compatible fallback: `ax` must have been created by an earlier cell
        axes = ax

    numerical_dataframe = dataframe.select_dtypes(include=np.number)
    correlation = numerical_dataframe.corr()
    sns.heatmap(correlation, mask=np.triu(correlation), annot=True, ax=axes[index])

In [10]:
def trainTestScore(pipeLine, parameters, cv,train,test):
    '''
        Run a grid search over the pipeline and generate the predictions for the test features

            pipeLine   -- The preprocessing and machine learning algorithm used in the experiment
            parameters -- The parameter grid handed to GridSearchCV as param_grid
            cv         -- Forwarded to the cv argument of GridSearchCV
            train      -- The features (item 0) and target (item 1) of the training set
            test       -- The features (item 0) and target (item 1) of the test dataset

        Output
            The fitted grid search and the array of predictions computed from the test features.

        Side Effect
            Prints the shape of the test features and the score on the test set.
            Any fit failure is raised immediately (error_score='raise').
    '''
    # Imported here so the function does not depend on another notebook having
    # brought GridSearchCV into scope.
    from sklearn.model_selection import GridSearchCV

    grid_pipeline = GridSearchCV(pipeLine, param_grid=parameters, cv=cv, return_train_score=True, verbose=3, error_score='raise')
    grid_pipeline.fit(train[0], train[1])
    print(test[0].shape)
    y_predictions = grid_pipeline.predict(test[0])
    print("Held-out test score is ", grid_pipeline.score(test[0], test[1]))

    return grid_pipeline, y_predictions

In [11]:
    def feature(grid):
        '''
            Draw a horizontal bar graph of the model coefficients, one bar per feature.

            Input:
                grid -- The fitted grid search.  Its best estimator is expected to hold
                        the model as its second step and a step named
                        "Column Transformers" that can report the feature names
            Output:
                None
            Side Effect
                Displays a bar graph showing how the features affect the target
        '''
        # The model is taken by position: second step of the best pipeline
        fitted_model = grid.best_estimator_.steps[1][1]
        weights = fitted_model.coef_[0].squeeze()
        feature_names = grid.best_estimator_.named_steps["Column Transformers"].get_feature_names_out()

        chart_data = pd.DataFrame(columns=["labels", "values"]).assign(
            labels=feature_names,
            values=weights,
        )
        chart_data.plot.barh(x="labels", y="values", legend=False)

In [17]:
from IPython.display import display, HTML

def results(grid, step_name, x, y, y_prediction, results_dataframe,filename):
    '''
        For each experiment displays the basic scores, the confusion matrix and the ROC curve,
        and appends the scores to the results file.

        Accuracy  -- Number of correct predictions ( True Positive + True Negative ) / Total number of predictions
        Precision -- True Positive / ( True Positive + False Positive )
        Recall    -- True Positive / ( True Positive + False Negative ) -- Proportion of actual positives identified correctly
        F1 -- Relies on both precision and recall so it represents both in one metric
            F1 = 2 / ( (1/recall) + (1/precision) ) == ( 2 * precision * recall ) / ( precision + recall ) ==

            ( 2 * True Positive ) / ( 2 * True Positive + False Positive + False Negative )

        Note : Usually improving Precision harms Recall and improving Recall harms Precision

        Roc Curve -- The performance of a classifier across thresholds, using two parameters :
            True Positive Rate and False Positive Rate

            TPR = True Positive / ( True Positive + False Negative )
            FPR = False Positive / ( False Positive + True Negative )
            Lowering the classification threshold classifies more items as positive, thus
            increasing both the False Positives and the True Positives

        AUC is the Area under the Roc Curve which is the measure of performance across all possible classification thresholds

        input:
            grid -- The fitted grid search (preprocessing and machine learning algorithm) used in the experiment
            step_name -- The name of the step that does the machine learning algorithm (currently unused)
            x -- The dataset containing the testing features (currently unused)
            y -- The dataset containing the true target; 0 is treated as "No Disease" and 1 as "Disease"
            y_prediction -- The targets predicted from the testing features
            results_dataframe -- The dataframe where the results are stored so we can use them in future runs.
                In order to get rid of an invalid run delete it from the file
            filename -- The filename for the storage of results_dataframe

        output:
            None

        side effect:
            Prints the scores, adds a row to results_dataframe (saved to filename) and
            displays the confusion matrix and the ROC curve
    '''
    # Imported locally (with an alias) so the helper below cannot end up calling itself
    from sklearn.metrics import auc as area_under_curve, roc_curve

    def auc(y, y_prediction):
        '''
            Calculate area under an ROC Curve

            Inputs:
                y -- The dataset containing the true target
                y_prediction -- The predicted targets

            Output
                Area Under the ROC Curve
        '''
        false_positive_rates, true_positive_rates, _ = roc_curve(y, y_prediction)
        return area_under_curve(false_positive_rates, true_positive_rates)

    # Report the cross validation scores of the winning candidate so they match best_params_
    best_index = grid.best_index_
    training_score = np.round(grid.cv_results_['mean_train_score'][best_index],2)
    test_score = np.round(grid.cv_results_['mean_test_score'][best_index],2)
    accuracy =  np.round(accuracy_score(y_true = y, y_pred = y_prediction),2)
    precision = np.round(precision_score(y, y_pred=y_prediction, zero_division=1),2)
    recall = np.round(recall_score(y, y_pred=y_prediction),2)
    auc_score = np.round(auc(y, y_prediction),2)

    print("The Training Score is ", training_score)
    print("The Test     Score is ", test_score)
    print("Accuracy     Score is ", accuracy)
    print("Precision    Score is ", precision)
    print("Recall       Score is ", recall)
    print("AUC          Score is ", auc_score)
    print("best               is ", grid.best_params_)

    addResultsToDataFrame(results_dataframe,
                          grid.best_params_,
                          training_score,
                          test_score,
                          accuracy,
                          precision,
                          recall,
                          filename)

    # Rows/columns are pinned to the order 0, 1 so the tick labels below line up
    text_confusion_matrix = confusion_matrix(y, y_pred=y_prediction, labels=[0, 1])
    matrix_display = ConfusionMatrixDisplay(confusion_matrix=text_confusion_matrix, display_labels=['No Disease', 'Disease'])
    matrix_display.plot()

    RocCurveDisplay.from_predictions(y, y_pred=y_prediction)
    plt.show()

In [13]:
    def feature_random_forest(grid):
        '''
            Draw a horizontal bar graph of the model's feature importances, one bar per feature.

            Input:
                grid -- The fitted grid search.  Its best estimator is expected to hold
                        the model (a random forest) as its second step and a step named
                        "Column Transformers" that can report the feature names
            Output:
                None
            Side Effect
                Prints the type of the model and displays a bar graph showing how the
                features affect the target
        '''
        # The model is taken by position: second step of the best pipeline
        fitted_model = grid.best_estimator_.steps[1][1]
        print("model = ", type(fitted_model))
        importances = fitted_model.feature_importances_.squeeze()
        feature_names = grid.best_estimator_.named_steps["Column Transformers"].get_feature_names_out()

        chart_data = pd.DataFrame(columns=["labels", "values"]).assign(
            labels=feature_names,
            values=importances,
        )
        chart_data.plot.barh(x="labels", y="values", legend=False)

In [14]:
def getResultsDataFrame(filename):
    '''
        Purpose get the dataframe to add runs to

        input:
            filename -- Name of the CSV file holding the results of earlier runs
        output
            The dataframe read from the file.  When the file does not exist, or
            exists but is completely empty, a new empty dataframe with the result
            columns is returned instead
    '''
    columns = ["Best Parameters", "Training Score", "Testing Score", "Accuracy", "Precision", "Recall"]
    try:
        return pd.read_csv(filename)
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # No earlier runs recorded yet: start a fresh results table
        return pd.DataFrame(columns=columns)
        

In [15]:
def addResultsToDataFrame(df, bestParameters, trainingScore, testScore, accuracy, precision, recall, filename):
    '''
        Adds one row of run results to the dataframe and saves the dataframe to a file

        Input:
            df -- The dataframe to append the data to; it is modified in place
            bestParameters -- The best parameters for the current run
            trainingScore -- The score of the training data
            testScore -- The score of the test data
            accuracy -- The accuracy score of the run
            precision -- The precision score of the run
            recall -- The recall score of the run
            filename -- The CSV file the dataframe is written to (without the index)
        Output:
            The same dataframe, now holding the extra row
    '''
    # The new row is stored under the label equal to the current row count
    next_label = len(df)
    df.loc[next_label] = [
        bestParameters,
        trainingScore,
        testScore,
        accuracy,
        precision,
        recall,
    ]
    df.to_csv(filename, index=False)

    return df

In [16]:
def getDataFrame(filename):
    '''
        Read a CSV file into a dataframe using the pandas defaults

        input:
            filename -- Name of the CSV file
        output
            The dataframe read from the file
    '''
    loaded = pd.read_csv(filename)
    return loaded