# Data Processing

In [186]:
#Imports
import numpy as np
import pandas as pd
pd.options.mode.use_inf_as_null = True
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [148]:
def descriptor_target_split(file):
    """
    Separate a combined DataFrame into its descriptor columns and its target column
        Parameters:
            file: pandas.DataFrame
                DataFrame holding the descriptor columns together with a 'Target' column.
        Returns:
            descriptors: pandas.DataFrame
                Every column whose label is not 'Target', in the original order.
            target: pandas.DataFrame
                The column(s) labelled 'Target', kept as a DataFrame (not a Series).
    """
    # One boolean mask over the column labels drives both selections.
    is_target = file.columns == 'Target'
    return file.loc[:, ~is_target], file.loc[:, is_target]

In [167]:
def descriptor_target_join(descriptors,target):
    """
    Merge the Descriptors and Target DataFrames
        Parameters:
            descriptors: pandas.DataFrame
                Descriptors DataFrame. It is left unmodified.
            target: pandas.DataFrame
                Target DataFrame; its 'Target' column is the one that gets attached.
        Returns:
            file: pandas.DataFrame
                New DataFrame containing the descriptor columns followed by a 'Target' column.
                Rows are matched on index labels, so both inputs must share the same index.
    """
    # assign() returns a new DataFrame instead of writing into the caller's
    # object, which also avoids chained-assignment warnings when `descriptors`
    # is a slice of a larger frame.
    file = descriptors.assign(Target = target['Target'])
    return file

In [140]:
def missing_value_imputation(file,missing_value_type = "NaN",strategy = "mean", axis = 0):
    """
    Imputes placeholder missing values in data
        Parameters:
            file: pandas.DataFrame
                Data which may have missing values. The placeholder replacement and, for axis=0,
                the imputation itself are applied to this object in place.
            missing_value_type:  string, optional (default="NaN")
                Placeholder for missing value. Every occurrence is converted to np.nan before imputing.
            strategy = string, optional (default="mean")
                Strategy for replacing missing values. It must be one of "mean", "median", or "mode".
                With "mode", the first of the most frequent values is used. Any other value leaves
                the missing entries untouched.
            axis = int, optional (default=0)
                Imputations along rows or columns. It must be one of 0 (statistic computed per column)
                or 1 (statistic computed per row).
        Returns:
            file: pandas.DataFrame
                The imputed data, or None (after printing a message) when axis is neither 0 nor 1.
                A message is printed if missing values remain after imputation.
    """
    # Validate before touching the data so an invalid call has no side effects.
    if axis not in (0, 1):
        print('Axis value incorrect')
        return

    file.replace(missing_value_type,np.nan,inplace = True)

    if axis == 0:
        for column in file.columns:
            if strategy == "mean":
                fill_value = file[column].mean()
            elif strategy == "median":
                fill_value = file[column].median()
            elif strategy == "mode":
                # mode() returns a Series; passing it to fillna() directly would
                # align on index labels instead of filling every gap.
                modes = file[column].mode()
                fill_value = modes.iloc[0] if len(modes) else np.nan
            else:
                # Unrecognised strategy: nothing to fill.
                break
            if pd.isnull(fill_value):
                # Column has no usable values to derive a statistic from.
                continue
            # Assign back rather than fillna(inplace=True) on a column selection,
            # which is not guaranteed to write through to the DataFrame.
            file[column] = file[column].fillna(fill_value)
    else:
        if strategy == "mean":
            row_fill = file.mean(axis=1)
        elif strategy == "median":
            row_fill = file.median(axis=1)
        elif strategy == "mode":
            row_modes = file.mode(axis=1)
            # Keep only the first mode per row so the result is a Series keyed by row label.
            row_fill = row_modes.iloc[:, 0] if row_modes.shape[1] else None
        else:
            row_fill = None
        if row_fill is not None:
            # After transposing, rows become columns, so the per-row Series
            # lines up with the column labels that fillna() matches on.
            file = file.T.fillna(row_fill).T

    if file.isnull().sum().sum() != 0:
        print('Missing values present')
    return file

In [141]:
def remove_low_variance_features(file,threshold_value = 0.01):
    """
    Feature selector that removes all low-variance features.
        Parameters:
            file: pandas.DataFrame
                Input Data from which to compute variances. Every column is treated as a feature.
            threshold_value : float, optional (default=0.01)
                Features with a variance lower than this threshold will be removed.
        Returns:
            file: pandas.DataFrame
                New DataFrame holding only the retained columns, with the original
                column labels and the original row index.
    """
    selector = VarianceThreshold(threshold = threshold_value)
    transformed_arrays = selector.fit_transform(file)
    kept_columns = file.columns[selector.get_support(indices = True)]
    # Carry the row index over: the bare array returned by the selector would
    # otherwise get a fresh 0..n-1 index and no longer line up with other
    # frames (e.g. a separately held target) derived from the same input.
    return pd.DataFrame(transformed_arrays,columns = kept_columns,index = file.index)

In [146]:
def remove_high_correlated_features(file,threshold_value):
    """
    Feature selector that removes all highly-correlated features.
        Parameters:
            file: pandas.DataFrame
                Input DataFrame to remove highly correlated features. A 'Target' column, if
                present, is never dropped and is not used when computing correlations.
            threshold_value : float
                A feature is removed when the absolute Pearson correlation between it and any
                feature appearing earlier in the column order is higher than this threshold.
        Returns:
            file: pandas.DataFrame
                New DataFrame without the redundant features. Non-numeric columns and columns
                whose correlation is undefined (e.g. constant columns) are kept.
    """
    candidate_columns = [name for name in file.columns if name != 'Target']
    numeric = file[candidate_columns].select_dtypes(include = [np.number])
    # Absolute value: a strong negative correlation is just as redundant as a positive one.
    correlation = numeric.corr().abs()
    # Keep only the strict upper triangle so each pair is inspected once and
    # the earlier feature of a correlated pair is the one that survives.
    upper_mask = np.triu(np.ones(correlation.shape, dtype = bool), k = 1)
    upper = correlation.where(upper_mask)
    redundant = [name for name in upper.columns if (upper[name] > threshold_value).any()]
    return file.drop(redundant,axis = 1)

In [168]:
def univariate_feature_selection(file,k_value = 10,score_function = "f_regression"):
    """
    Univariate feature selection works by selecting the best features based on univariate statistical tests.
    Selects features according to the k highest scores.
        Parameters:
            file: pandas.DataFrame
                Input DataFrame to perform univariate feature selection. Must contain a 'Target' column.
            k_value: int, optional, default=10
                Number of top features to select.
            score_function: string, optional, default="f_regression"
                Scoring function that return scores and pvalues. It must be one of "f_regression" or
                "mutual_info_regression". If none is given, "f_regression" is used
        Returns:
            file: pandas.DataFrame
                DataFrame with the selected descriptor columns followed by the 'Target' column,
                keeping the row index of the input.
        Raises:
            ValueError
                If score_function is not one of the supported names.
    """
    if score_function == "f_regression":
        from sklearn.feature_selection import f_regression as scorer
    elif score_function == "mutual_info_regression":
        from sklearn.feature_selection import mutual_info_regression as scorer
    else:
        raise ValueError(
            'score_function must be "f_regression" or "mutual_info_regression", got %r' % (score_function,)
        )
    # k is passed by keyword: recent scikit-learn releases reject it positionally.
    selector = SelectKBest(scorer,k = k_value)
    descriptors, target = descriptor_target_split(file)
    # scikit-learn expects a 1-D target; the split returns a single-column DataFrame.
    transformed_arrays = selector.fit_transform(descriptors,target.values.ravel())
    kept_columns = descriptors.columns[selector.get_support(indices = True)]
    # Re-attach the original index so the index-aligned join with `target`
    # pairs each row with its own target value.
    selected = pd.DataFrame(transformed_arrays,columns = kept_columns,index = descriptors.index)
    return descriptor_target_join(selected,target)

In [None]:
def tree_based_feature_selection(file,n_estimators_value = 10,max_features_value = None,threshold_value = "mean"):
    """
    Feature selection using a tree-based estimator to compute feature importances, which in turn can be used
    to discard irrelevant features
        Parameters:
            file: pandas.DataFrame
                Input DataFrame to perform tree based feature selection. Must contain a 'Target' column.
            n_estimators_value: int, optional, default=10
                Number of trees in the forest.
            max_features_value: {int, float, string}, optional, default=None
                The number of features to consider when looking for the best split. Forwarded unchanged
                as max_features of ExtraTreesClassifier; None means all features are considered.
            threshold_value: {float, string}, optional, default="mean"
                The threshold value to use for feature selection. Features whose importance is greater or
                equal are kept while the others are discarded. Forwarded unchanged as threshold of
                SelectFromModel (e.g. "mean", "median", "1.25*mean", or a number such as 1e-5).
        Returns:
            file: pandas.DataFrame
                DataFrame with the selected descriptor columns followed by the 'Target' column,
                keeping the row index of the input.
    """
    # NOTE(review): a classifier is fitted here, so 'Target' presumably holds
    # class labels — confirm, since the univariate selector uses regression scores.
    descriptors, target = descriptor_target_split(file)
    clf = ExtraTreesClassifier(n_estimators = n_estimators_value, max_features = max_features_value)
    # scikit-learn expects a 1-D target; the split returns a single-column DataFrame.
    clf = clf.fit(descriptors, target.values.ravel())
    model = SelectFromModel(clf, prefit=True,threshold=threshold_value)
    transformed_arrays = model.transform(descriptors)
    kept_columns = descriptors.columns[model.get_support(indices = True)]
    # Re-attach the original index so the index-aligned join with `target`
    # pairs each row with its own target value.
    selected = pd.DataFrame(transformed_arrays,columns = kept_columns,index = descriptors.index)
    return descriptor_target_join(selected,target)

In [184]:
def rfe_feature_selection(file, step_value = 1, max_features_value = 3):
    """
    Placeholder for recursive feature elimination (RFE); no selection is performed yet.
        Parameters:
            file: pandas.DataFrame
                Input DataFrame intended for RFE based feature selection. Currently unused.
            step_value: int, optional, default=1
                Intended number (if >= 1) or fraction (if within (0.0, 1.0)) of features to remove
                at each iteration. Currently unused.
            max_features_value: int, optional, default=3
                Presumably the number of features to keep - TODO confirm. Currently unused.
        Returns:
            None
                The function body is not implemented and always returns None.
    """
    return None

In [210]:
def test_train_split(file, test_size_value = 0.25, train_size_value = None, random_state = None):
    """
    Split Input DataFrame into Training and Testing Data
        Parameters:
            file: pandas.DataFrame
                Input DataFrame containing descriptors and target data.
            test_size_value : float, int, default=0.25
                If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the
                test split. If int, represents the absolute number of test samples. If None, the value is set to the
                complement of the train size.
            train_size_value : float, int, or None, default None
                If float, should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the
                train split. If int, represents the absolute number of train samples. If None, the value is automatically
                set to the complement of the test size.
            random_state : int or None, default None
                Seed forwarded to train_test_split; pass an int to make the split reproducible.
        Returns:
            train: pandas.DataFrame
                DataFrame containing training data, re-indexed from 0.
            test: pandas.DataFrame
                DataFrame containing testing data, re-indexed from 0.
    """
    descriptors, target = descriptor_target_split(file)
    X_train, X_test, y_train, y_test = train_test_split(
        descriptors, target,
        test_size=test_size_value, train_size=train_size_value, random_state=random_state,
    )
    # Work on copies so adding the Target column never writes into a slice of the caller's data.
    train = descriptor_target_join(X_train.copy(),y_train)
    test = descriptor_target_join(X_test.copy(),y_test)
    # drop=True discards the shuffled index directly; resetting and then dropping
    # a column called 'index' fails for a named index and would delete a genuine
    # data column that happens to be called 'index'.
    train = train.reset_index(drop = True)
    test = test.reset_index(drop = True)
    return train,test