# Python Functions created from scratch

Used in the "*California Housing - Github Code*" Python Notebook.

## Data Cleaning

In [None]:
# Remove missing values from the whole dataset

def remove_null(dataset):
    """Return a copy of ``dataset`` without the rows that contain missing values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input data. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Only the rows with no missing value in any column, re-indexed
        from zero.
    """
    # dropna selects rows by position, so rows that merely share an index
    # label with an incomplete row are kept (dropping by label would remove
    # them too when the index is not unique).
    complete_rows = dataset.dropna(axis=0, how="any")

    # Re-count the index from zero
    return complete_rows.reset_index(drop=True)

## Data Pre-processing

In [None]:
# Class to apply the three transformation methods to the features
# +++ Dummy variables are not transformed: a feature is treated as a dummy when its min = 0 and its max = 1 +++

class features_transformation():
    """Scale feature columns with parameters learned on the training set.

    ``scale_transform`` selects the method:

    * ``'standardization'`` -- ``(x - mean) / std`` (``np.std`` default, ddof=0)
    * ``'normalization'``   -- ``(x - min) / (max - min)``
    * ``'unit_length'``     -- ``x / sum(|x|)`` (L1 norm of the column)
    * any other value       -- no scaling

    Columns whose training minimum is 0 and maximum is 1 are treated as
    dummies and left unchanged. Columns that would require a division by
    zero (constant column, all-zero column) are divided by 1 instead, so
    the output never contains NaN/inf caused by the scaling itself.
    """

    def __init__(self, scale_transform):
        self.scale_transform = scale_transform

    @staticmethod
    def _column_masks(features):
        """Return boolean (1, n_columns) masks: (is_dummy, is_constant)."""
        col_min = np.min(features, axis=0)
        col_max = np.max(features, axis=0)
        is_dummy = np.asarray((col_min == 0.0) & (col_max == 1.0))
        is_constant = np.asarray(col_min == col_max)
        return is_dummy, is_constant

    @staticmethod
    def _safe_range(min_par, max_par):
        """Return ``max_par - min_par`` with zero ranges replaced by 1."""
        value_range = np.array(max_par - min_par, dtype=float)
        value_range[value_range == 0.0] = 1.0
        return value_range

    # Transform the features of the training set
    def fit(self, features):
        """Learn the scaling parameters from ``features`` and scale them.

        Parameters
        ----------
        features : pandas.DataFrame
            Training features (numeric columns).

        Returns
        -------
        pandas.DataFrame
            Scaled features with the original column names and a fresh
            zero-based index. When no scaling method is selected, the
            features are returned unscaled as a ``numpy.matrix`` (legacy
            behaviour, kept so existing callers are not affected).
        """
        # Save the column names of the DataFrame
        features_col = features.columns

        # Transform the DataFrame into a matrix
        features = np.matrix(features)

        # Apply standardization
        if self.scale_transform == 'standardization':
            is_dummy, is_constant = self._column_masks(features)

            mean_par = np.mean(features, axis=0)
            std_par = np.std(features, axis=0)

            # Do NOT standardize the dummies
            mean_par[is_dummy] = 0.0
            std_par[is_dummy] = 1.0

            # A constant column has zero spread: divide by 1, not by 0
            std_par[is_constant] = 1.0

            new_features = np.divide((features - mean_par), std_par)
            new_features = pd.DataFrame(new_features, columns=features_col)

            # Save standardization parameters (mean, standard deviation)
            # of the training features
            self.mean_par = mean_par
            self.std_par = std_par

        # Apply normalization
        elif self.scale_transform == 'normalization':
            # Dummies already have min = 0 and max = 1, so the formula
            # leaves them unchanged without any special case.
            min_par = np.min(features, axis=0)
            max_par = np.max(features, axis=0)

            value_range = self._safe_range(min_par, max_par)
            new_features = np.divide((features - min_par), value_range)
            new_features = pd.DataFrame(new_features, columns=features_col)

            # Save normalization parameters (min, max) of the training features
            self.min_par = min_par
            self.max_par = max_par

        # Apply unit-length scaling
        elif self.scale_transform == 'unit_length':
            is_dummy, _ = self._column_masks(features)

            # L1 norm of every column, as a plain (1, n_columns) array
            unit_norm = np.abs(np.asarray(features, dtype=float)).sum(
                axis=0, keepdims=True
            )

            # Do NOT scale the dummies
            unit_norm[is_dummy] = 1.0

            # An all-zero column has a zero norm: divide by 1, not by 0
            unit_norm[unit_norm == 0.0] = 1.0

            new_features = np.divide(features, unit_norm)
            new_features = pd.DataFrame(new_features, columns=features_col)

            # Save unit-length scaling parameters (norm) of the training features
            self.unit_norm = unit_norm

        # No transformation of the training features is required
        else:
            new_features = features

        return new_features

    # Transform the features of the test set with previous transformation parameters
    def test_transform(self, test_features):
        """Scale ``test_features`` with the parameters saved by ``fit``.

        Parameters
        ----------
        test_features : pandas.DataFrame
            Test features with the same columns, in the same order, as the
            training features given to ``fit``.

        Returns
        -------
        pandas.DataFrame
            Scaled test features; a ``numpy.matrix`` of the unscaled values
            when no scaling method is selected (legacy behaviour).

        Raises
        ------
        AttributeError
            If a scaling method is selected but ``fit`` has not been called.
        """
        test_features_col = test_features.columns

        # Transform the DataFrame into a matrix
        test_features = np.matrix(test_features)

        # Apply standardization
        if self.scale_transform == 'standardization':
            new_test_features = np.divide(
                (test_features - self.mean_par), self.std_par
            )
            new_test_features = pd.DataFrame(
                new_test_features, columns=test_features_col
            )

        # Apply normalization
        elif self.scale_transform == 'normalization':
            value_range = self._safe_range(self.min_par, self.max_par)
            new_test_features = np.divide(
                (test_features - self.min_par), value_range
            )
            new_test_features = pd.DataFrame(
                new_test_features, columns=test_features_col
            )

        # Apply unit-length scaling
        elif self.scale_transform == 'unit_length':
            new_test_features = np.divide(test_features, self.unit_norm)
            new_test_features = pd.DataFrame(
                new_test_features, columns=test_features_col
            )

        # No transformation of the testing features is required
        else:
            new_test_features = test_features

        return new_test_features

## Training/Test split

In [None]:
# Split the whole dataset in training/test set

def train_test(features, labels, test_set_proportion, seed):
    """Randomly split features and labels into a training and a test set.

    Parameters
    ----------
    features : pandas.DataFrame
        One row per data point.
    labels : pandas.DataFrame or pandas.Series
        Labels aligned row-by-row with ``features``.
    test_set_proportion : float
        Fraction of the rows assigned to the test set. The training set
        receives ``int(n_rows * (1 - test_set_proportion))`` rows.
    seed : int
        Seed for the random generator, for reproducible splits.

    Returns
    -------
    tuple
        ``(features_train, features_test, labels_train, labels_test)``,
        each re-indexed from zero. Training rows come in sampled order,
        test rows in their original order.
    """
    # Set a seed for the reproducibility of the results
    rd.seed(a=seed, version=2)

    n_rows = features.shape[0]
    train_size = int(n_rows * (1 - test_set_proportion))

    # Sample row POSITIONS rather than index labels: the rows are selected
    # with .iloc, which is positional, so this works for any index. With a
    # default 0..n-1 index, positions and labels coincide and the split is
    # the same as sampling the labels.
    train_positions = rd.sample(list(range(n_rows)), train_size)

    # Every remaining position goes to the test set, in original order
    in_train = set(train_positions)
    test_positions = [pos for pos in range(n_rows) if pos not in in_train]

    features_train = features.iloc[train_positions, :]
    features_test = features.iloc[test_positions, :]

    # Row-only indexing so that labels may be a DataFrame or a Series
    labels_train = labels.iloc[train_positions]
    labels_test = labels.iloc[test_positions]

    # Re-counting the indexes of the variables from zero
    features_train = features_train.reset_index(drop=True)
    labels_train = labels_train.reset_index(drop=True)

    features_test = features_test.reset_index(drop=True)
    labels_test = labels_test.reset_index(drop=True)

    return (features_train, features_test, labels_train, labels_test)