# **Libraries**

In [327]:
from abc import ABC, abstractclassmethod, abstractmethod

import numpy as np
import pandas as pd

# **Data processing function**

## **Dev**

In [328]:
class DataProcessing(ABC):
  '''Abstract base class for one data-processing step.

  Concrete steps override apply; Pipeline.run calls step.apply(df) on
  each registered step in turn.
  '''

  # abstractmethod (not the deprecated abstractclassmethod) keeps apply an
  # ordinary instance method, matching how every subclass overrides it.
  @abstractmethod
  def apply(self, df):
    '''Process df and return the result.

    Input:
       df : the object to process (a pandas DataFrame in this notebook)
    Output:
       the processed object
    '''

class DropMissingValue(DataProcessing):
  '''Processing step that removes rows containing missing values.'''

  def apply(self, df):
    '''Return df without the rows that hold at least one missing value.'''
    complete_rows = df.dropna(axis=0, how='any')
    return complete_rows

class ImputeMissingValue(DataProcessing):
  '''Processing step that fills missing values, one column at a time.'''

  def __init__(self, imputation_list):
    '''Input:
       imputation_list : [(column_name1, option),
                          (column_name2, option),
                          (column_name3, option), ...]
       Available options: mean, median and mode'''
    self.imputation_list = imputation_list

  def apply(self, df):
    '''Return a copy of df whose missing values are filled as configured.

    Input:
       df : DataFrame containing every column named in imputation_list
    Output:
       a new DataFrame; df itself is left untouched
    Raises:
       ValueError : if an option is not mean, median or mode
    '''
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()
    for (column_name, option) in self.imputation_list:
      column = result[column_name]
      if option == 'mode':
        modes = column.mode()
        if modes.empty:
          # No non-missing value exists, so there is nothing to fill with.
          continue
        # mode() may return several values; the first one is used.
        fill_value = modes.iloc[0]
      elif option == 'mean':
        fill_value = column.mean()
      elif option == 'median':
        fill_value = column.median()
      else:
        raise ValueError(
            "Unknown imputation option %r for column %r; "
            "expected 'mean', 'median' or 'mode'" % (option, column_name))
      result[column_name] = column.fillna(fill_value)
    return result

class DropColumn(DataProcessing):
  '''Processing step that removes a fixed set of columns.'''

  def __init__(self, column_list):
    '''Input:
    column_list : [column_name1,
                   column_name2,
                   column_name3,
                   ...]
    The names of the columns to remove.'''
    self.column_list = column_list

  def apply(self, df):
    '''Return df without the columns named in column_list.'''
    unwanted = self.column_list
    return df.drop(columns=unwanted)

class DropDuplicate(DataProcessing):
  '''Processing step that removes repeated rows.'''

  def apply(self, df):
    '''Return df keeping only the first occurrence of each distinct row.'''
    unique_rows = df.drop_duplicates(keep='first')
    return unique_rows

class ImputeOutlier(DataProcessing):
  '''Processing step that replaces IQR outliers with the column mean or median.'''

  def __init__(self, column_list):
    '''Input:
       column_list : [(column_name1, option),
                      (column_name2, option),
                      (column_name3, option),
                      ...]
       option: mean or median
    '''
    self.column_list = column_list

  @staticmethod
  def _iqr_bounds(series):
    '''Return (lower, upper) fences: q1 - 1.5 * IQR and q3 + 1.5 * IQR.'''
    q1 = series.quantile(0.25)
    q3 = series.quantile(0.75)
    IQR = q3 - q1
    return q1 - 1.5 * IQR, q3 + 1.5 * IQR

  def apply(self, df):
    '''Return a copy of df with outliers replaced as configured.

    A value is an outlier when it lies strictly below the lower fence or
    strictly above the upper fence. Values exactly on a fence and missing
    values are left as they are, so a column whose IQR is 0 keeps its
    typical values.

    Input:
       df : DataFrame containing every column named in column_list
    Output:
       a new DataFrame; df itself is left untouched
    Raises:
       ValueError : if an option is not mean or median
    '''
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()
    for (column_name, option) in self.column_list:
      column = result[column_name]
      # The statistic is taken over the whole column, outliers included.
      if option == 'mean':
        replacement = column.mean()
      elif option == 'median':
        replacement = column.median()
      else:
        raise ValueError(
            "Unknown outlier option %r for column %r; "
            "expected 'mean' or 'median'" % (option, column_name))
      lower_bound, upper_bound = self._iqr_bounds(column)
      # Comparisons with NaN are False, so missing values are never flagged.
      is_outlier = (column < lower_bound) | (column > upper_bound)
      # mask() returns a new Series, avoiding an in-place write of a float
      # into an integer column.
      result[column_name] = column.mask(is_outlier, replacement)
    return result

class FloorCapOutlier(DataProcessing):
  '''Processing step that clamps each column to a pair of its own quantiles.'''

  def __init__(self, column_list):
    '''Input:
       column_list : [(column_name1, 0.1, 0.9),
                      (column_name2, 0.2, 0.8),
                      (column_name3, 0.05, 0.95),
                      ...]
      The structure: (column_name, lower, upper)
      0 < lower < upper < 1
    '''
    self.column_list = column_list

  def apply(self, df):
    '''Return a copy of df with each configured column floored and capped.

    Values below the lower quantile are raised to it and values above the
    upper quantile are lowered to it.

    Input:
       df : DataFrame containing every column named in column_list
    Output:
       a new DataFrame; df itself is left untouched
    Raises:
       ValueError : if lower is greater than upper for a column
    '''
    # Work on a copy so the caller's frame is not modified as a side effect.
    result = df.copy()
    for (column_name, lower, upper) in self.column_list:
      if lower > upper:
        raise ValueError(
            "lower quantile %r exceeds upper quantile %r for column %r"
            % (lower, upper, column_name))
      column = result[column_name]
      # Both bounds come from the unmodified column.
      lower_bound = column.quantile(lower)
      upper_bound = column.quantile(upper)
      # clip() returns a new Series, avoiding an in-place write of a float
      # bound into an integer column.
      result[column_name] = column.clip(lower=lower_bound, upper=upper_bound)
    return result

class ChangeDataType(DataProcessing):
  '''Processing step that casts selected columns to new data types.'''

  def __init__(self, column_dictionary):
    '''Input:
    column_dictionary : {column_name1 : datatype1,
                         column_name2 : datatype2,
                         ...}
    Maps each column to the data type it should be cast to.'''
    self.column_dictionary = column_dictionary

  def apply(self, df):
    '''Return df with the columns in column_dictionary cast via astype.'''
    dtype_by_column = self.column_dictionary
    converted = df.astype(dtype_by_column)
    return converted

class Pipeline:
  '''Ordered collection of processing steps applied one after another.'''

  def __init__(self):
    '''Start with no registered steps.'''
    self.steps = list()

  def add(self, function):
    '''Register a step (any object with an apply method) at the end.'''
    self.steps += [function]

  def run(self, df):
    '''Feed df through every step in registration order.

    Each step receives the previous step's return value; the last return
    value is the result. With no steps, df is returned as-is.
    '''
    current = df
    for stage in self.steps:
      current = stage.apply(current)
    return current

## **Test**

### Impute missing value

In [329]:
# Fixture: ten complete rows plus an 11th row that is missing in every column.
data = {'cat1': ['Red'] * 6 + ['Blue'] * 4 + [None],
        'num1': list(range(1, 11)) + [None],
        'num2': [2, 2, 9, 4, 6, 5, 3, 10, 10, 12, None]}
df = pd.DataFrame(data)
print(df, end='\n\n')
# Reference values taken straight from the columns, for visual comparison
# with the imputed 11th row printed below.
# NOTE: num1's mean and median are both 5.5, so this check cannot tell the
# 'median' option apart from the 'mean' option.
print('Expected value from the 11th row:',
      'cat1: ' + df['cat1'].mode()[0],
      'num1: ' + str(df['num1'].median()),
      'num2: ' + str(df['num2'].mean()),
      sep='\n')
print()
pipeline = Pipeline()
pipeline.add(ImputeMissingValue([('cat1', 'mode'), ('num1', 'median'), ('num2', 'mean')]))
res = pipeline.run(df)
print(res)

    cat1  num1  num2
0    Red   1.0   2.0
1    Red   2.0   2.0
2    Red   3.0   9.0
3    Red   4.0   4.0
4    Red   5.0   6.0
5    Red   6.0   5.0
6   Blue   7.0   3.0
7   Blue   8.0  10.0
8   Blue   9.0  10.0
9   Blue  10.0  12.0
10  None   NaN   NaN

Expected value from the 11th row:
cat1: Red
num1: 5.5
num2: 6.3

    cat1  num1  num2
0    Red   1.0   2.0
1    Red   2.0   2.0
2    Red   3.0   9.0
3    Red   4.0   4.0
4    Red   5.0   6.0
5    Red   6.0   5.0
6   Blue   7.0   3.0
7   Blue   8.0  10.0
8   Blue   9.0  10.0
9   Blue  10.0  12.0
10   Red   5.5   6.3


### Drop missing value and drop column

In [330]:
# Scenario: the 11th row is missing in every column, so DropMissingValue
# removes it; DropColumn then discards cat1 and num1, leaving only num2.
data = {'cat1': ['Red'] * 6 + ['Blue'] * 4 + [None],
        'num1': list(range(1, 11)) + [None],
        'num2': [2, 2, 9, 4, 6, 5, 3, 10, 10, 12, None]}
df = pd.DataFrame(data)
print('Before: ')
print(df, end='\n\n')
print('After: ')
# Steps run in the order they are added.
pipeline = Pipeline()
pipeline.add(DropMissingValue())
pipeline.add(DropColumn(['cat1', 'num1']))
res = pipeline.run(df)
print(res)

Before: 
    cat1  num1  num2
0    Red   1.0   2.0
1    Red   2.0   2.0
2    Red   3.0   9.0
3    Red   4.0   4.0
4    Red   5.0   6.0
5    Red   6.0   5.0
6   Blue   7.0   3.0
7   Blue   8.0  10.0
8   Blue   9.0  10.0
9   Blue  10.0  12.0
10  None   NaN   NaN

After: 
   num2
0   2.0
1   2.0
2   9.0
3   4.0
4   6.0
5   5.0
6   3.0
7  10.0
8  10.0
9  12.0


### Drop duplicate

In [331]:
# Fixture: six identical (Red, Black) rows followed by one (Blue, White) row.
data = {'cat1': ['Red'] * 6 + ['Blue'],
        'cat2': ['Black'] * 6 + ['White']}
df = pd.DataFrame(data)
print('Before: ')
print(df, end='\n\n')
print('After: ')
pipeline = Pipeline()
pipeline.add(DropDuplicate())
res = pipeline.run(df)
print(res)

Before: 
   cat1   cat2
0   Red  Black
1   Red  Black
2   Red  Black
3   Red  Black
4   Red  Black
5   Red  Black
6  Blue  White

After: 
   cat1   cat2
0   Red  Black
6  Blue  White


### Impute outlier

In [332]:
# Both columns hold the same values, with 101 as the single outlier.
# num1's 101 is to be replaced by the mean, num2's 101 by the median.
data = {'num1': [15, 101, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9],
        'num2': [15, 101, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9]}
df = pd.DataFrame(data)
print('Before: ')
print(df, end='\n\n')
# Reference statistics, printed before the pipeline runs.
print('Mean: ' + str(df.num1.mean()),
      'Median: ' + str(df.num1.median()),
      sep='\n', end='\n\n')
print('After:')
pipeline = Pipeline()
pipeline.add(ImputeOutlier([('num1', 'mean'), ('num2', 'median')]))
res = pipeline.run(df)
print(res)

Before: 
    num1  num2
0     15    15
1    101   101
2     18    18
3      7     7
4     13    13
5     16    16
6     11    11
7     21    21
8      5     5
9     15    15
10    10    10
11     9     9

Mean: 20.083333333333332
Median: 14.0

After:
         num1  num2
0   15.000000    15
1   20.083333    14
2   18.000000    18
3    7.000000     7
4   13.000000    13
5   16.000000    16
6   11.000000    11
7   21.000000    21
8    5.000000     5
9   15.000000    15
10  10.000000    10
11   9.000000     9


### Floor and Cap the Outlier

In [333]:
# num1 and num2 share the same values (101 is the extreme one); num3 is 0..11.
# Each column is clamped to its own (lower, upper) quantile pair.
df = pd.DataFrame({
    'num1': [15, 101, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9],
    'num2': [15, 101, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9],
    'num3': list(range(12)),
})
pipeline = Pipeline()
pipeline.add(FloorCapOutlier([
    ('num1', 0.1, 0.9),
    ('num2', 0.25, 0.75),
    ('num3', 0.1, 0.9),
]))
# Last expression of the cell, so the resulting frame is displayed.
pipeline.run(df)

Unnamed: 0,num1,num2,num3
0,15.0,15.0,1.1
1,20.7,16.5,1.1
2,18.0,16.5,2.0
3,7.2,9.75,3.0
4,13.0,13.0,4.0
5,16.0,16.0,5.0
6,11.0,11.0,6.0
7,20.7,16.5,7.0
8,7.2,9.75,8.0
9,15.0,15.0,9.0


### Change Datatype

In [334]:
# Column C mixes floats, ints and numeric strings.
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': ['a', 'b', 'c', 'd', 'e'],
    'C': [1.1, '1.0', '1.3', 2, 5],
})

# Two casts in sequence: first A and C to float, then C to int (A is again
# requested as float).
# NOTE(review): presumably the float pass comes first because strings such as
# '1.3' cannot be cast straight to int - confirm.
pipeline = Pipeline()
pipeline.add(ChangeDataType({'A': float, 'C': float}))
pipeline.add(ChangeDataType({'A': float, 'C': int}))
res = pipeline.run(df)
# Last expression of the cell, so the resulting frame is displayed.
res

Unnamed: 0,A,B,C
0,1.0,a,1
1,2.0,b,1
2,3.0,c,1
3,4.0,d,2
4,5.0,e,5


## **Result**

| Function | Result |
| --- | --- |
| Drop columns | Success |
| Drop missing value | Success |
| Impute missing value with mean, median and mode | Success |
| Drop duplicate | Success |
| Impute outlier | Success |
| Floor and cap outlier | Success |
| Change data type | Success |

References:
* https://www.analyticsvidhya.com/blog/2021/05/detecting-and-treating-outliers-treating-the-odd-one-out/