# Data Preprocessing

This file is split for preprocessing for each of the datasets used for the project.
In essence, raw files (`/datasets/raw`) get processed and put to (`/datasets/processed`), so that they can be freely loaded and used for the project.

**Requirements**
- for small datasets (labelled with [S]) there shuold be at most $10$ variables
- for large datasets (labelled with [L]) there should be more than $10$ variables.
- for ALL datasets the number of observations MUST be larger than the number of variables
- ALL datasets MUST have a clear binary target/response variable that should only take values $0$ or $1$
- ALL datasets MUST have less than $10$% missing data per variable
- for ALL datasets collinear variables SHOULD be removed
- EACH preprocessed dataset must be saved in a form of a single `.csv` file under `datasets/processed` for further use. Do not keep the dataframe index to minimize space used.

# Common tools

In [35]:
import os
from pathlib import Path
import pandas as pd
from scipy.io import arff

# use this to as base path for input datasets
DATASET_PATH = Path(os.getcwd()).parent / "datasets" 
RAW_DATASET_PATH = DATASET_PATH / "raw"
PROCESSED_DATASET_PATH = DATASET_PATH / "processed"

In [36]:
from statsmodels.stats.outliers_influence import variance_inflation_factor    


def calculate_vif_(X, thresh=5.0):
    X = X.assign(const=1)  # faster than add_constant from statsmodels
    variables = list(range(X.shape[1]))
    dropped = True
    while dropped:
        dropped = False
        vif = [variance_inflation_factor(X.iloc[:, variables].values, ix)
               for ix in range(X.iloc[:, variables].shape[1])]
        vif = vif[:-1]  # don't let the constant be removed in the loop.
        maxloc = vif.index(max(vif))
        if max(vif) > thresh:
            print('dropping \'' + X.iloc[:, variables].columns[maxloc] +
                  '\' at index: ' + str(maxloc))
            del variables[maxloc]
            dropped = True

    print('Remaining variables:')
    print(X.columns[variables[:-1]])
    return X.iloc[:, variables[:-1]]

# [S1] Diabetes

In [37]:
file = RAW_DATASET_PATH / "diabetes" / "diabetes.arff"

data, meta = arff.loadarff(file)

s1 = pd.DataFrame(data)
s1.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,b'tested_positive'
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,b'tested_negative'
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,b'tested_positive'
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,b'tested_negative'
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,b'tested_positive'


In [38]:
s1["class"] = s1["class"].str.decode("utf-8")
s1["class"] = s1["class"].apply(lambda x: 1 if x == "tested_positive" else 0)
s1.head()

Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1


In [39]:
calculate_vif_(s1, thresh=10) # no need to drop any columns

Remaining variables:
Index(['preg', 'plas', 'pres', 'skin', 'insu', 'mass', 'pedi', 'age', 'class'], dtype='object')


Unnamed: 0,preg,plas,pres,skin,insu,mass,pedi,age,class
0,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1
1,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0
2,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0
4,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1
...,...,...,...,...,...,...,...,...,...
763,10.0,101.0,76.0,48.0,180.0,32.9,0.171,63.0,0
764,2.0,122.0,70.0,27.0,0.0,36.8,0.340,27.0,0
765,5.0,121.0,72.0,23.0,112.0,26.2,0.245,30.0,0
766,1.0,126.0,60.0,0.0,0.0,30.1,0.349,47.0,1


In [40]:
# save
s1.to_csv(PROCESSED_DATASET_PATH / "diabetes.csv", index=False)

# [S2] Tour & Travels Customer Churn

# [S3] Seeds

# [L1] League of Legend Challenger Rank Game

# [L2] Jungle chess

# [L3] Patient Survival Prediction

# [L4] Hotel Booking Cancellation

# [L5] Ionosphere

# L[6] Sonar (Rock vs Mine)