In [1]:
from io import StringIO
import re

import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

In [2]:
def findBinaryColumns(df):
    """ Report and collect the columns of a DataFrame that hold fewer than three distinct values

    Every qualifying column is printed together with its distinct values, and
    the complete list of qualifying labels is printed once at the end.

    Args:
        df (DataFrame): Pandas DataFrame to be inspected
    Returns:
        (list): Labels of the columns with at most two distinct values

    """
    binary_labels = []
    for label in df.columns:
        # Columns with three or more distinct values are not binary
        if len(df[label].unique()) >= 3:
            continue
        print(label, df[label].unique())
        binary_labels.append(label)
    print(binary_labels)
    return binary_labels

In [3]:
def getNoBinaryOut(df, dL):
    """ Receive a DataFrame and a list with binary and/or non-categorical
    columns and return which columns must be one-hot encoded

    The feature columns are taken to be labelled 0..n-1 (an "output" column,
    when present, is not counted). The result is returned in ascending order
    so that it is deterministic; building it from a set difference gave an
    order that depended on the set's internal layout.

    Args:
        df (DataFrame): Pandas DataFrame to be analyzed
        dL (list): List with columns that must be ignored
    Returns:
        (list): Sorted list of columns that must be one-hot encoded
    """
    # Number of feature columns, not counting the "output" column
    n_features = df.shape[1] - int("output" in list(df.columns))

    ignored = set(dL)
    ignored.add("output")

    return [c for c in range(n_features) if c not in ignored]

In [4]:
def renameReorderNonOH(df):
    """ Relabel the feature columns as 0..n-1 and place the "output" column last,
    must be used with non one-hot encoded DataFrames

    When no "output" column exists a warning is printed and the DataFrame is
    returned untouched (its columns are not relabelled).

    Args:
        df (DataFrame): Pandas DataFrame to be processed, must not be one-hot encoded
    Returns:
        (DataFrame): Pandas DataFrame with columns organized by sequential numbers
    """
    if "output" not in list(df.columns):
        print("Warning: No output was found")
        return df

    target = df["output"]
    features = df.drop(columns=["output"])

    # Replace the feature labels with a plain numeric sequence
    features.columns = [*range(features.shape[1])]

    return pd.concat([features, target], axis=1)

In [5]:
def renameReorderOH(df):
    """ Convert the column names to strings and reorder them by feature number,
    must be used with one-hot encoded DataFrames

    With an "output" column: the feature names are stringified, ordered by the
    numeric prefix found before the first "_" (so "2_x" comes before "10_x"),
    and "output" is placed last.

    Without an "output" column: a warning is printed, the names are left-padded
    with zeros until the prefix is four characters wide and the columns are
    sorted by the padded names. The DataFrame received is not modified.

    Args:
        df (DataFrame): Pandas DataFrame to be processed, must be one-hot encoded
    Returns:
        (DataFrame): Pandas DataFrame with columns organized by sequential numbers
    """
    def _numeric_prefix_key(name):
        # "12_red" -> (0, 12, "12_red"); names without a numeric prefix are
        # placed after the numbered ones, in alphabetical order
        prefix = name.split("_")[0]
        if prefix.isdigit():
            return (0, int(prefix), name)
        return (1, 0, name)

    if "output" in list(df.columns):
        df_d = df.drop(columns=["output"])
        df_o = df["output"]

        # All data columns must be in string format to allow reorder
        df_d.columns = [str(c) for c in df_d.columns]

        # Reorder data columns numerically; a plain string sort would place
        # "10_x" before "2_x" once a dataset has ten or more features
        df_d = df_d.reindex(sorted(df_d.columns, key=_numeric_prefix_key), axis=1)

        # Output DF
        df = pd.concat([df_d, df_o], axis=1)

    else:
        print("Warning: No output was found")

        # Work on a copy so the caller's DataFrame keeps its column names
        df = df.copy()

        # Zero-pad the prefix (the part before the first "_") to four
        # characters so that a plain string sort matches the numeric order
        # even when the feature number is higher than 9
        df.columns = ["0"*(4-len(str(c).split("_")[0]))+str(c) for c in list(df.columns)]

        df = df.reindex(sorted(df.columns), axis=1)

    return df

In [6]:
# Balance Scale - all categorical

In [7]:
# Read raw data
dfbs = pd.read_csv("./rawData/balance-scale.data", header=None)

In [8]:
# Define output column
dfbs[0] = dfbs[0].map({"L":0, "R":1, "B":2})
dfbs.rename(columns={0: "output"}, inplace=True)

In [9]:
# Rename and Reorder columns
dfbs = renameReorderNonOH(dfbs)

In [10]:
# Find binary columns
dL = findBinaryColumns(dfbs)

[]


In [11]:
# Save non-OH dataset
dfbs.to_csv("./data/BalanceScale.csv", index=False)

In [12]:
# Generate OH version of DS
OHdfbs = pd.get_dummies(dfbs, columns=getNoBinaryOut(dfbs, dL))

In [13]:
# Rename and Reorder columns
OHdfbs = renameReorderOH(OHdfbs)

In [14]:
# Save OH dataset
OHdfbs.to_csv("./data/OH_BalanceScale.csv", index=False)

In [15]:
###########################################################################

In [16]:
# Car Evaluation - all categorical

In [17]:
# Read raw data
dfce = pd.read_csv("./rawData/car.data", header=None)

In [18]:
# Define output column
dfce[6] = dfce[6].map({"unacc":0, "acc":1, "vgood":2, "good": 3})
dfce.rename(columns={6: "output"}, inplace=True)

In [19]:
# Rename and Reorder columns
dfce = renameReorderNonOH(dfce)

In [20]:
# Find binary columns
dL = findBinaryColumns(dfce)

[]


In [21]:
# Save non-OH dataset
dfce.to_csv("./data/CarEvaluation.csv", index=False)

In [22]:
# Generate OH version of DS
OHdfce = pd.get_dummies(dfce, columns=getNoBinaryOut(dfce, dL))

In [23]:
# Rename and Reorder columns
OHdfce = renameReorderOH(OHdfce)

In [24]:
# Save OH dataset
OHdfce.to_csv("./data/OH_CarEvaluation.csv", index=False)

In [25]:
###########################################################################

In [26]:
# Hayes-Roth - all categorical

In [27]:
# Read raw data
dfhr = pd.read_csv("./rawData/hayes-roth.data", header=None).drop(columns=[0])

In [28]:
# Define output column
dfhr.rename(columns={5: "output"}, inplace=True)

In [29]:
# Rename and Reorder columns
dfhr = renameReorderNonOH(dfhr)

In [30]:
# Find binary columns
dL = findBinaryColumns(dfhr)

[]


In [31]:
# Save non-OH dataset
dfhr.to_csv("./data/HayesRoth.csv", index=False)

In [32]:
# Generate OH version of DS
OHdfhr = pd.get_dummies(dfhr, columns=getNoBinaryOut(dfhr, dL))

In [33]:
# Rename and Reorder columns
OHdfhr = renameReorderOH(OHdfhr)

In [34]:
# Save OH dataset
OHdfhr.to_csv("./data/OH_HayesRoth.csv", index=False)

In [35]:
###########################################################################

In [36]:
# Chess - all categorical

In [37]:
# Read raw data
dfchess = pd.read_csv("./rawData/krkopt.data", header=None)

In [38]:
# Define output column
dfchess[6] = dfchess[6].map({"draw":0, "zero":1, "one": 1, "two":1, "three": 1, "four": 1, "five": 1, "six": 1, 
                             "seven": 1, "eight": 1, "nine": 1, "ten": 1, "eleven": 1, "twelve": 1, "thirteen": 1, 
                            "fourteen": 1, "fifteen": 1, "sixteen": 1})
dfchess.rename(columns={6: "output"}, inplace=True)

In [39]:
# Rename and Reorder columns
dfchess = renameReorderNonOH(dfchess)

In [40]:
# Find binary columns
dL = findBinaryColumns(dfchess)

output [0 1]
['output']


In [41]:
# Save non-OH dataset
dfchess.to_csv("./data/Chess.csv", index=False)

In [42]:
# Generate OH version of DS
OHdfchess = pd.get_dummies(dfchess, columns=getNoBinaryOut(dfchess, dL))

In [43]:
# Rename and Reorder columns
OHdfchess = renameReorderOH(OHdfchess)

In [44]:
# Save OH dataset
OHdfchess.to_csv("./data/OH_Chess.csv", index=False)

In [45]:
###########################################################################

In [46]:
# Lymphography - all categorical

In [47]:
# Read raw data
dflym = pd.read_csv("./rawData/lymphography.data", header=None)

In [48]:
# Define output column
dflym.rename(columns={0: "output"}, inplace=True)

In [49]:
# Rename and Reorder columns
dflym = renameReorderNonOH(dflym)

In [50]:
# Find binary columns
dL = findBinaryColumns(dflym)

1 [2 1]
2 [1 2]
3 [1 2]
4 [1 2]
5 [1 2]
6 [1 2]
7 [2 1]
15 [1 2]
16 [2 1]
[1, 2, 3, 4, 5, 6, 7, 15, 16]


In [51]:
# Adjust binary variables to be 0 or 1
dflym[1] = dflym[1].map({1:0, 2:1})
dflym[2] = dflym[2].map({1:0, 2:1})
dflym[3] = dflym[3].map({1:0, 2:1})
dflym[4] = dflym[4].map({1:0, 2:1})
dflym[5] = dflym[5].map({1:0, 2:1})
dflym[6] = dflym[6].map({1:0, 2:1})
dflym[7] = dflym[7].map({1:0, 2:1})
dflym[15] = dflym[15].map({1:0, 2:1})
dflym[16] = dflym[16].map({1:0, 2:1})

In [52]:
# Find binary columns
dL = findBinaryColumns(dflym)

1 [1 0]
2 [0 1]
3 [0 1]
4 [0 1]
5 [0 1]
6 [0 1]
7 [1 0]
15 [0 1]
16 [1 0]
[1, 2, 3, 4, 5, 6, 7, 15, 16]


In [53]:
# Save non-OH dataset
dflym.to_csv("./data/Lymphography.csv", index=False)

In [54]:
# Generate OH version of DS
OHdflym = pd.get_dummies(dflym, columns=getNoBinaryOut(dflym, dL))

In [55]:
# Rename and Reorder columns
OHdflym = renameReorderOH(OHdflym)

In [56]:
# Save OH dataset
OHdflym.to_csv("./data/OH_Lymphography.csv", index=False)

In [57]:
###########################################################################

In [58]:
# Nursery - all categorical

In [59]:
# Read raw data
dfnur = pd.read_csv("./rawData/nursery.data", header=None)

In [60]:
# Define output column
# Class codes are kept contiguous (0-4); mapping "very_recom" to 5 would leave code 4 unused
dfnur[8] = dfnur[8].map({"not_recom":0, "recommend":1, "priority":2, "very_recom": 4, "spec_prior": 3})
dfnur.rename(columns={8: "output"}, inplace=True)

In [61]:
# Rename and Reorder columns
dfnur = renameReorderNonOH(dfnur)

In [62]:
# Find binary columns
dL = findBinaryColumns(dfnur)

5 ['convenient' 'inconv']
[5]


In [63]:
# Adjust column to be binary
dfnur[5] = dfnur[5].map({"convenient":0, "inconv":1})

In [64]:
dL = findBinaryColumns(dfnur)

5 [0 1]
[5]


In [65]:
# Save non-OH dataset
dfnur.to_csv("./data/Nursery.csv", index=False)

In [66]:
# Generate OH version of DS
OHdfnur = pd.get_dummies(dfnur, columns=getNoBinaryOut(dfnur, dL))

In [67]:
# Rename and Reorder columns
OHdfnur = renameReorderOH(OHdfnur)

In [68]:
# Save OH dataset
OHdfnur.to_csv("./data/OH_Nursery.csv", index=False)

In [69]:
###########################################################################

In [70]:
# Soybean (small) - all categorical

In [71]:
# Read raw data
dfsoy = pd.read_csv("./rawData/soybean-small.data", header=None)

In [72]:
# Define output column
dfsoy[35] = dfsoy[35].map({"D1":0, "D2":1, "D3":2, "D4":3})
dfsoy.rename(columns={35: "output"}, inplace=True)

In [73]:
# Rename and Reorder columns
dfsoy = renameReorderNonOH(dfsoy)

In [74]:
# Find binary columns
dL = findBinaryColumns(dfsoy)

1 [0 1]
4 [1 0]
7 [1 2]
8 [0 1]
10 [1]
11 [1 0]
12 [0]
13 [2]
14 [2]
15 [0]
16 [0]
17 [0]
18 [1]
19 [0 1]
22 [1 0]
23 [1 0]
24 [0 1]
25 [0 2]
26 [0 1]
27 [0 3]
28 [4]
29 [0]
30 [0]
31 [0]
32 [0]
33 [0]
34 [0 1]
[1, 4, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]


In [75]:
# Adjust binary columns to have only 0 or/and 1
dfsoy[28] = dfsoy[28].map({4:1})
dfsoy[27] = dfsoy[27].map({0:0, 3:1})
dfsoy[25] = dfsoy[25].map({0:0, 2:1})
dfsoy[14] = dfsoy[14].map({2:1})
dfsoy[13] = dfsoy[13].map({2:1})
dfsoy[7] = dfsoy[7].map({1:0, 2:1})

In [76]:
dL = findBinaryColumns(dfsoy)

1 [0 1]
4 [1 0]
7 [0 1]
8 [0 1]
10 [1]
11 [1 0]
12 [0]
13 [1]
14 [1]
15 [0]
16 [0]
17 [0]
18 [1]
19 [0 1]
22 [1 0]
23 [1 0]
24 [0 1]
25 [0 1]
26 [0 1]
27 [0 1]
28 [1]
29 [0]
30 [0]
31 [0]
32 [0]
33 [0]
34 [0 1]
[1, 4, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]


In [77]:
# Save non-OH dataset
dfsoy.to_csv("./data/SoybeanSmall.csv", index=False)

In [78]:
# Generate OH version of DS
OHdfsoy = pd.get_dummies(dfsoy, columns=getNoBinaryOut(dfsoy, dL))

In [79]:
# Rename and Reorder columns
OHdfsoy = renameReorderOH(OHdfsoy)

In [80]:
# Save OH dataset
OHdfsoy.to_csv("./data/OH_SoybeanSmall.csv", index=False)

In [81]:
###########################################################################

In [82]:
# Tic-Tac-Toe - all categorical

In [83]:
# Read raw data
dfttt = pd.read_csv("./rawData/tic-tac-toe.data", header=None)

In [84]:
# Define output column
dfttt[9] = dfttt[9].map({"negative":0, "positive":1})
dfttt.rename(columns={9: "output"}, inplace=True)

In [85]:
# Rename and Reorder columns
dfttt = renameReorderNonOH(dfttt)

In [86]:
# Find binary columns
dL = findBinaryColumns(dfttt)

output [1 0]
['output']


In [87]:
# Save non-OH dataset
dfttt.to_csv("./data/TicTacToe.csv", index=False)

In [88]:
# Generate OH version of DS
OHdfttt = pd.get_dummies(dfttt, columns=getNoBinaryOut(dfttt, dL))

In [89]:
# Rename and Reorder columns
OHdfttt = renameReorderOH(OHdfttt)

In [90]:
# Save OH dataset
OHdfttt.to_csv("./data/OH_TicTacToe.csv", index=False)

In [91]:
###########################################################################

In [92]:
def verifyCell(df):
    """ Print the label of every column that has cells with non-numeric characters

    A cell counts as non-numeric when its string form contains any character
    other than digits, ".", "-", "|" or "e". Columns are accessed by the
    labels 0..n-1.

    Args:
        df (DataFrame): DataFrame to be analyzed
    Returns:
        (None)
    """
    not_number_like = re.compile(r'[^\d\.\-|e]')
    for i in range(df.shape[1]):
        # Count the offending characters found in each cell of the column
        offending = df[i].map(lambda cell: len(not_number_like.findall(str(cell))))
        if offending.sum() > 0:
            print(i)
        

In [93]:
# Create a scaler to deal with numerical features
scaler = StandardScaler()

In [94]:
###########################################################################

In [95]:
# BCW - all numerical

In [96]:
# Read raw data
dfbcw = pd.read_csv("./rawData/wpbc.data", header=None)

In [97]:
# Verify columns with non-numeric entries
verifyCell(dfbcw)
# Column 1 is the output, while column 34 has a few missing values

1
34


In [98]:
# Delete column 0 as it is an index and column 34 because it has missing values
dfbcw.drop(columns=[0, 34], inplace=True)

In [99]:
# Define output column
dfbcw[1] = dfbcw[1].map({"N":0, "R":1})
dfbcw.rename(columns={1: "output"}, inplace=True)

In [100]:
# Rename and Reorder columns
dfbcw = renameReorderNonOH(dfbcw)

In [101]:
# Find binary columns
dL = findBinaryColumns(dfbcw)

output [0 1]
['output']


In [102]:
# Save non-OH dataset
dfbcw.to_csv("./data/BCW.csv", index=False)

In [103]:
# Generate Normalized version of data
dfbcwNORM = dfbcw.copy()
# Select the feature columns by label: a positional `.loc[:, :32]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
bcw_features = dfbcwNORM.columns.drop("output")
dfbcwNORM[bcw_features] = scaler.fit_transform(dfbcwNORM[bcw_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/BCW.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [104]:
# Save normalized data
dfbcwNORM.to_csv("./data/NORM_BCW.csv", index=False)

In [105]:
###########################################################################

In [106]:
# Ecoli - all numerical

In [107]:
# Read raw data
# Runs of spaces are collapsed so that a single-space separator parses the file;
# the context manager closes the raw file once its text has been read
with open("./rawData/ecoli.data") as raw_file:
    dfeco = pd.read_csv(StringIO(re.sub(' +', ' ', raw_file.read())), header=None, sep=" ")

In [108]:
# Verify columns with non-numeric entries
verifyCell(dfeco)

0
8


In [109]:
# Delete column 0 because it has another label that is not used in this task
dfeco.drop(columns=[0], inplace=True)

In [110]:
# Define output column
dfeco[8] = dfeco[8].map({"cp":0, "im":1, "imS":2, "imL":3, "imU":4, "om":5, "omL":6, "pp":7})
dfeco.rename(columns={8: "output"}, inplace=True)

In [111]:
# Rename and Reorder columns
dfeco = renameReorderNonOH(dfeco)

In [112]:
# Find binary columns
dL = findBinaryColumns(dfeco)

2 [0.48 1.  ]
3 [0.5 1. ]
[2, 3]


In [113]:
# Save non-OH dataset
dfeco.to_csv("./data/Ecoli.csv", index=False)

In [114]:
# Generate Normalized version of data
dfecoNORM = dfeco.copy()
# Select the feature columns by label: a positional `.loc[:, :7]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
eco_features = dfecoNORM.columns.drop("output")
dfecoNORM[eco_features] = scaler.fit_transform(dfecoNORM[eco_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/Ecoli.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [115]:
# Save normalized data
dfecoNORM.to_csv("./data/NORM_Ecoli.csv", index=False)

In [116]:
###########################################################################

In [117]:
# Iris - all numerical

In [118]:
# Read raw data
dfiri = pd.read_csv("./rawData/iris.data", header=None)

In [119]:
# Verify columns with non-numeric entries
verifyCell(dfiri)

4


In [120]:
# No columns to be deleted

In [121]:
# Define output column
# Class codes are kept contiguous (0-2); mapping "Iris-virginica" to 3 would leave code 2 unused
dfiri[4] = dfiri[4].map({"Iris-setosa":0, "Iris-versicolor":1, "Iris-virginica": 2})
dfiri.rename(columns={4: "output"}, inplace=True)

In [122]:
# Rename and Reorder columns
dfiri = renameReorderNonOH(dfiri)

In [123]:
# Find binary columns
dL = findBinaryColumns(dfiri)

[]


In [124]:
# Save non-OH dataset
dfiri.to_csv("./data/Iris.csv", index=False)

In [125]:
# Generate Normalized version of data
dfiriNORM = dfiri.copy()
# Select the feature columns by label: a positional `.loc[:, :4]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
iri_features = dfiriNORM.columns.drop("output")
dfiriNORM[iri_features] = scaler.fit_transform(dfiriNORM[iri_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/Iris.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [126]:
# Save normalized data
dfiriNORM.to_csv("./data/NORM_Iris.csv", index=False)

In [127]:
###########################################################################

In [128]:
# ISOLET - all numerical

In [129]:
# Read raw data
dfiso1 = pd.read_csv("./rawData/isolet1+2+3+4.data", header=None)
dfiso2 = pd.read_csv("./rawData/isolet5.data", header=None)

# ignore_index gives the stacked rows a fresh 0..n-1 index; without it the
# row labels of the second file would repeat those of the first
dfiso = pd.concat([dfiso1, dfiso2], ignore_index=True)

In [130]:
# Verify columns with non-numeric entries
verifyCell(dfiso)

In [131]:
# No columns to be deleted

In [132]:
# Define output column
dfiso.rename(columns={617: "output"}, inplace=True)

In [133]:
# Rename and Reorder columns
dfiso = renameReorderNonOH(dfiso)

In [134]:
# Find binary columns
dL = findBinaryColumns(dfiso)

577 [-1.  1.]
578 [ 1. -1.]
579 [-1.  1.]
584 [-1.  1.]
[577, 578, 579, 584]


In [135]:
# Save non-OH dataset
dfiso.to_csv("./data/ISOLET.csv", index=False)

In [136]:
# Generate Normalized version of data
dfisoNORM = dfiso.copy()
# Select the feature columns by label: a positional `.loc[:, :617]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
iso_features = dfisoNORM.columns.drop("output")
dfisoNORM[iso_features] = scaler.fit_transform(dfisoNORM[iso_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/ISOLET.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [137]:
# Save normalized data
dfisoNORM.to_csv("./data/NORM_ISOLET.csv", index=False)

In [138]:
###########################################################################

In [139]:
# SDD - all numerical

In [140]:
# Read raw data
dfsdd = pd.read_csv("./rawData/Sensorless_drive_diagnosis.txt", header=None, sep=" ")

In [141]:
# Verify columns with non-numeric entries
verifyCell(dfsdd)

In [142]:
# No columns to be deleted

In [143]:
# Define output column
dfsdd.rename(columns={48: "output"}, inplace=True)

In [144]:
# Rename and Reorder columns
dfsdd = renameReorderNonOH(dfsdd)

In [145]:
# Find binary columns
dL = findBinaryColumns(dfsdd)

[]


In [146]:
# Save non-OH dataset
dfsdd.to_csv("./data/SDD.csv", index=False)

In [147]:
# Generate Normalized version of data
dfsddNORM = dfsdd.copy()
# Select the feature columns by label: a positional `.loc[:, :48]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
sdd_features = dfsddNORM.columns.drop("output")
dfsddNORM[sdd_features] = scaler.fit_transform(dfsddNORM[sdd_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/SDD.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [148]:
# Save normalized data
dfsddNORM.to_csv("./data/NORM_SDD.csv", index=False)

In [149]:
###########################################################################

In [150]:
# PBC - all numerical

In [151]:
# Read raw data
# Runs of spaces are collapsed so that a single-space separator parses the file;
# the context manager closes the raw file once its text has been read
with open("./rawData/page-blocks.data") as raw_file:
    dfpbc = pd.read_csv(StringIO(re.sub(' +', ' ', raw_file.read())), header=None, sep=" ")

In [152]:
# Verify columns with non-numeric entries
verifyCell(dfpbc)

0


In [153]:
# Delete column 0 as it is an artifact from import
dfpbc.drop(columns=[0], inplace=True)

In [154]:
# Define output column
dfpbc.rename(columns={11: "output"}, inplace=True)

In [155]:
# Rename and Reorder columns
dfpbc = renameReorderNonOH(dfpbc)

In [156]:
# Find binary columns
dL = findBinaryColumns(dfpbc)

[]


In [157]:
# Save non-OH dataset
dfpbc.to_csv("./data/PBC.csv", index=False)

In [158]:
# Generate Normalized version of data
dfpbcNORM = dfpbc.copy()
# Select the feature columns by label: a positional `.loc[:, :10]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
pbc_features = dfpbcNORM.columns.drop("output")
dfpbcNORM[pbc_features] = scaler.fit_transform(dfpbcNORM[pbc_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/PBC.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [159]:
# Save normalized data
dfpbcNORM.to_csv("./data/NORM_PBC.csv", index=False)

In [160]:
###########################################################################

In [161]:
# CMSC - all numerical

In [162]:
# Read raw data
# The context manager closes the raw file once its text has been read
with open("./rawData/pop_failures.dat") as raw_file:
    cmsc_lines = raw_file.read().split("\n")
# Rebuild every line with exactly one space between its non-empty tokens
cmsc_text = "\n".join(" ".join(item for item in line.split(" ") if item != "") for line in cmsc_lines)
dfcmsc = pd.read_csv(StringIO(re.sub(' +', ' ', cmsc_text)), header=None, sep=" ")
# Drop the first row (presumably the header line of the file - TODO confirm)
dfcmsc = dfcmsc.drop(0).reset_index(drop=True)
# Drop the first two columns and relabel the remaining ones as 0..n-1
dfcmsc = dfcmsc.drop(columns=[0,1])
dfcmsc.columns = [*range(dfcmsc.shape[1])]

In [163]:
# Verify columns with non-numeric entries
verifyCell(dfcmsc)

In [164]:
# No columns to be deleted

In [165]:
# Define output column
dfcmsc.rename(columns={18: "output"}, inplace=True)

In [166]:
# Rename and Reorder columns
dfcmsc = renameReorderNonOH(dfcmsc)

In [167]:
# Find binary columns
dL = findBinaryColumns(dfcmsc)

output ['0' '1']
['output']


In [168]:
# Save non-OH dataset
dfcmsc.to_csv("./data/CMSC.csv", index=False)

In [169]:
# Generate Normalized version of data
dfcmscNORM = dfcmsc.copy()
# Select the feature columns by label: a positional `.loc[:, :18]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
cmsc_features = dfcmscNORM.columns.drop("output")
dfcmscNORM[cmsc_features] = scaler.fit_transform(dfcmscNORM[cmsc_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/CMSC.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [170]:
# Save normalized data
dfcmscNORM.to_csv("./data/NORM_CMSC.csv", index=False)

In [171]:
###########################################################################

In [172]:
# MAGIC GT - all numerical

In [173]:
# Read raw data
dfmagt = pd.read_csv("./rawData/magic04.data", header=None)

In [174]:
# Verify columns with non-numeric entries
verifyCell(dfmagt)

10


In [175]:
# No columns to be deleted

In [176]:
# Define output column
dfmagt[10] = dfmagt[10].map({"g":0, "h":1})
dfmagt.rename(columns={10: "output"}, inplace=True)

In [177]:
# Rename and Reorder columns
dfmagt = renameReorderNonOH(dfmagt)

In [178]:
# Find binary columns
dL = findBinaryColumns(dfmagt)

output [0 1]
['output']


In [179]:
# Save non-OH dataset
dfmagt.to_csv("./data/MagicGT.csv", index=False)

In [180]:
# Generate Normalized version of data
dfmagtNORM = dfmagt.copy()
# Select the feature columns by label: a positional `.loc[:, :10]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
magt_features = dfmagtNORM.columns.drop("output")
dfmagtNORM[magt_features] = scaler.fit_transform(dfmagtNORM[magt_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/MagicGT.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [181]:
# Save normalized data
dfmagtNORM.to_csv("./data/NORM_MagicGT.csv", index=False)

In [182]:
###########################################################################

In [183]:
# Wine - all numerical

In [184]:
# Read raw data
dfwin = pd.read_csv("./rawData/wine.data", header=None)

In [185]:
# Verify columns with non-numeric entries
verifyCell(dfwin)

In [186]:
# No columns to be deleted

In [187]:
# Define output column
dfwin.rename(columns={0: "output"}, inplace=True)

In [188]:
# Rename and Reorder columns
dfwin = renameReorderNonOH(dfwin)

In [189]:
# Find binary columns
dL = findBinaryColumns(dfwin)

[]


In [190]:
# Save non-OH dataset
dfwin.to_csv("./data/Wine.csv", index=False)

In [191]:
# Generate Normalized version of data
dfwinNORM = dfwin.copy()
# Select the feature columns by label: a positional `.loc[:, :13]` slice on the
# mixed int/"output" column index is not supported by recent pandas releases
win_features = dfwinNORM.columns.drop("output")
dfwinNORM[win_features] = scaler.fit_transform(dfwinNORM[win_features])
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/Wine.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [192]:
# Save normalized data
dfwinNORM.to_csv("./data/NORM_Wine.csv", index=False)

In [193]:
###########################################################################

In [194]:
# Default of CCC
# After Clean:
# Numerical [0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]
# Categorical [1, 2, 3, 5, 6, 7, 8, 9, 10]

In [195]:
# Read raw data
dfdccc = pd.read_excel("./rawData/default of credit card clients.xls", header=None).drop(columns=[0])
dfdccc = dfdccc.drop([0,1])
dfdccc = dfdccc.reset_index(drop=True)
dfdccc.columns = [*range(dfdccc.shape[1])]

In [196]:
# Create and rename columns of Numerical DF
dfdcccNUM = dfdccc[[0, 4, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]].copy()
dfdcccNUM.columns = [*range(dfdcccNUM.shape[1])]

In [197]:
# Create and rename columns of Categorical DF
dfdcccCAT = dfdccc[[1, 2, 3, 5, 6, 7, 8, 9, 10]].copy()
dfdcccCAT.columns = [*range(dfdcccCAT.shape[1])]

In [198]:
# Create output column
dfdccc.rename(columns={23: "output"}, inplace=True)
dfdcccOUT = dfdccc["output"].copy()

In [199]:
# Treat Numerical Variables

In [200]:
# Verify columns with non-numeric entries
verifyCell(dfdcccNUM)

In [201]:
# No columns to be deleted

In [202]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfdcccNUM = renameReorderNonOH(dfdcccNUM)



In [203]:
# Find binary columns
dL = findBinaryColumns(dfdcccNUM)

[]


In [204]:
# Save non-OH dataset

In [205]:
# Generate Normalized version of data
# Build the normalized frame from the scaler output, keeping the original row
# index and column labels; this avoids the hard-coded `:14` column bound and
# writing float results into the existing columns in place
dfdcccNUM_NORM = pd.DataFrame(
    scaler.fit_transform(dfdcccNUM),
    index=dfdcccNUM.index,
    columns=dfdcccNUM.columns,
)
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/DefaultOfCCC.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [206]:
# Treat Categorical Variables

In [207]:
# Rename and Reorder columns

In [208]:
dL = findBinaryColumns(dfdcccCAT)

0 [2 1]
[0]


In [209]:
dfdcccCAT[0] = dfdcccCAT[0].map({1:0, 2:1})

In [210]:
dL = findBinaryColumns(dfdcccCAT)

0 [1 0]
[0]


In [211]:
# Generate OH version of DS
dfdcccCAT_OH = pd.get_dummies(dfdcccCAT, columns=getNoBinaryOut(dfdcccCAT, dL))

In [212]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfdcccCAT_OH = renameReorderOH(dfdcccCAT_OH)



In [213]:
# Rename Numerical columns to not have repeated column names
dfdcccNUM.columns = [c+dfdcccCAT.shape[1] for c in range(dfdcccNUM.shape[1])]
dfdcccNUM_NORM.columns = [c+dfdcccCAT.shape[1] for c in range(dfdcccNUM_NORM.shape[1])]

In [214]:
# Create non-treated dataset
pd.concat([dfdcccCAT, dfdcccNUM, dfdcccOUT], axis=1).to_csv("./data/DefaultOfCCC.csv", index=False)

In [215]:
# Create treated datasets
pd.concat([dfdcccCAT_OH, dfdcccNUM, dfdcccOUT], axis=1).to_csv("./data/OH_DefaultOfCCC.csv", index=False)
pd.concat([dfdcccCAT, dfdcccNUM_NORM, dfdcccOUT], axis=1).to_csv("./data/NORM_DefaultOfCCC.csv", index=False)
pd.concat([dfdcccCAT_OH, dfdcccNUM_NORM, dfdcccOUT], axis=1).to_csv("./data/OH_NORM_DefaultOfCCC.csv", index=False)

In [216]:
###########################################################################

In [217]:
# Student Performance (Port)
# After Clean:
# Numerical [2, 6, 7, 12, 13, 14, 23, 24, 25, 26, 27, 28, 29]
# Categorical [0, 1, 3, 4, 5, 8, 9, 10, 11, 15, 16, 17, 18, 19, 20, 21, 22]

In [218]:
# Read raw data
dfspp = pd.read_csv("./rawData/student-por.csv", sep=";")
dfspp = dfspp.drop(columns=["G1", "G2"])
# Binarize G3: 1 when the grade is above the mean grade, 0 otherwise.
# The mean is computed once instead of once per row.
g3_mean = dfspp["G3"].mean()
dfspp["G3"] = (dfspp["G3"] > g3_mean).astype(int)
dfspp.columns = [*range(dfspp.shape[1])]

In [219]:
# Create and rename columns of Numerical DF
dfsppNUM = dfspp[[2, 6, 7, 12, 13, 14, 23, 24, 25, 26, 27, 28, 29]].copy()
dfsppNUM.columns = [*range(dfsppNUM.shape[1])]

In [220]:
# Create and rename columns of Categorical DF
dfsppCAT = dfspp[[0, 1, 3, 4, 5, 8, 9, 10, 11, 15, 16, 17, 18, 19, 20, 21, 22]].copy()
dfsppCAT.columns = [*range(dfsppCAT.shape[1])]

In [221]:
# Create output column
dfspp.rename(columns={30: "output"}, inplace=True)
dfsppOUT = dfspp["output"].copy()

In [222]:
# Treat Numerical Variables

In [223]:
# Verify columns with non-numeric entries
verifyCell(dfsppNUM)

In [224]:
# Delete columns without all numbers, except output

In [225]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfsppNUM = renameReorderNonOH(dfsppNUM)



In [226]:
# Find binary columns
dL = findBinaryColumns(dfsppNUM)

[]


In [227]:
# Save non-OH dataset

In [228]:
# Generate Normalized version of data
# Build the normalized frame from the scaler output, keeping the original row
# index and column labels; this avoids the hard-coded `:13` column bound and
# writing float results into the existing columns in place
dfsppNUM_NORM = pd.DataFrame(
    scaler.fit_transform(dfsppNUM),
    index=dfsppNUM.index,
    columns=dfsppNUM.columns,
)
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/StudentPerf.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [229]:
# Treat Categorical Variables

In [230]:
# Rename and Reorder columns

In [231]:
dL = findBinaryColumns(dfsppCAT)

0 ['GP' 'MS']
1 ['F' 'M']
2 ['U' 'R']
3 ['GT3' 'LE3']
4 ['A' 'T']
9 ['yes' 'no']
10 ['no' 'yes']
11 ['no' 'yes']
12 ['no' 'yes']
13 ['yes' 'no']
14 ['yes' 'no']
15 ['no' 'yes']
16 ['no' 'yes']
[0, 1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16]


In [232]:
# Adjust binary variables to have only 0 and/or 1
dfsppCAT[0] = dfsppCAT[0].map({"GP": 0, "MS":1})
dfsppCAT[1] = dfsppCAT[1].map({"F": 0, "M":1})
dfsppCAT[2] = dfsppCAT[2].map({"U": 0, "R":1})
dfsppCAT[3] = dfsppCAT[3].map({"GT3": 0, "LE3":1})
dfsppCAT[4] = dfsppCAT[4].map({"A": 0, "T":1})
dfsppCAT[9] = dfsppCAT[9].map({"no": 0, "yes":1})
dfsppCAT[10] = dfsppCAT[10].map({"no": 0, "yes":1})
dfsppCAT[11] = dfsppCAT[11].map({"no": 0, "yes":1})
dfsppCAT[12] = dfsppCAT[12].map({"no": 0, "yes":1})
dfsppCAT[13] = dfsppCAT[13].map({"no": 0, "yes":1})
dfsppCAT[14] = dfsppCAT[14].map({"no": 0, "yes":1})
dfsppCAT[15] = dfsppCAT[15].map({"no": 0, "yes":1})
dfsppCAT[16] = dfsppCAT[16].map({"no": 0, "yes":1})

In [233]:
dL = findBinaryColumns(dfsppCAT)

0 [0 1]
1 [0 1]
2 [0 1]
3 [0 1]
4 [0 1]
9 [1 0]
10 [0 1]
11 [0 1]
12 [0 1]
13 [1 0]
14 [1 0]
15 [0 1]
16 [0 1]
[0, 1, 2, 3, 4, 9, 10, 11, 12, 13, 14, 15, 16]


In [234]:
# Generate OH version of DS
dfsppCAT_OH = pd.get_dummies(dfsppCAT, columns=getNoBinaryOut(dfsppCAT, dL))

In [235]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfsppCAT_OH = renameReorderOH(dfsppCAT_OH)



In [236]:
# Rename Numerical columns to not have repeated column names
dfsppNUM.columns = [c+dfsppCAT.shape[1] for c in range(dfsppNUM.shape[1])]
dfsppNUM_NORM.columns = [c+dfsppCAT.shape[1] for c in range(dfsppNUM_NORM.shape[1])]

In [237]:
# Create non-treated dataset
pd.concat([dfsppCAT, dfsppNUM, dfsppOUT], axis=1).to_csv("./data/StudentPerf.csv", index=False)

In [238]:
# Create treated datasets
pd.concat([dfsppCAT_OH, dfsppNUM, dfsppOUT], axis=1).to_csv("./data/OH_StudentPerf.csv", index=False)
pd.concat([dfsppCAT, dfsppNUM_NORM, dfsppOUT], axis=1).to_csv("./data/NORM_StudentPerf.csv", index=False)
pd.concat([dfsppCAT_OH, dfsppNUM_NORM, dfsppOUT], axis=1).to_csv("./data/OH_NORM_StudentPerf.csv", index=False)

In [239]:
###########################################################################

In [240]:
# Adult
# After Clean:
# Numerical [0, 2, 4, 10, 11, 12]
# Categorical [1, 3, 5, 6, 7, 8, 9, 13]

In [241]:
# Read raw data
# All spaces are stripped from the raw text before parsing; the context
# manager closes the raw file once its text has been read
with open("./rawData/adult.data") as raw_file:
    dfadu = pd.read_csv(StringIO(re.sub(' +', '', raw_file.read())), header=None)
dfadu[14] = dfadu[14].map({"<=50K":0, ">50K":1})

In [242]:
# Create and rename columns of Numerical DF
dfaduNUM = dfadu[[0, 2, 4, 10, 11, 12]].copy()
dfaduNUM.columns = [*range(dfaduNUM.shape[1])]

In [243]:
# Create and rename columns of Categorical DF
dfaduCAT = dfadu[[1, 3, 5, 6, 7, 8, 9, 13]].copy()
dfaduCAT.columns = [*range(dfaduCAT.shape[1])]

In [244]:
# Create output column
dfadu.rename(columns={14: "output"}, inplace=True)
dfaduOUT = dfadu["output"].copy()

In [245]:
# Treat Numerical Variables

In [246]:
# Verify columns with non-numeric entries
verifyCell(dfaduNUM)

In [247]:
# Delete columns without all numbers, except output

In [248]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfaduNUM = renameReorderNonOH(dfaduNUM)



In [249]:
# Find binary columns
dL = findBinaryColumns(dfaduNUM)

[]


In [250]:
# Save non-OH dataset

In [251]:
# Generate Normalized version of data
# Build the normalized frame from the scaler output, keeping the original row
# index and column labels; this avoids the hard-coded `:6` column bound and
# writing float results into the existing columns in place
dfaduNUM_NORM = pd.DataFrame(
    scaler.fit_transform(dfaduNUM),
    index=dfaduNUM.index,
    columns=dfaduNUM.columns,
)
# Persist the fitted scaler; the context manager closes the file handle
with open('./scalers/Adult.sc', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

In [252]:
# Treat Categorical Variables

In [253]:
# Rename and Reorder columns

In [254]:
dL = findBinaryColumns(dfaduCAT)

6 ['Male' 'Female']
[6]


In [255]:
# Adjust binary features to only have 0 and/or 1
dfaduCAT[6] = dfaduCAT[6].map({"Male":0, "Female":1})

In [256]:
dL = findBinaryColumns(dfaduCAT)

6 [0 1]
[6]


In [257]:
# Generate OH version of DS
dfaduCAT_OH = pd.get_dummies(dfaduCAT, columns=getNoBinaryOut(dfaduCAT, dL))

In [258]:
# Rename and Reorder columns, no output was found because it's separated in another variable
dfaduCAT_OH = renameReorderOH(dfaduCAT_OH)



In [259]:
# Rename Numerical columns to not have repeated column names
dfaduNUM.columns = [c+dfaduCAT.shape[1] for c in range(dfaduNUM.shape[1])]
dfaduNUM_NORM.columns = [c+dfaduCAT.shape[1] for c in range(dfaduNUM_NORM.shape[1])]

In [260]:
# Create non-treated dataset
pd.concat([dfaduCAT, dfaduNUM, dfaduOUT], axis=1).to_csv("./data/Adult.csv", index=False)

In [261]:
# Create treated datasets
pd.concat([dfaduCAT_OH, dfaduNUM, dfaduOUT], axis=1).to_csv("./data/OH_Adult.csv", index=False)
pd.concat([dfaduCAT, dfaduNUM_NORM, dfaduOUT], axis=1).to_csv("./data/NORM_Adult.csv", index=False)
pd.concat([dfaduCAT_OH, dfaduNUM_NORM, dfaduOUT], axis=1).to_csv("./data/OH_NORM_Adult.csv", index=False)

In [262]:
###########################################################################

In [263]:
# Internet Adv.
# After Clean:
# Numerical [0,1,2]
# Categorical [3:1558]

In [264]:
# Read raw data
dfiadv = pd.read_csv("./rawData/ad.data", header=None)
dfiadv[1558] = dfiadv[1558].map({"nonad.":0, "ad.":1})
dfiadv[0] = dfiadv[0].map(lambda x: "".join(str(x).split()))
dfiadv[1] = dfiadv[1].map(lambda x: "".join(str(x).split()))
dfiadv[2] = dfiadv[2].map(lambda x: "".join(str(x).split()))
dfiadv[3] = dfiadv[3].map(lambda x: "".join(str(x).split()))
dfiadv = dfiadv[dfiadv[2].map(lambda x: not bool(re.findall(r'[^\d\.\-\ |e]', str(x))))]
dfiadv = dfiadv[dfiadv[3].map(lambda x: not bool(re.findall(r'[^\d\.\-\ |e]', str(x))))]

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [265]:
# Numerical DF: copy raw columns 0, 1 and 2 and relabel them 0..n-1
dfiadvNUM = dfiadv.loc[:, [0, 1, 2]].copy()
dfiadvNUM.columns = list(range(dfiadvNUM.shape[1]))

In [266]:
# Categorical DF: copy raw columns 3 through 1557 (label slice, both ends
# included) and relabel them 0..n-1
iadvCatBlock = dfiadv.loc[:, 3:1557]
dfiadvCAT = iadvCatBlock.copy()
dfiadvCAT.columns = list(range(dfiadvCAT.shape[1]))

In [267]:
# Output column: relabel raw column 1558 as "output" and keep its own copy
dfiadv = dfiadv.rename(columns={1558: "output"})
dfiadvOUT = dfiadv["output"].copy()

In [268]:
# Treat Numerical Variables

In [269]:
# Verify columns with non-numeric entries
# NOTE(review): verifyCell is defined earlier in the notebook; presumably it
# reports the entries that are not numeric — confirm against its definition.
verifyCell(dfiadvNUM)

In [270]:
# Delete columns without all numbers, except output

In [271]:
# Rename and Reorder columns, no output was found because it's separated in another variable
# (dfiadvNUM has no "output" column, so renameReorderNonOH only prints its
# warning and returns the frame unchanged)
dfiadvNUM = renameReorderNonOH(dfiadvNUM)



In [272]:
# Find binary columns among the numerical features
# (the helper printed an empty list: none of them is binary)
dL = findBinaryColumns(dfiadvNUM)

[]


In [273]:
# Save non-OH dataset

In [274]:
# Generate Normalized version of data
# The scaled values are wrapped in a new frame that reuses the index and the
# column labels of dfiadvNUM. This replaces the former `.loc[:, :14] = ...`
# assignment, which relied on an unrelated label bound and on writing floats
# in place into columns of another dtype.
dfiadvNUM_NORM = pd.DataFrame(
    scaler.fit_transform(dfiadvNUM),
    index=dfiadvNUM.index,
    columns=dfiadvNUM.columns,
)
# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed even if pickling fails
with open(f'./scalers/InternetAdv.sc', 'wb') as iadvScalerFile:
    pickle.dump(scaler, iadvScalerFile)

In [275]:
# Treat Categorical Variables

In [276]:
# Rename and Reorder columns

In [277]:
# Find binary columns among the categorical features.
# NOTE(review): the helper prints one line per binary column, so this cell
# produces a very long output for this dataset.
dL = findBinaryColumns(dfiadvCAT)

0 ['1' '0']
1 [0 1]
2 [0]
3 [0 1]
4 [0 1]
5 [0 1]
6 [0 1]
7 [0 1]
8 [0 1]
9 [0 1]
10 [0]
11 [0 1]
12 [0 1]
13 [0 1]
14 [0 1]
15 [0 1]
16 [0 1]
17 [0 1]
18 [0 1]
19 [0 1]
20 [0 1]
21 [0 1]
22 [0 1]
23 [0 1]
24 [0 1]
25 [0 1]
26 [0 1]
27 [0 1]
28 [0 1]
29 [0 1]
30 [0]
31 [0 1]
32 [0 1]
33 [0 1]
34 [0 1]
35 [0]
36 [0 1]
37 [0 1]
38 [0 1]
39 [0 1]
40 [0 1]
41 [0 1]
42 [0]
43 [0 1]
44 [0 1]
45 [0 1]
46 [0 1]
47 [0 1]
48 [0 1]
49 [0 1]
50 [0 1]
51 [0 1]
52 [0 1]
53 [0 1]
54 [0 1]
55 [0 1]
56 [0 1]
57 [0 1]
58 [0]
59 [0]
60 [0 1]
61 [0 1]
62 [0 1]
63 [0 1]
64 [0 1]
65 [0 1]
66 [0 1]
67 [0 1]
68 [0 1]
69 [0 1]
70 [0 1]
71 [0 1]
72 [0 1]
73 [0 1]
74 [0]
75 [0]
76 [0 1]
77 [0 1]
78 [0 1]
79 [0]
80 [0 1]
81 [0 1]
82 [0 1]
83 [0 1]
84 [0 1]
85 [0 1]
86 [0 1]
87 [0 1]
88 [0 1]
89 [0]
90 [0 1]
91 [0 1]
92 [1 0]
93 [0 1]
94 [0 1]
95 [0 1]
96 [0 1]
97 [0 1]
98 [0 1]
99 [0 1]
100 [0 1]
101 [0 1]
102 [0 1]
103 [0 1]
104 [0 1]
105 [0 1]
106 [0 1]
107 [0 1]
108 [0]
109 [0 1]
110 [0 1]
111 [0 1]
112 [0]
11

959 [0 1]
960 [0 1]
961 [0 1]
962 [0 1]
963 [0]
964 [0 1]
965 [0 1]
966 [0 1]
967 [0 1]
968 [0 1]
969 [0 1]
970 [0 1]
971 [0 1]
972 [0 1]
973 [0 1]
974 [0 1]
975 [0 1]
976 [0 1]
977 [0]
978 [0 1]
979 [0 1]
980 [0 1]
981 [0 1]
982 [0 1]
983 [0 1]
984 [1 0]
985 [0]
986 [0 1]
987 [0 1]
988 [0 1]
989 [0 1]
990 [0 1]
991 [0 1]
992 [0 1]
993 [0 1]
994 [0 1]
995 [0 1]
996 [0 1]
997 [0 1]
998 [0 1]
999 [0 1]
1000 [0 1]
1001 [0 1]
1002 [0 1]
1003 [0 1]
1004 [0 1]
1005 [0 1]
1006 [0 1]
1007 [0 1]
1008 [0 1]
1009 [0]
1010 [0 1]
1011 [0 1]
1012 [0 1]
1013 [0 1]
1014 [0 1]
1015 [0 1]
1016 [0 1]
1017 [0 1]
1018 [0 1]
1019 [0 1]
1020 [0 1]
1021 [0 1]
1022 [0 1]
1023 [0 1]
1024 [0 1]
1025 [0 1]
1026 [0 1]
1027 [0 1]
1028 [0 1]
1029 [0 1]
1030 [0]
1031 [0]
1032 [0 1]
1033 [0 1]
1034 [0]
1035 [0 1]
1036 [0 1]
1037 [0 1]
1038 [0 1]
1039 [0 1]
1040 [0 1]
1041 [0 1]
1042 [0 1]
1043 [0 1]
1044 [0 1]
1045 [0 1]
1046 [0 1]
1047 [0 1]
1048 [0 1]
1049 [0 1]
1050 [0 1]
1051 [1 0]
1052 [0 1]
1053 [0 1]
1054 [0 1]

In [278]:
# Generate OH version of DS, not needed as all categorical features are binary
# (dfiadvCAT_OH is bound to the same object as dfiadvCAT, it is not a copy)
dfiadvCAT_OH = dfiadvCAT

In [279]:
# Rename and Reorder columns, not needed as all categorical features are binary
# (the self-assignment below has no effect; presumably kept so this dataset
# follows the same sequence of steps as the others)
dfiadvCAT_OH = dfiadvCAT_OH

In [280]:
# Shift the numerical column labels so they start right after the last
# categorical label; this avoids repeated column names after concatenation
iadvOffset = dfiadvCAT.shape[1]
dfiadvNUM.columns = list(range(iadvOffset, iadvOffset + dfiadvNUM.shape[1]))
dfiadvNUM_NORM.columns = list(range(iadvOffset, iadvOffset + dfiadvNUM_NORM.shape[1]))

In [281]:
# Create non-treated dataset: categorical + raw numerical + output, side by side
iadvPlain = pd.concat([dfiadvCAT, dfiadvNUM, dfiadvOUT], axis=1)
iadvPlain.to_csv("./data/InternetAdv.csv", index=False)

In [282]:
# Create treated datasets: one-hot only, normalized only, and both combined
for iadvCat, iadvNum, iadvPath in (
    (dfiadvCAT_OH, dfiadvNUM, "./data/OH_InternetAdv.csv"),
    (dfiadvCAT, dfiadvNUM_NORM, "./data/NORM_InternetAdv.csv"),
    (dfiadvCAT_OH, dfiadvNUM_NORM, "./data/OH_NORM_InternetAdv.csv"),
):
    pd.concat([iadvCat, iadvNum, dfiadvOUT], axis=1).to_csv(iadvPath, index=False)

In [283]:
###########################################################################

In [284]:
# Statlog GC
# After Clean:
# Numerical [1, 4, 7, 10, 12, 15, 17]
# Categorical [0, 2, 3, 5, 6, 8, 9, 11, 13, 14,16, 18, 19]

In [285]:
# Read raw data (space-separated file, no header row)
dfslgc = pd.read_csv("./rawData/german.data", header=None, sep=" ")
# Re-encode the label in column 20: 1 -> 0, 2 -> 1
dfslgc[20] = dfslgc[20].map({1:0, 2:1})

In [286]:
# Numerical DF: copy the numerical raw columns and relabel them 0..n-1
slgcNumCols = [1, 4, 7, 10, 12, 15, 17]
dfslgcNUM = dfslgc.loc[:, slgcNumCols].copy()
dfslgcNUM.columns = list(range(dfslgcNUM.shape[1]))

In [287]:
# Categorical DF: copy the categorical raw columns and relabel them 0..n-1
slgcCatCols = [0, 2, 3, 5, 6, 8, 9, 11, 13, 14, 16, 18, 19]
dfslgcCAT = dfslgc.loc[:, slgcCatCols].copy()
dfslgcCAT.columns = list(range(dfslgcCAT.shape[1]))

In [288]:
# Output column: relabel raw column 20 as "output" and keep its own copy
dfslgc = dfslgc.rename(columns={20: "output"})
dfslgcOUT = dfslgc["output"].copy()

In [289]:
# Treat Numerical Variables

In [290]:
# Verify columns with non-numeric entries
# NOTE(review): verifyCell is defined earlier in the notebook; presumably it
# reports the entries that are not numeric — confirm against its definition.
verifyCell(dfslgcNUM)

In [291]:
# Delete columns without all numbers, except output

In [292]:
# Rename and Reorder columns, no output was found because it's separated in another variable
# (dfslgcNUM has no "output" column, so renameReorderNonOH only prints its
# warning and returns the frame unchanged)
dfslgcNUM = renameReorderNonOH(dfslgcNUM)



In [293]:
# Find binary columns
# NOTE(review): this inspects the full raw frame dfslgc rather than dfslgcNUM,
# so the list also contains raw categorical columns and "output". dL is
# recomputed from dfslgcCAT before it is next used — confirm dfslgc is the
# intended frame here.
dL = findBinaryColumns(dfslgc)

17 [1 2]
18 ['A192' 'A191']
19 ['A201' 'A202']
output [0 1]
[17, 18, 19, 'output']


In [294]:
# Re-encode the binary columns of the raw frame as 0/1.
# NOTE(review): dfslgcNUM and dfslgcCAT were copied from dfslgc before this
# cell, so these assignments do not reach them, and the exported CSVs are built
# only from those copies plus dfslgcOUT. Raw column 17 is one of the columns
# selected into dfslgcNUM, which therefore keeps its 1/2 values — confirm
# whether the mapping was meant to be applied there.
dfslgc[17] = dfslgc[17].map({1:0, 2:1})
dfslgc[18] = dfslgc[18].map({'A192':0, 'A191':1})
dfslgc[19] = dfslgc[19].map({'A201':0, 'A202':1})

In [295]:
# Re-run the check on the raw frame: its binary columns should now only hold 0/1
dL = findBinaryColumns(dfslgc)

17 [0 1]
18 [0 1]
19 [0 1]
output [0 1]
[17, 18, 19, 'output']


In [296]:
# Save non-OH dataset

In [297]:
# Generate Normalized version of data
# The scaled values are wrapped in a new frame that reuses the index and the
# column labels of dfslgcNUM. This replaces the former `.loc[:, :7] = ...`
# assignment, which relied on a hard-coded label bound and on writing floats
# in place into columns of another dtype.
dfslgcNUM_NORM = pd.DataFrame(
    scaler.fit_transform(dfslgcNUM),
    index=dfslgcNUM.index,
    columns=dfslgcNUM.columns,
)
# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed even if pickling fails
with open(f'./scalers/StatlogGC.sc', 'wb') as slgcScalerFile:
    pickle.dump(scaler, slgcScalerFile)

In [298]:
# Treat Categorical Variables

In [299]:
# Rename and Reorder columns

In [300]:
# Find binary columns among the Statlog categorical features
dL = findBinaryColumns(dfslgcCAT)

11 ['A192' 'A191']
12 ['A201' 'A202']
[11, 12]


In [301]:
# Encode the two binary categorical features (columns 11 and 12) as 0/1
dfslgcCAT[11] = dfslgcCAT[11].map(dict(A192=0, A191=1))
dfslgcCAT[12] = dfslgcCAT[12].map(dict(A201=0, A202=1))

In [302]:
# Re-run the check after the re-encoding: columns 11 and 12 should now only hold 0/1
dL = findBinaryColumns(dfslgcCAT)

11 [0 1]
12 [0 1]
[11, 12]


In [303]:
# Generate OH version of DS: every categorical column that is not binary
# gets one-hot encoded, binary columns are kept as they are
slgcColsToEncode = getNoBinaryOut(dfslgcCAT, dL)
dfslgcCAT_OH = pd.get_dummies(dfslgcCAT, columns=slgcColsToEncode)

In [304]:
# Rename and Reorder columns of the one-hot encoded frame.
# No output was found because it's separated in another variable (dfslgcOUT).
# NOTE(review): renameReorderOH is defined earlier in the notebook; its exact
# column naming scheme is not visible from this cell.
dfslgcCAT_OH = renameReorderOH(dfslgcCAT_OH)



In [305]:
# Shift the numerical column labels so they start right after the last
# categorical label; this avoids repeated column names after concatenation
slgcOffset = dfslgcCAT.shape[1]
dfslgcNUM.columns = list(range(slgcOffset, slgcOffset + dfslgcNUM.shape[1]))
dfslgcNUM_NORM.columns = list(range(slgcOffset, slgcOffset + dfslgcNUM_NORM.shape[1]))

In [306]:
# Create non-treated dataset: categorical + raw numerical + output, side by side
slgcPlain = pd.concat([dfslgcCAT, dfslgcNUM, dfslgcOUT], axis=1)
slgcPlain.to_csv("./data/StatlogGC.csv", index=False)

In [307]:
# Create treated datasets: one-hot only, normalized only, and both combined
for slgcCat, slgcNum, slgcPath in (
    (dfslgcCAT_OH, dfslgcNUM, "./data/OH_StatlogGC.csv"),
    (dfslgcCAT, dfslgcNUM_NORM, "./data/NORM_StatlogGC.csv"),
    (dfslgcCAT_OH, dfslgcNUM_NORM, "./data/OH_NORM_StatlogGC.csv"),
):
    pd.concat([slgcCat, slgcNum, dfslgcOUT], axis=1).to_csv(slgcPath, index=False)