In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing

In [2]:
# Load the raw training and evaluation (test) sets from their CSV files
training = pd.read_csv(filepath_or_buffer="./dat/train.csv")
evaluation = pd.read_csv(filepath_or_buffer="./dat/test.csv")

In [3]:
# Categorical feature columns are named cat1..cat116. Match on the name prefix
# rather than on a substring, so that an unrelated column whose name merely
# contains "cat" (e.g. "location") is never picked up as a categorical feature.
catCols = [col for col in training.columns if col.startswith("cat")]

# Continuous feature columns are named cont1..cont14; the same prefix rule
# applies (a substring test would also match names such as "discontinued").
contCols = [col for col in training.columns if col.startswith("cont")]

In [4]:
# Build one fitted LabelEncoder per categorical column, keyed by column name.
# Each encoder is fitted on the training and evaluation values of that column
# stacked together, so labels from both frames are seen during fitting.
leDict = {
    name: preprocessing.LabelEncoder().fit(
        pd.concat([training[name], evaluation[name]])
    )
    for name in catCols
}

In [5]:
# Replace every categorical column with its label codes, using the encoder
# fitted for that column (looked up in leDict by the column's name).
training[catCols] = training[catCols].apply(lambda col: leDict[col.name].transform(col))
evaluation[catCols] = evaluation[catCols].apply(lambda col: leDict[col.name].transform(col))

# Snapshot the encoded-but-unscaled frames. These must be real copies: a plain
# assignment would only bind a second name to the same DataFrame, and the
# in-place scaling below would then overwrite the "non-scaled" data as well.
training_noscale = training.copy()
evaluation_noscale = evaluation.copy()

# Every feature column (encoded categoricals plus continuous) gets rescaled.
featureCols = catCols + contCols
min_max_scaler = preprocessing.MinMaxScaler()

# Fit the scaler once on the training and evaluation rows stacked together so
# both frames are transformed with the same per-column minimum and maximum.
min_max_scaler.fit(pd.concat([training[featureCols], evaluation[featureCols]]))

# Overwrite the feature columns in place with their scaled values; the
# *_noscale copies taken above are unaffected.
training[featureCols] = min_max_scaler.transform(training[featureCols])
evaluation[featureCols] = min_max_scaler.transform(evaluation[featureCols])

In [7]:
# Write the four prepared frames to CSV, leaving the DataFrame index out of
# each file.
# NOTE(review): "evalutation" in the last filename looks like a typo for
# "evaluation"; it is kept unchanged here because other code may read the file
# under this exact name -- confirm before renaming.
for frame, csv_path in (
    (training_noscale, "./dat/training_encoded.csv"),
    (evaluation_noscale, "./dat/evaluation_encoded.csv"),
    (training, "./dat/training_encoded_scaled.csv"),
    (evaluation, "./dat/evalutation_encoded_scaled.csv"),
):
    frame.to_csv(csv_path, index=False)