In [39]:
import pandas as pd
import numpy as np


from sklearn.preprocessing import OneHotEncoder

## Pre-processing
Some of our data is categorical feature data. This won't work for our models relying on numerical data.
Therefore we need to encode these string features into some kind of numerical value.

For string data that seems to be ordered, we'll use **[ordinal encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder)** to convey that this information has an order to it.

For string data that doesn't seem to be ordered, we'll use **[one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)** so the model doesn't interpret these features as having an order.

* Note that OneHotEncoder() by default sorts String categorical features alphabetically

**ordered string features: f3, f11**

**unordered string features: f1, f5, f7, f9, f16** (some of these could technically be considered ordered... but I didn't think they were)

In [138]:
train = pd.read_csv("train_for_students.csv")
test = pd.read_csv("test_for_students.csv")


# manually doing ordinal encoding for f3, f11
f3_mapping = {"Low":0, "Medium":1, "High":2}
train["f3"] = train["f3"].map(f3_mapping)
# print(train["f3"])

f11_mapping = {"Basic":0, "Standard":1, "Premium":2}
train["f11"] = train["f11"].map(f11_mapping)
# print(train["f11"])


# Implementing one-hot encoding with sklearn for f1, f5, f7, f9, f16
# NOTE: for however many categories there are in a feature (for example, f1 has 3 categories: Bird, Cat, Dog),
#  We'll need to add this amount of columns. For example, f1 gets turned into 3 feature columns after one-hot encoding
one_hot = OneHotEncoder(sparse_output = False)
    
# print(train["f1"])


# encoding f1
one_hot.fit(train[["f1"]])
f1_encode_train = one_hot.transform(train[["f1"]])
f1_encode_test = one_hot.transform(test[["f1"]])
# f1_Bird_col = [f1_encode[i][0] for i in range(len(f1_encode))]
# print(f1_Bird_col)
train.insert(1, "f1_Bird", [f1_encode_train[i][0] for i in range(len(f1_encode_train))])
train.insert(2, "f1_Cat", [f1_encode_train[i][1] for i in range(len(f1_encode_train))])
train.insert(3, "f1_Dog", [f1_encode_train[i][2] for i in range(len(f1_encode_train))])
test.insert(1, "f1_Bird", [f1_encode_test[i][0] for i in range(len(f1_encode_test))])
test.insert(2, "f1_Cat", [f1_encode_test[i][1] for i in range(len(f1_encode_test))])
test.insert(3, "f1_Dog", [f1_encode_test[i][2] for i in range(len(f1_encode_test))])

# encoding f5
# train["f5"] is now the eighth column
one_hot.fit(train[["f5"]])
f5_encode_train = one_hot.transform(train[["f5"]])
f5_encode_test = one_hot.transform(test[["f5"]])

train.insert(8, "f5_Blue", [f5_encode_train[i][0] for i in range(len(f5_encode_train))])
train.insert(9, "f5_Green", [f5_encode_train[i][1] for i in range(len(f5_encode_train))])
train.insert(10, "f5_Red", [f5_encode_train[i][2] for i in range(len(f5_encode_train))])    
test.insert(8, "f5_Blue", [f5_encode_test[i][0] for i in range(len(f5_encode_test))])
test.insert(9, "f5_Green", [f5_encode_test[i][1] for i in range(len(f5_encode_test))])
test.insert(10, "f5_Red", [f5_encode_test[i][2] for i in range(len(f5_encode_test))])    

# encoding f7
# train["f7"] is now the 13th column
one_hot.fit(train[["f7"]]) 
f7_encode_train = one_hot.transform(train[["f7"]])
f7_encode_test = one_hot.transform(test[["f7"]])
train.insert(13, "f7_Maybe", [f7_encode_train[i][0] for i in range(len(f7_encode_train))])
train.insert(14, "f7_No", [f7_encode_train[i][1] for i in range(len(f7_encode_train))])
train.insert(15, "f7_Yes", [f7_encode_train[i][2] for i in range(len(f7_encode_train))])
test.insert(13, "f7_Maybe", [f7_encode_test[i][0] for i in range(len(f7_encode_test))])
test.insert(14, "f7_No", [f7_encode_test[i][1] for i in range(len(f7_encode_test))])
test.insert(15, "f7_Yes", [f7_encode_test[i][2] for i in range(len(f7_encode_test))])

# encoding f9
# train["f9"] is now the 17th column
one_hot.fit(train[["f9"]])
f9_encode_train = one_hot.transform(train[["f9"]])
f9_encode_test = one_hot.transform(test[["f9"]])
train.insert(17, "f9_X", [f9_encode_train[i][0] for i in range(len(f9_encode_train))])
train.insert(18, "f9_Y", [f9_encode_train[i][1] for i in range(len(f9_encode_train))])
train.insert(19, "f9_Z", [f9_encode_train[i][2] for i in range(len(f9_encode_train))])
test.insert(17, "f9_X", [f9_encode_test[i][0] for i in range(len(f9_encode_test))])
test.insert(18, "f9_Y", [f9_encode_test[i][1] for i in range(len(f9_encode_test))])
test.insert(19, "f9_Z", [f9_encode_test[i][2] for i in range(len(f9_encode_test))])

# encoding f16
# train["f16"] is now the 21st column
one_hot.fit(train[["f16"]])
f16_encode_train = one_hot.transform(train[["f16"]])
f16_encode_test = one_hot.transform(test[["f16"]])
train.insert(21, "f16_A", [f16_encode_train[i][0] for i in range(len(f16_encode_train))])
train.insert(22, "f16_B", [f16_encode_train[i][1] for i in range(len(f16_encode_train))])
train.insert(23, "f16_C", [f16_encode_train[i][2] for i in range(len(f16_encode_train))])
test.insert(21, "f16_A", [f16_encode_test[i][0] for i in range(len(f16_encode_test))])
test.insert(22, "f16_B", [f16_encode_test[i][1] for i in range(len(f16_encode_test))])
test.insert(23, "f16_C", [f16_encode_test[i][2] for i in range(len(f16_encode_test))])


# saving encoded train and test data to new csv's

train.to_csv("enc_train", index=False)
test.to_csv("enc_test", index=False)


# print(train)


ValueError: could not convert string to float: 'Cat'