In [39]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import OneHotEncoder

In [38]:
train = pd.read_csv("train_for_students.csv")
test = pd.read_csv("test_for_students.csv")

## Pre-processing
Some of our data is categorical feature data. This won't work for our models relying on numerical data.
Therefore we need to encode these string features into some kind of numerical value.

For string data that seems to be ordered, we'll use **[ordinal encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder)** to convey that this information has an order to it.

For string data that doesn't seem to be ordered, we'll use **[one-hot encoding](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder)** so the model doesn't interpret these features as having an order.

**ordered string features: f3, f11**

**unordered string features: f1, f5, f7, f9, f16** (some of these could technically be considered ordered... but I didn't think they were)

In [91]:
# manually doing ordinal encoding for f3, f11
f3_mapping = {"Low":0, "Medium":1, "High":2}
train["f3"] = train["f3"].map(f3_mapping)
# print(train["f3"])

f11_mapping = {"Basic":0, "Standard":1, "Premium":2}
train["f11"] = train["f11"].map(f11_mapping)
# print(train["f11"])


# Implementing one-hot encoding with sklearn for f1, f5, f7, f9, f16
# NOTE: for however many categories there are in a feature (for example, f1 has 3 categories: Bird, Cat, Dog),
#  We'll need to add this amount of columns. For example, f1 gets turned into 3 feature columns after one-hot encoding
one_hot = OneHotEncoder(sparse_output = False)
one_hot_features = ["f1", "f5", "f7", "f9", "f16"]
one_hot.fit(train[one_hot_features])

for feature in one_hot_features:
    one_hot.fit(train[[feature]])
    train[feature] = one_hot.transform(train[[feature]])
    test[feature] = one_hot.transform(test[[feature]])
    
"""
one hot encoding is essentially a collection of mappings from each set of features to a set of one-hot vectors.

we need to make sure we 
"""

# splitting features
X_train = train.drop(columns = ["target"])
y_train = train["target"]

X_test = test.drop(columns=["target"])
y_test = test["target"]

ValueError: Found unknown categories ['Bird', 'Cat', 'Dog'] in column 0 during transform

In [4]:
# Logistic Regression (for binary classification)

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)
log_prob = log_model.predict_proba(X_test)

ValueError: could not convert string to float: 'Cat'