In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Import the data
df = pd.read_csv('https://static.bc-edx.com/ai/ail-v-1-0/m14/lesson_2/datasets/text-data.csv')
df.head()

Unnamed: 0,backpack_color,grade,favorite_creature,arrived
0,blue,B,elf,on time
1,blue,C,griffin,late
2,blue,D,dragon,late
3,yellow,B,griffin,on time
4,blue,F,griffin,late


In [3]:
# Create X and y and split into training and testing sets
X = df.drop(columns='arrived')
y = df['arrived']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

In [4]:
# Decide how to encode the backpack_color column
X_train['backpack_color'].value_counts()

blue      37
yellow    21
red       17
Name: backpack_color, dtype: int64

In [5]:
# Create an encoder for the backpack_color column
backpack_color_ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Train the encoder
backpack_color_ohe.fit(X_train['backpack_color'].values.reshape(-1,1))


In [6]:
# Decide how to encode the grade column
df['grade'].value_counts()



F    22
C    21
B    20
A    19
D    18
Name: grade, dtype: int64

In [7]:
# Create an encoder for the backpack_color column
grade_ord_enc = OrdinalEncoder(categories = [['F', 'D', 'C', 'B', 'A']], encoded_missing_value=-1, handle_unknown='use_encoded_value', unknown_value=-1)

# Train the encoder
grade_ord_enc.fit(X_train['grade'].values.reshape(-1,1))

In [8]:
# Decide how to encode the favorite_creature column
df['favorite_creature'].value_counts()

griffin              38
dragon               38
basilisk              4
vampire               4
fairy                 3
moth man              3
elf                   2
sphinx                2
loch ness monster     2
chupacabra            2
jackalope             1
troll                 1
Name: favorite_creature, dtype: int64

In [9]:
# Create an encoder for the backpack_color column
creature_ohe = OneHotEncoder(handle_unknown='infrequent_if_exist', sparse_output=False, min_frequency=0.2)

# Train the encoder
creature_ohe.fit(X_train['favorite_creature'].values.reshape(-1,1))

In [10]:
# Create a function using the pretrained encoders to use on
# any new data (including the testing data)

def X_preprocess(X_data):
    # Transform each column into numpy arrays
    backpack_color_encoded = backpack_color_ohe.transform(X_data['backpack_color'].values.reshape(-1,1))
    grade_encoded = grade_ord_enc.transform(X_data['grade'].values.reshape(-1,1))
    favorite_creature_encoded = creature_ohe.transform(X_data['favorite_creature'].values.reshape(-1,1))

    # Reorganize the numpy arrays into a DataFrame
    backpack_color_df = pd.DataFrame(backpack_color_encoded, columns = backpack_color_ohe.get_feature_names_out())
    creature_df = pd.DataFrame(favorite_creature_encoded, columns= creature_ohe.get_feature_names_out())
    out_df = pd.concat([backpack_color_df, creature_df], axis = 1)
    out_df['grade'] = grade_encoded

    # Return the DataFrame
    return out_df


In [11]:
# Preprocess the training data
X_preprocess(X_train)

Unnamed: 0,x0_red,x0_yellow,x0_dragon,x0_griffin,x0_infrequent_sklearn,grade
0,1.0,0.0,0.0,1.0,0.0,3.0
1,0.0,0.0,1.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,1.0
3,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,2.0
...,...,...,...,...,...,...
70,0.0,1.0,1.0,0.0,0.0,3.0
71,0.0,0.0,0.0,0.0,1.0,4.0
72,0.0,1.0,1.0,0.0,0.0,2.0
73,0.0,1.0,1.0,0.0,0.0,2.0


In [12]:
# Preprocess the testing data
X_preprocess(X_test)

Unnamed: 0,x0_red,x0_yellow,x0_dragon,x0_griffin,x0_infrequent_sklearn,grade
0,0.0,1.0,1.0,0.0,0.0,3.0
1,1.0,0.0,0.0,1.0,0.0,3.0
2,1.0,0.0,0.0,0.0,1.0,1.0
3,1.0,0.0,1.0,0.0,0.0,4.0
4,1.0,0.0,0.0,0.0,1.0,1.0
5,0.0,1.0,1.0,0.0,0.0,1.0
6,0.0,0.0,0.0,0.0,1.0,4.0
7,0.0,1.0,0.0,0.0,1.0,2.0
8,0.0,0.0,0.0,1.0,0.0,4.0
9,0.0,1.0,1.0,0.0,0.0,1.0
