# One-Hot Encoding

In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from seaborn import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

In [20]:
# Load seaborn's 'flights' example dataset; the preview further down shows its
# year, month and passengers columns.
flights = load_dataset('flights')

## Data

In [21]:
# Preview the first rows to confirm the column names and the kind of values
# each column holds
flights.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


## Split
The goal is to predict `passengers` from `year` and `month`.

In [22]:
# Hold out a test split: year and month are the predictors (X), passengers is
# the target (y). random_state is pinned so every run produces the same split.
X_train, X_test, y_train, y_test = train_test_split(
    flights[['year', 'month']],
    flights['passengers'],
    random_state=42,
)

### One-Hot Encode!

In [23]:
# Instantiate the encoder and fit it on the training split only, so the learned
# categories come exclusively from training rows.
# handle_unknown='ignore' makes transform() encode a category that was absent
# from the training split as an all-zero row instead of raising an error, which
# keeps the later test-set transform from failing on unseen values.
ohe = OneHotEncoder(handle_unknown='ignore')
columns_to_encode = ['month']
ohe.fit(X_train[columns_to_encode])

In [24]:
# Apply the fitted encoder to the training column(s); the displayed result is
# a sparse matrix with one column per category
encoded = ohe.transform(X_train.loc[:, columns_to_encode])
encoded

<108x12 sparse matrix of type '<class 'numpy.float64'>'
	with 108 stored elements in Compressed Sparse Row format>

### Inflating

In [25]:
# Inflate the sparse result into a dense 2-D structure so every 0/1 entry is
# visible. The displayed output is a `matrix([...])` object.
# NOTE(review): `encoded.toarray()` would give a plain ndarray instead --
# consider switching if the matrix type is not specifically wanted.
encoded.todense()

matrix([[1., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 1., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 1.],
        [0., 0., 0., ..., 0., 0., 0.]])

### Getting New Feature Names

In [26]:
# List the generated column names; each has the form '<source column>_<category>',
# e.g. 'month_Apr' in the output below
ohe.get_feature_names_out()

array(['month_Apr', 'month_Aug', 'month_Dec', 'month_Feb', 'month_Jan',
       'month_Jul', 'month_Jun', 'month_Mar', 'month_May', 'month_Nov',
       'month_Oct', 'month_Sep'], dtype=object)

### Put into Dataframe

In [27]:
# Wrap the encoded training values in a DataFrame.
# - toarray() produces a plain ndarray; todense() would hand pandas an
#   np.matrix, a class NumPy no longer recommends using.
# - Column labels come from the encoder so each dummy column is named.
# - Reusing X_train's index keeps the rows aligned with X_train when the two
#   frames are concatenated.
new_train_df = pd.DataFrame(encoded.toarray(),
                            columns=ohe.get_feature_names_out(),
                            index=X_train.index)

new_train_df.head()

Unnamed: 0,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
111,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
127,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### All Together
#### Concatenate with X_train and drop the dummied-out column

In [28]:
# Place the dummy columns next to the original training features (rows align
# on the shared index), then remove the raw categorical column(s) they replace.
# Dropping `columns_to_encode` rather than a hard-coded 'month' keeps this cell
# in sync with the list that was used to fit the encoder.
df_train_concat = pd.concat([X_train, new_train_df],
                            axis=1).drop(columns=columns_to_encode)
df_train_concat.head()

Unnamed: 0,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
111,1958,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
42,1952,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
118,1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
127,1959,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,1957,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Model Training

In [29]:
# Fit a linear regression on the encoded training features (year plus the
# month dummy columns) against the training passenger counts
lr = LinearRegression()
lr.fit(df_train_concat, y_train)

In [30]:
# Score the model on the same data it was trained on; for LinearRegression,
# score() reports R^2 (1.0 would be a perfect fit)
lr.score(df_train_concat, y_train)

0.9578728640256422

## Test Set

In [31]:
# Encode the test column(s) with the encoder that was already fitted on the
# training split -- transform only, never refit on test data
test_encoded = ohe.transform(X_test.loc[:, columns_to_encode])

In [32]:
# Wrap the encoded test values in a DataFrame, mirroring the training step.
# - toarray() produces a plain ndarray; todense() would hand pandas an
#   np.matrix, a class NumPy no longer recommends using.
# - Column labels come from the fitted encoder, so they match the training
#   columns exactly.
# - Reusing X_test's index keeps the rows aligned with X_test when the two
#   frames are concatenated.
new_test_df = pd.DataFrame(test_encoded.toarray(),
                           columns=ohe.get_feature_names_out(),
                           index=X_test.index)

new_test_df.head()

Unnamed: 0,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Final Test Prep

In [33]:
# Place the dummy columns next to the original test features (rows align on
# the shared index), then remove the raw categorical column(s) they replace.
# Dropping `columns_to_encode` rather than a hard-coded 'month' keeps this cell
# in sync with the list that was used to fit the encoder.
df_test_concat = pd.concat([X_test, new_test_df],
                           axis=1).drop(columns=columns_to_encode)
df_test_concat.head()

Unnamed: 0,year,month_Apr,month_Aug,month_Dec,month_Feb,month_Jan,month_Jul,month_Jun,month_Mar,month_May,month_Nov,month_Oct,month_Sep
117,1958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
19,1950,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82,1955,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
97,1957,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
56,1953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## Model Score on Test

In [34]:
# R^2 on the held-out test split; compare it with the training score to gauge
# how well the model generalizes to rows it has not seen
lr.score(df_test_concat, y_test)

0.9352318155740923