# Lecture 14 Notebook

Data 100, Spring 2023

[Acknowledgments Page](https://ds100.org/sp23/acks/)

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator so NumPy-based randomness is reproducible.
np.random.seed(42)

# Optional: uncomment to enlarge the default matplotlib figure size.
#plt.rcParams['figure.figsize'] = (12, 9)

# Apply seaborn's default theme to subsequent matplotlib plots.
sns.set()
# Optional: uncomment to scale plot elements for presentation ("talk") use.
#sns.set_context('talk')

In [None]:
# Load seaborn's "tips" example dataset as a DataFrame and display it.
tips_df = sns.load_dataset("tips")
tips_df

In [None]:
# Columns used as model inputs in this example.
three_features = ['total_bill', 'size', 'day']
# A fixed handful of rows that later cells reuse for spot-checking.
random_rows = [193, 90, 25, 26, 190]

three_feature_df = pd.DataFrame(tips_df.loc[:, three_features])

# Show just the spot-check rows (selected by position).
three_feature_df.iloc[random_rows]

sklearn OneHotEncoder [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)

In [None]:
from sklearn.preprocessing import OneHotEncoder
# Create an encoder with sklearn's default settings; it is fitted in the next cell.
oh_enc = OneHotEncoder()

In [None]:
# Learn the distinct categories of `day`. The double brackets select a
# one-column DataFrame (2D) rather than a Series (1D).
oh_enc.fit(tips_df[['day']])
oh_enc

In [None]:
# Encode `day` with the fitted encoder; the result is a sparse matrix, not a NumPy array.
dummies = oh_enc.transform(tips_df[['day']])
dummies

Sparse matrices are lightweight solutions to storing matrices with many zero elements.<br/>
(Why would this be useful for one-hot encoded data?)

In [None]:
dummies.toarray().shape # .toarray() converts to a regular (dense) NumPy array; .shape shows its dimensions

<br/><br/>

Let's compare our one-hot encoded features to the original `day` feature.<br/>

In [None]:
# The rows picked earlier for spot-checking.
random_rows

In [None]:
# One-hot encodings of the spot-check rows (densified first so NumPy indexing applies).
dummies.toarray()[random_rows,:]

In [None]:
# The same rows in the original data, to compare `day` against the encodings above.
tips_df.loc[random_rows,:]


Note the days of week are "out of order," because sklearn doesn't "know" that Thursday comes before Friday, etc.

We can use `.get_feature_names_out()` to get sklearn's one-hot encoding order ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder.get_feature_names_out)):

In [None]:
# Names of the encoded output columns, in the order sklearn emits them.
oh_enc.get_feature_names_out()

### Putting it all together

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the `day` categories, then encode the column as a dense array.
oh_enc = OneHotEncoder()
ohe_data = oh_enc.fit(tips_df[['day']]).transform(tips_df[['day']]).toarray()

# Attach the encoded columns (named by sklearn, aligned on tips_df's index)
# to the selected features, then remove the raw `day` column.
data_w_ohe = (
    tips_df[three_features]
    .join(pd.DataFrame(ohe_data, index=tips_df.index, columns=oh_enc.get_feature_names_out()))
    .drop(columns=["day"])  # why do we need to do this before calling fit?
)
data_w_ohe.loc[random_rows]


<br/><br/>

Now fitting the model with one-hot encodings:

In [None]:
from sklearn.linear_model import LinearRegression
# No separate intercept is fitted. NOTE: the one-hot day columns sum to 1 in
# every row, so together they can absorb a constant term.
f_with_day = LinearRegression(fit_intercept=False)
# Features: every column of data_w_ohe; target: the `tip` column.
f_with_day.fit(data_w_ohe, tips_df["tip"])

In [None]:
# Predict the tip for total_bill=50, size=3 on a Friday.
# The model was fitted on a DataFrame, so the query is also a DataFrame with
# the same columns; a bare list makes sklearn warn about missing feature names.
# total_bill, size, day_Fri, day_Sat, day_Sun, day_Thur
f_with_day.predict(pd.DataFrame([[50, 3, 1, 0, 0, 0]], columns=data_w_ohe.columns))

In [None]:
# The model was created with fit_intercept=False, so sklearn reports 0.0 here.
f_with_day.intercept_

In [None]:
# One fitted coefficient per column of data_w_ohe, in column order.
f_with_day.coef_

## High Order Polynomial Example

The code below, which was used to generate the lecture slide plots, uses two out-of-scope concepts:
- The sklearn Pipeline class ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline))
- The sklearn PolynomialFeatures transformer ([documentation](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html))


In [None]:
# Load seaborn's "mpg" dataset, shorten the horsepower column name to "hp",
# and drop every row that has a missing value.
vehicle_data = (
    sns.load_dataset("mpg")
    .rename(columns={"horsepower": "hp"})
    .dropna()
)

In [None]:
# Display the cleaned vehicle data.
vehicle_data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

def get_MSE_for_degree_k_model(k):
    """Return the training MSE of a degree-k polynomial regression of mpg on hp.

    A pipeline of ``PolynomialFeatures(degree=k)`` followed by
    ``LinearRegression`` is fitted on the module-level ``vehicle_data`` and
    then evaluated on the same rows it was fitted on.

    Parameters
    ----------
    k : int
        Degree of the polynomial feature expansion.

    Returns
    -------
    float
        Mean squared error between the observed and the predicted mpg.
    """
    X = vehicle_data[["hp"]]
    y = vehicle_data["mpg"]

    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))
    ])
    pipelined_model.fit(X, y)

    # sklearn metrics take (y_true, y_pred), in that order.
    return mean_squared_error(y, pipelined_model.predict(X))

In [None]:
# Polynomial degrees to evaluate: 0 through 6.
ks = np.arange(7)
# Training MSE of the model fitted at each degree.
MSEs = list(map(get_MSE_for_degree_k_model, ks))
MSEs_and_k = pd.DataFrame({"k": ks, "MSE": MSEs})
# Shown indexed by k; MSEs_and_k itself keeps its default integer index.
MSEs_and_k.set_index("k")

In [None]:
def plot_degree_k_model(k, MSEs_and_k, axs):
    """Draw the degree-k polynomial fit of mpg on hp onto the k-th subplot.

    Fits a ``PolynomialFeatures(degree=k)`` + ``LinearRegression`` pipeline on
    the module-level ``vehicle_data``, scatters the data, overlays the fitted
    curve, and annotates the panel with the model's MSE and order.

    Parameters
    ----------
    k : int
        Polynomial degree; also the row-major position of the panel to draw on.
    MSEs_and_k : pandas.DataFrame
        Table with an ``MSE`` column and the degree either in a ``k`` column
        or as the index.
    axs : array of matplotlib Axes
        Grid of subplots, e.g. as returned by ``fig.subplots``.
    """
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

    # Row-major panel k: for a 2x3 grid this is axs[k // 3, k % 3], without
    # hard-coding the number of columns.
    ax = np.ravel(axs)[k]

    # Look the MSE up by degree instead of relying on the row label happening
    # to equal k; fall back to the index when the table is indexed by k.
    if "k" in MSEs_and_k.columns:
        mse = MSEs_and_k.loc[MSEs_and_k["k"] == k, "MSE"].iloc[0]
    else:
        mse = MSEs_and_k.loc[k, "MSE"]

    sns.scatterplot(data=vehicle_data, x='hp', y='mpg', ax=ax)

    # Predict on an evenly spaced hp grid; the DataFrame carries the column
    # name the pipeline was fitted with.
    x_range = np.linspace(45, 210, 100).reshape(-1, 1)
    ax.plot(x_range, pipelined_model.predict(pd.DataFrame(x_range, columns=['hp'])), c='orange', linewidth=2)

    ax.set_ylim((0, 50))
    mse_str = f"MSE: {mse:.4}\norder: {k}"
    ax.text(150, 40, mse_str, dict(size=16))

# 2x3 grid of panels, one for each polynomial degree 0 through 5.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(12, 6))

for k in range(6):
    plot_degree_k_model(k, MSEs_and_k, axs)

fig.tight_layout()

# No file extension is given, so matplotlib chooses the output format from
# its savefig.format setting.
fig.savefig('higherorder')

In [None]:
import plotly.graph_objects as go


def plot_degree_k_model(k):
    """Return a plotly figure of the degree-k polynomial fit of mpg on hp.

    Fits a ``PolynomialFeatures(degree=k)`` + ``LinearRegression`` pipeline on
    the module-level ``vehicle_data`` and plots the data as markers with the
    fitted curve as a line.

    Note: this reuses the name of the matplotlib-based function defined
    earlier in this notebook and replaces it once this cell has run.

    Parameters
    ----------
    k : int
        Degree of the polynomial feature expansion.

    Returns
    -------
    plotly.graph_objects.Figure
        Figure containing the scatter trace and the fitted-curve trace.
    """
    pipelined_model = Pipeline([
        ('poly_transform', PolynomialFeatures(degree = k)),
        ('regression', LinearRegression(fit_intercept = True))
    ])
    pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

    fig = go.Figure()

    fig.add_trace(go.Scatter(x=vehicle_data['hp'], y = vehicle_data['mpg'],
                        mode = "markers", name = ""))

    x_range = np.linspace(45, 210, 100)
    # The pipeline was fitted on a DataFrame with an "hp" column, so predict
    # from one too; a bare ndarray makes sklearn warn about feature names.
    predictions = pipelined_model.predict(pd.DataFrame({"hp": x_range}))

    fig.add_trace(go.Scatter(x=x_range, y = predictions,
                         mode = "lines", name = ""))

    fig.update_layout(font_size = 20,
                  xaxis_title = "hp",
                  yaxis_title = "mpg",
                  margin=dict(l=50, r=50, b=0, t=1),
                  showlegend = False)
    return fig

In [None]:
# Display the plotly figure for the degree-2 (quadratic) fit.
plot_degree_k_model(2)