# Intermediate Regression Notebook

### DISCLAIMER:

<strong>This notebook is part of a Capstone Project. Some analysis and preprocessing steps, as well as scaling, encoding, transformation, and other data-pipeline steps, are intentionally left out to demonstrate the difference between simplistic regression models and the more advanced regression models included in other notebooks within this repository. This notebook intentionally does not adhere to best practices.</strong>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import calendar
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
def plot_results(y_true=None, y_pred=None):
    """Plot predictions against ground truth, coloured by absolute error.

    Only rows whose ground truth lies strictly between 0 and 150 are kept,
    and the axes are fixed to x in [0, 100] and y in [0, 120].

    Args:
        y_true: Ground-truth target values. When omitted, the notebook-level
            ``Y_test`` is used.
        y_pred: Predictions aligned position-by-position with ``y_true``.
            When omitted, the notebook-level ``y_test_pred`` is used.
    """
    # Fall back to the notebook globals so bare ``plot_results()`` calls
    # keep working exactly as before.
    if y_true is None:
        y_true = Y_test
    if y_pred is None:
        y_pred = y_test_pred

    # Plain arrays make the subtraction positional rather than index-aligned.
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    result_df = pd.DataFrame({
        "Ground Truth": truth,
        "Prediction": predicted,
        "Absolute Error": np.abs(truth - predicted),
    })
    in_range = (result_df["Ground Truth"] > 0) & (result_df["Ground Truth"] < 150)
    result_df = result_df[in_range]

    # The title was previously assigned but never handed to the figure.
    title = "Regression Prediction Results compared to Ground Truth"
    fig = px.scatter(result_df, x="Ground Truth", y="Prediction",
                     color="Absolute Error", title=title,
                     width=1500, height=600)

    axis_style = dict(title_font=dict(size=35, color='black'), zeroline=False)
    fig.update_xaxes(range=[0, 100], **axis_style)
    fig.update_yaxes(range=[0, 120], **axis_style)

    fig.show()
    
def data_types(df):
    """Return a one-column frame ("count") with the number of columns per dtype."""
    dtype_counts = df.dtypes.value_counts()
    return pd.DataFrame(dtype_counts, columns=["count"])

def check_num_OHC(df):
    """Count how many columns one-hot encoding would add to ``df``.

    Every object-dtype column with at least two distinct values contributes
    its number of distinct values minus one; the contributions are summed.
    """
    object_cols = df.columns[df.dtypes == object]
    distinct_per_col = df[object_cols].apply(lambda col: col.nunique())

    # A column holding a single category is irrelevant for encoding.
    encodable = distinct_per_col[distinct_per_col >= 2]

    return (encodable - 1).sum()

def create_OHC_dataframe(df):
    """Return a copy of ``df`` with every object-dtype column one-hot encoded.

    Each object column ``col`` is dropped and replaced by indicator columns
    named ``col_0``, ``col_1``, ... (one per encoder output column). Columns
    are processed from most to fewest distinct values, and the new columns
    are appended on the right. The input frame itself is not modified.

    Args:
        df: DataFrame whose object-dtype columns should be encoded.

    Returns:
        A new DataFrame with the original non-object columns followed by the
        generated indicator columns.
    """
    # Work on a copy so the caller's frame stays untouched
    df_OHC = df.copy()

    # Encoder objects are re-fitted for every column inside the loop
    le = LabelEncoder()
    ohc = OneHotEncoder()

    # Object columns, ordered by number of distinct values (descending)
    categorical_cols = df.columns[df.dtypes == object]
    num_OHC_cols = (df[categorical_cols]
                    .apply(lambda x: x.nunique())
                    .sort_values(ascending=False))

    for col in num_OHC_cols.index:

        # Integer encode the string categories. The ``np.int`` alias was
        # removed in NumPy 1.24; the builtin ``int`` is its exact equivalent.
        encoded_cols = le.fit_transform(df_OHC[col]).astype(int)

        # Remove the original column from the dataframe
        df_OHC = df_OHC.drop(col, axis=1)

        # One hot encode the data--this returns a sparse array
        OHC_cols = ohc.fit_transform(encoded_cols.reshape(-1, 1))

        # Create unique column names: <col>_<position>
        num_cols = OHC_cols.shape[1]
        col_names = ['_'.join([col, str(x)]) for x in range(num_cols)]

        # Densify and keep the original row index so concat lines rows up
        new_df = pd.DataFrame(OHC_cols.toarray(),
                              index=df_OHC.index,
                              columns=col_names)

        # Append the new data to the dataframe
        df_OHC = pd.concat([df_OHC, new_df], axis=1)

    return df_OHC

In [None]:
# Load the preprocessed data and discard the leftover index column together
# with the columns that are not relevant for prediction
df = pd.read_csv("uber_preprocessed.csv")
df = df.drop(columns=["Unnamed: 0", "date", "time", "minute", "second"])

# Turn the calendar fields into strings / names so they are treated as
# categorical (object) columns further down
df["year"] = df["year"].map(str)
df["month"] = df["month"].map(lambda m: calendar.month_name[m])
df["day_of_week"] = df["day_of_week"].map(lambda d: calendar.day_name[d])
df["day"] = df["day"].map(str)
df["hour"] = df["hour"].map(str)

In [None]:
# Preview the first rows after the calendar fields were converted
df.head()

In [None]:
# Show how many columns there are per dtype
data_types(df)

In [None]:
# Report the column growth to expect from one-hot encoding. An f-string is
# used because comma-separated print arguments doubled the spaces around
# the number.
print(f"One-Hot_Encoding would add {check_num_OHC(df)} Columns to the Dataframe")

One Hot Encoding is used for the following columns:
- year
- month
- day
- hour
- day of week

In [None]:
# Replace every object column with its one-hot encoded indicator columns
df = create_OHC_dataframe(df)

In [None]:
# Preview the first rows of the encoded frame
df.head()

In [None]:
# Everything except the target column is used as a feature
target = "fare_amount"
features = [name for name in df.columns if name != target]

X, Y = df[features], df[target]

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

## Polynomial Features & Scaling

In [None]:
# Scaler used as the first step of the pipeline built below
scaler = StandardScaler()
# Alternative scaler, kept for experimentation:
#scaler = MinMaxScaler()

In [None]:
# Linear regression estimator used as the final step of the pipeline
LR = LinearRegression()

In [None]:
# Chain scaling, degree-2 polynomial expansion and the linear model
best_estimator = Pipeline(
    steps=[
        ("scaler", scaler),
        ("make_higher_degree", PolynomialFeatures(degree=2)),
        ("Linear Regression", LR),
    ]
)

In [None]:
# Fit the whole pipeline on the training split
model = best_estimator.fit(X_train,Y_train)

In [None]:
# Predict on both splits so train and test error can be compared
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# Mean squared error on the train and test split, one labelled row per model
error_df = [
    pd.Series(
        {
            'train': mean_squared_error(Y_train, y_train_pred),
            'test': mean_squared_error(Y_test, y_test_pred),
        },
        name='Intermediate Regression',
    )
]
pd.DataFrame(error_df)

In [None]:
# Visualise the test-set predictions against the ground truth
plot_results()