# Simple Regression Notebook

### DISCLAIMER:

<strong>This notebook is part of a Capstone Project. The analysis and preprocessing steps, as well as scaling, encoding, transformation, and other data pipeline steps, are intentionally left out to demonstrate the difference between simplistic regression models and the more advanced regression models included in other notebooks within this repository. This notebook intentionally does not adhere to best practices.</strong>

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
def plot_results(y_true=None, y_pred=None):
    """Scatter-plot predictions against ground truth, colored by absolute error.

    Parameters
    ----------
    y_true : array-like, optional
        Observed target values. Defaults to the notebook-level ``Y_test``.
    y_pred : array-like, optional
        Predicted target values, aligned element-wise with ``y_true``.
        Defaults to the notebook-level ``y_test_pred``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Notes
    -----
    Only rows whose ground truth lies strictly between 0 and 150 are kept.
    The axes are then fixed to x in [0, 100] and y in [0, 120], so kept
    points beyond those limits are outside the initial view.
    """
    # Fall back to the notebook globals so existing ``plot_results()`` calls
    # keep working unchanged.
    if y_true is None:
        y_true = Y_test
    if y_pred is None:
        y_pred = y_test_pred

    truth = np.asarray(y_true)
    prediction = np.asarray(y_pred)
    result_df = pd.DataFrame({
        "Ground Truth": truth,
        "Prediction": prediction,
        "Absolute Error": np.abs(truth - prediction),
    })

    in_range = (result_df["Ground Truth"] > 0) & (result_df["Ground Truth"] < 150)
    result_df = result_df[in_range]

    # The title was previously assigned to a local and never used; pass it to
    # the figure so it is actually rendered.
    title = "Regression Prediction Results compared to Ground Truth"

    # Axis titles come from the column names, so no ``labels`` mapping is needed.
    fig = px.scatter(
        result_df,
        x="Ground Truth",
        y="Prediction",
        color="Absolute Error",
        title=title,
        width=1500,
        height=600,
    )

    axis_title_font = dict(size=35, color='black')
    fig.update_xaxes(title_font=axis_title_font, zeroline=False, range=[0, 100])
    fig.update_yaxes(title_font=axis_title_font, zeroline=False, range=[0, 120])

    fig.show()

In [None]:
# Load the preprocessed Uber data and discard the "Unnamed: 0" column
# in the same expression.
df = pd.read_csv("uber_preprocessed.csv").drop(columns="Unnamed: 0")

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

In [None]:
# Remove the columns that are not relevant for prediction.
df = df.drop(columns=["date", "time", "minute", "second"])

In [None]:
# The fare is the value to predict; every remaining column is a predictor.
target = "fare_amount"
features = [name for name in df.columns if name != target]

# Split the frame into the feature matrix and the target vector.
X, Y = df[features], df[target]

In [None]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Create the linear regression estimator; no arguments are passed, so it
# uses scikit-learn's default settings.
LR = LinearRegression()

In [None]:
# Fit the estimator on the training split. scikit-learn's fit() returns the
# estimator itself, so `model` and `LR` refer to the same fitted object.
model = LR.fit(X_train, Y_train)

In [None]:
# Predict on both splits so train and test error can be compared later.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

In [None]:
def error_dataframe(model_name="Simple Regression"):
    """Return a one-row DataFrame with the train and test mean squared error.

    The original definition had no body, which is a syntax error when the
    cell is run. This implementation reads ``Y_train``, ``y_train_pred``,
    ``Y_test`` and ``y_test_pred`` from the notebook namespace at call time.

    Parameters
    ----------
    model_name : str, optional
        Row label for the returned frame.

    Returns
    -------
    pandas.DataFrame
        A single row indexed by ``model_name`` with columns ``train`` and
        ``test``.
    """
    scores = pd.Series(
        {
            'train': mean_squared_error(Y_train, y_train_pred),
            'test': mean_squared_error(Y_test, y_test_pred),
        },
        name=model_name,
    )
    return pd.DataFrame([scores])

In [None]:
# Collect the train/test mean squared error for this model as one labelled
# Series inside a list.
error_df = [
    pd.Series(
        {
            'train': mean_squared_error(Y_train, y_train_pred),
            'test': mean_squared_error(Y_test, y_test_pred),
        },
        name='Simple Regression',
    ),
]

In [None]:
# Turn the list of error Series into a table with one row per Series; as the
# last expression in the cell it is rendered as the cell output.
pd.DataFrame(error_df)

In [None]:
# Draw the prediction-vs-ground-truth scatter; called without arguments,
# plot_results uses Y_test and y_test_pred from the notebook namespace.
plot_results()