# Regression Model

Let's create a simple linear regression model.

In [None]:
# handling data
import numpy as np
import pandas as pd

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# data visualization
import seaborn as sns
import matplotlib.pyplot as plt

We will use a subset of the Chicago taxi dataset. This subset consists of 31,694 trips that took place in May 2022.

In [None]:
# Load the May 2022 Chicago taxi trips subset; the first CSV column is the index.
chicago_taxi_dataset = pd.read_csv("datasets/chicago_taxi_trips_may_2022.csv", index_col=0)

# Separate the features (X) from the label (y).
# Only three variables are kept to keep the example simple:
# two continuous ones and one categorical one to make it interesting.
raw_feature_columns = ['TRIP_MILES', 'TRIP_SECONDS', 'PAYMENT_TYPE']
X = chicago_taxi_dataset[raw_feature_columns].copy()  # DataFrame
y = chicago_taxi_dataset['FARE'].copy()  # Series

# Feature engineering I: seconds to minutes.
# The trip duration in seconds has a much bigger mean value than the trip
# miles and the fare, so it is converted to minutes to keep the numbers
# on a closer scale and help the model.
X['TRIP_MINUTES'] = X['TRIP_SECONDS'].div(60)

# Show a summary of the features:
#   count  = number of non-null values
#   unique = number of distinct values (categorical variables)
#   top    = most frequent value of a categorical variable (mode)
#   freq   = frequency of the top value
X.describe(include='all')

We will do the following:
- an 80-20 train/test split.
- one-hot encoding of the categorical variable.

In [None]:
# Train/test split (80/20).
# NOTE: the split has to be done before encoding, so that the encoder is
# fitted on the training rows only and no information leaks from the test set.
model_features = ['TRIP_MILES', 'TRIP_MINUTES', 'PAYMENT_TYPE']
X_train, X_test, y_train, y_test = train_test_split(
    X[model_features],
    y,
    test_size=0.2,
    random_state=100,
)

# Feature engineering II: one-hot encoding of the payment type.
# Categories with fewer than 2000 events are grouped together; this also
# absorbs the "unknown" category, which has 1,206 events. The grouped
# category gets the name 'PAYMENT_TYPE_infrequent_sklearn'.
# NOTE: handle_unknown avoids errors if new categories appear in the test split.
encoder = OneHotEncoder(
    sparse_output=False,  # return a dense NumPy array instead of a sparse matrix
    drop='first',  # drop one category to avoid collinearity
    min_frequency=2000,  # group all categories with fewer than n events
    handle_unknown='infrequent_if_exist',  # unseen categories go to the infrequent group
)

# ColumnTransformer applies the encoder to the categorical column only.
categorical_columns = ['PAYMENT_TYPE']
transformer = ColumnTransformer(
    transformers=[
        ('onehot', encoder, categorical_columns),  # (name, transformer, columns)
    ],
    remainder='passthrough',  # leave the other columns untouched
)

# Fit on the training set only, then reuse the fitted transformer on the test set.
X_train_encoded = transformer.fit_transform(X_train)
X_test_encoded = transformer.transform(X_test)

Training is simple. We will evaluate the model based on its predictions on the test set.

In [None]:
# Training
# Once the categorical variable is one-hot encoded, sklearn's LinearRegression
# does not need to be told which columns are categorical.
model = LinearRegression().fit(X_train_encoded, y_train)  # define and train; fit() returns the estimator
y_pred = model.predict(X_test_encoded)  # predictions of the fitted model on the test set

# Results generator function
def evaluate_model(y_true, y_pred):
    """Compute standard regression metrics for a set of predictions.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_true``.

    Returns:
        dict with the keys 'MSE' (mean squared error), 'RMSE' (root mean
        squared error), 'MAE' (mean absolute error) and 'R²' (coefficient
        of determination).
    """
    mse = mean_squared_error(y_true, y_pred)  # computed once, reused for RMSE
    return {
        'MSE': mse,
        'RMSE': np.sqrt(mse),  # root mean squared error
        'MAE': mean_absolute_error(y_true, y_pred),
        # Score against the y_true argument rather than the notebook-level
        # y_test, so the function is correct for any split passed in.
        'R²': r2_score(y_true, y_pred),
    }

# Results
# Metrics are computed on the held-out test split (y_test vs. y_pred).
# The trailing bare expression makes the notebook display the metrics dict.
training_results = evaluate_model(y_test, y_pred)
training_results

Our model gets an R-squared value of about 0.95 (95%), which indicates excellent predictive power.
Now for some visualization.

In [None]:
# visualization I: prediction vs. objective (true value)
# Predictions are good when the dots lie close to the dashed diagonal,
# which marks the points where prediction == true value.
true_min, true_max = y_test.min(), y_test.max()
plt.scatter(y_test, y_pred, alpha=0.5)  # dots
plt.plot([true_min, true_max], [true_min, true_max], 'r--')  # diagonal line
plt.xlabel('objective (true value)')
plt.ylabel('prediction')
# The axes show prediction against the objective, so the title says so
# ("real" and "objective" both mean the true value).
plt.title('prediction vs. objective')
plt.show()

# visualization II: residuals
# Predictions are good when the dots form a horizontal band around the
# dashed zero line, with no visible trend.
residuals = y_test - y_pred  # residual = true value - prediction
plt.scatter(y_pred, residuals, alpha=0.5)  # dots
plt.axhline(y=0, color='r', linestyle='--')  # horizontal zero line
plt.xlabel('prediction')
plt.ylabel('residuals')
plt.title('residuals')
plt.show()

# Conclusion and Notes

Even though linear regression is the simplest model,
you need to know some of its characteristics to get good predictions.
For example, we need good feature engineering that is adapted to the model.
Finally, we need to understand the evaluation metrics in order to draw conclusions.
<br>
**NOTES**: For future models, I recommend comparing different configurations, for example with a grid search.