In [1]:
import os

import numpy as np
import pandas as pd
import plotly as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Display configuration: None removes pandas' row/column display limits.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

### Functions

In [2]:
# Read data
def read_data(path:str, filename:str)-> pd.DataFrame:
    """Load the parquet file ``<filename>.parquet`` located in ``path``.

    Parameters
    ----------
    path : str
        Directory containing the file. A trailing separator is optional.
    filename : str
        File name without the ``.parquet`` extension.

    Returns
    -------
    pd.DataFrame
        The result of ``pd.read_parquet`` for the resolved location.
    """
    # os.path.join inserts a separator only when `path` lacks one, so callers
    # that already pass a trailing '/' resolve to exactly the same file.
    return pd.read_parquet(os.path.join(path, filename + '.parquet'))

# return columns_count
def return_columns_count(data:pd.DataFrame, month_name:str)->None:
    """Print the number of columns in ``data``, labelled with ``month_name``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose column count is reported.
    month_name : str
        Label interpolated into the printed message.

    Returns
    -------
    None
        The count is only printed, never returned.
    """
    n_columns = data.shape[1]
    print(f'No of columns in {month_name} data are ',n_columns)

def compute_duration(data:pd.DataFrame, from_datetime:str, to_datetime:str)->pd.DataFrame:
    """Add a ``duration`` column: minutes elapsed from ``from_datetime`` to ``to_datetime``.

    The column is assigned onto the frame that was passed in, so ``data`` is
    modified in place; the same object is returned for convenience.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding both timestamp columns.
    from_datetime : str
        Name of the column with the start timestamps.
    to_datetime : str
        Name of the column with the end timestamps.

    Returns
    -------
    pd.DataFrame
        ``data`` itself, now carrying the ``duration`` column.
    """
    elapsed = data[to_datetime] - data[from_datetime]
    one_minute = pd.Timedelta(minutes=1)
    # Dividing by a one-minute Timedelta expresses the difference in minutes.
    data['duration'] = elapsed / one_minute
    return data

def calculate_std_dev(data:pd.DataFrame, col_name:str)->None:
    """Print the standard deviation of ``data[col_name]``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame containing the column.
    col_name : str
        Column whose ``.std()`` is reported.

    Returns
    -------
    None
        The value is only printed, never returned.
    """
    std_dev = data[col_name].std()
    print(f'Standard deviation for column {col_name} is',std_dev)
    
def execute_question_2(data, from_datetime, to_datetime, col_name):
    """Derive the ``duration`` column, then print the standard deviation of ``col_name``.

    ``compute_duration`` assigns ``duration`` onto ``data`` itself, so the
    caller's frame carries that column once this function has run.
    Nothing is returned.
    """
    with_duration = compute_duration(data, from_datetime, to_datetime)
    calculate_std_dev(with_duration, col_name)

def remove_outliers(data:pd.DataFrame, col_name:str, lower_limit:int, upper_limit:int)->pd.DataFrame:
    """Keep only the rows whose ``col_name`` value lies in ``[lower_limit, upper_limit]``.

    Prints the row count before filtering, the row count after filtering, and
    the percentage of rows retained.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; it is not modified.
    col_name : str
        Column the bounds are applied to.
    lower_limit, upper_limit : int
        Inclusive bounds.

    Returns
    -------
    pd.DataFrame
        The filtered rows with a fresh 0..n-1 index and the same columns as ``data``.
    """
    records_before = data.shape[0]
    print(records_before)
    kept = data[(data[col_name]>=lower_limit) & (data[col_name]<=upper_limit)]
    # drop=True discards the old index rather than inserting it as an extra
    # 'index' column; without it every call adds another column and a repeated
    # call eventually fails because the inserted name already exists.
    kept = kept.reset_index(drop=True)
    records_after = kept.shape[0]
    print(records_after)
    # An empty input has no meaningful percentage; report NaN instead of dividing by zero.
    if records_before:
        retained_pct = ((records_after)/(records_before))*100
    else:
        retained_pct = float('nan')
    print('Percentage of records after removing outliers: ',retained_pct)
    return kept

def create_feature_matrix(df: pd.DataFrame, col_1: str, col_2: str):
    """Encode two columns of ``df`` with a newly fitted ``DictVectorizer``.

    Both columns are cast to ``str`` and turned into one dict per row before
    being passed to the vectorizer. The number of columns of the resulting
    matrix is printed.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; it is not modified.
    col_1, col_2 : str
        Names of the two columns to encode.

    Returns
    -------
    tuple
        ``(feature_matrix, feature_names, dict_vectorizer)``: the transformed
        matrix, the vectorizer's output feature names, and the fitted
        vectorizer so the same encoding can be applied to other data.
    """
    # Column selection and astype() each produce a new object, so `df` is never
    # touched and no defensive copy of the entire frame is required.
    records = df[[col_1, col_2]].astype(str).to_dict(orient='records')

    dict_vectorizer = DictVectorizer()
    feature_matrix = dict_vectorizer.fit_transform(records)
    feature_names = dict_vectorizer.get_feature_names_out()

    # Number of columns of the encoded matrix.
    dimensionality = feature_matrix.shape[1]
    print(f"\nDimensionality of the Feature Matrix: {dimensionality}")

    return feature_matrix, feature_names, dict_vectorizer

In [11]:
## Load Data
# Directory holding the monthly trip parquet files. Set the TAXI_DATA_DIR
# environment variable to read from another location; the default is the
# path this notebook was originally run with.
# Joining with '' guarantees the trailing separator that read_data is given today.
DATA_DIR = os.path.join(
    os.environ.get('TAXI_DATA_DIR', '/home/deepak/Documents/mlops_zoomcamp/mlops-zoomcamp/data/'),
    '',
)
jan_data = read_data(DATA_DIR, 'yellow_tripdata_2023-01')
feb_data = read_data(DATA_DIR, 'yellow_tripdata_2023-02')

### Questions

##### Question 1: Read the data for January. How many columns are there?

In [4]:
# Report how many columns the January frame has.
return_columns_count(data=jan_data, month_name='Jan')


No of columns in Jan data are  19


##### Question 2: What's the standard deviation of the trip durations in January?

In [5]:
# Adds the 'duration' column to jan_data as a side effect, then prints its standard deviation.
execute_question_2(
    data=jan_data,
    from_datetime='tpep_pickup_datetime',
    to_datetime='tpep_dropoff_datetime',
    col_name='duration',
)

Standard deviation for column duration is 42.59435124195458


##### Question 3: There are some outliers. Let's remove them and keep only the records where the duration was between 1 and 60 minutes (inclusive). What fraction of the records is left after you drop the outliers?

In [6]:
# Keep only trips whose duration is between 1 and 60 minutes (both bounds included).
jan_data = remove_outliers(
    data=jan_data,
    col_name='duration',
    lower_limit=1,
    upper_limit=60,
)

3066766
3009173
Percentage of records after removing outliers:  98.1220282212598


##### Question 4: Let's apply one-hot encoding to the pickup and dropoff location IDs. What's the dimensionality of this matrix (number of columns)?

In [7]:
# Fit the vectorizer on the January pickup/dropoff IDs and keep it, so that
# other data can later be encoded with the same feature layout.
feature_matrix, feature_names, dict_vectorizer = create_feature_matrix(
    df=jan_data,
    col_1='PULocationID',
    col_2='DOLocationID',
)

Feature Matrix:
  (0, 43)	1.0
  (0, 325)	1.0
  (1, 148)	1.0
  (1, 456)	1.0
  (2, 149)	1.0
  (2, 461)	1.0
  (3, 227)	1.0
  (3, 299)	1.0
  (4, 237)	1.0
  (4, 266)	1.0
  (5, 38)	1.0
  (5, 325)	1.0
  (6, 45)	1.0
  (6, 409)	1.0
  (7, 108)	1.0
  (7, 304)	1.0
  (8, 147)	1.0
  (8, 328)	1.0
  (9, 6)	1.0
  (9, 303)	1.0
  (10, 225)	1.0
  (10, 404)	1.0
  (11, 178)	1.0
  (11, 494)	1.0
  (12, 45)	1.0
  :	:
  (3009160, 328)	1.0
  (3009161, 155)	1.0
  (3009161, 494)	1.0
  (3009162, 203)	1.0
  (3009162, 306)	1.0
  (3009163, 50)	1.0
  (3009163, 325)	1.0
  (3009164, 242)	1.0
  (3009164, 401)	1.0
  (3009165, 54)	1.0
  (3009165, 326)	1.0
  (3009166, 64)	1.0
  (3009166, 482)	1.0
  (3009167, 46)	1.0
  (3009167, 401)	1.0
  (3009168, 203)	1.0
  (3009168, 266)	1.0
  (3009169, 233)	1.0
  (3009169, 271)	1.0
  (3009170, 150)	1.0
  (3009170, 273)	1.0
  (3009171, 237)	1.0
  (3009171, 400)	1.0
  (3009172, 45)	1.0
  (3009172, 435)	1.0

Dimensionality of the Feature Matrix: 515


##### Question 5: Train a plain linear regression model on the one-hot features. What is the RMSE on the training data?

In [8]:
# The value being predicted is the trip duration column.
target = jan_data['duration']

# fit() returns the estimator itself, so construction and training are chained.
model = LinearRegression().fit(feature_matrix, target)

# Score the model on the very rows it was trained on.
predictions = model.predict(feature_matrix)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_true=target, y_pred=predictions))
print(f"\nRMSE on the training data: {rmse}")


RMSE on the training data: 7.649261931416412


In [12]:
# Prepare February exactly like January: derive durations, then drop outliers.
feb_data = compute_duration(
    data=feb_data,
    from_datetime='tpep_pickup_datetime',
    to_datetime='tpep_dropoff_datetime',
)
feb_data = remove_outliers(data=feb_data, col_name='duration', lower_limit=1, upper_limit=60)
feb_data_copy = feb_data.copy()
test_target = feb_data['duration']

# Encode with the vectorizer fitted on January: transform only, no refit.
location_columns = ['PULocationID', 'DOLocationID']
test_data_dict = feb_data_copy[location_columns].astype(str).to_dict(orient='records')
test_feature_matrix = dict_vectorizer.transform(test_data_dict)

# Score the January-trained model on the February rows.
test_predictions = model.predict(test_feature_matrix)
test_rmse = np.sqrt(mean_squared_error(y_true=test_target, y_pred=test_predictions))
print(f"\nRMSE on the Test data: {test_rmse}")


2913955
2855951
Percentage of records after removing outliers:  98.00944077722545

RMSE on the training data: 7.8118162035401735
