### Importing libraries

In [1]:
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer

In [2]:
from sklearn.metrics import root_mean_squared_error

### Reading and Preprocessing the dataset

In [3]:
def read_dataframe(filename, min_duration=1, max_duration=60):
    """Load a trip parquet file, derive trip duration and filter outliers.

    The file must contain the columns ``tpep_pickup_datetime``,
    ``tpep_dropoff_datetime``, ``PULocationID`` and ``DOLocationID``.

    Side effects: prints the column count of the raw file, the standard
    deviation of the unfiltered duration, and the percentage of records
    that survive the duration filter.

    Parameters
    ----------
    filename : path or URL accepted by ``pd.read_parquet``.
    min_duration, max_duration : inclusive bounds, in minutes, on the
        trip duration of the records that are kept (defaults: 1 and 60).

    Returns
    -------
    pandas.DataFrame
        The filtered records with an added ``duration`` column (minutes)
        and the two location-id columns converted to strings.
    """
    df = pd.read_parquet(filename)

    # Reported before 'duration' is added, so this is the raw column count.
    print("Total Columns: "+str(len(df.columns)))

    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds()/60

    print("Standard Deviation of duration: " + str(df['duration'].std()))

    # Build the filter once and reuse it for both the report and the selection.
    in_range = (df['duration']>=min_duration) & (df['duration']<=max_duration)

    # The value is the share of rows that remain, so it is labelled "kept".
    # Guard the division so an empty file does not raise ZeroDivisionError.
    total = len(df)
    kept_pct = (int(in_range.sum())/total)*100 if total else 0.0
    print("Percentage of records kept after duration is filtered: " + str(kept_pct))

    # Copy the selection so the column assignment below writes to an
    # independent frame instead of a slice of the original one.
    df = df[in_range].copy()

    df[['PULocationID','DOLocationID']] = df[['PULocationID','DOLocationID']].astype(str)

    return df
    

In [4]:
# Q-1, Q-2, and Q-3
# Raw string: the backslashes in this Windows path (\S, \C, \m, \d, \y) are
# invalid escape sequences in a normal string literal and trigger a warning.
df_train = read_dataframe(r"D:\Study\Conda Projects\mlops-zoom\data\yellow_tripdata_2023-01.parquet")

Total Columns: 19
Standard Deviation of duration: 42.59435124195458
Percentage of records dropped after duration is filtered: 98.1220282212598


In [5]:
# Raw string: the backslashes in this Windows path (\S, \C, \m, \d, \y) are
# invalid escape sequences in a normal string literal and trigger a warning.
df_val = read_dataframe(r"D:\Study\Conda Projects\mlops-zoom\data\yellow_tripdata_2023-02.parquet")

Total Columns: 19
Standard Deviation of duration: 42.84210176105113
Percentage of records dropped after duration is filtered: 98.00944077722545


In [6]:
# Columns that are one-hot encoded as model inputs.
categorical = [
    'PULocationID',
    'DOLocationID',
]

# Column the regression predicts (kept as a list, so selecting it yields a DataFrame).
target = [
    'duration',
]

### One Hot Encoding

In [7]:
dv = DictVectorizer()

# Training split: the vectorizer is fitted here.
train_dicts = df_train[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
y_train = df_train[target]

# Validation split: reuse the already fitted vectorizer (transform only).
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val[target]

In [8]:
# Q-4 Ans: how many feature names the fitted DictVectorizer holds
n_features = len(dv.feature_names_)
n_features

515

### Training the model

In [9]:
# Q-5: Training a linear regression model

# fit() returns the estimator itself, so construction and fitting are chained.
lr = LinearRegression().fit(X_train, y_train)

# RMSE on the same data the model was fitted on.
y_pred = lr.predict(X_train)
root_mean_squared_error(y_true=y_train, y_pred=y_pred)

7.649261931816197

In [10]:
# Q-6: Predicting using a validation set

# RMSE of the fitted model on the validation features and targets.
y_val_pred = lr.predict(X_val)
root_mean_squared_error(y_true=y_val, y_pred=y_val_pred)

7.8118186871593

### Saving the model

In [11]:
# Persist the fitted vectorizer and model together as one (dv, lr) tuple.
# Raw string: the backslashes in this Windows path (\S, \C, \m, \c, \h) are
# invalid escape sequences in a normal string literal and trigger a warning.
with open(r'D:\Study\Conda Projects\mlops-zoom\checkpoints\models\hw-1_model.bin','wb') as f_out:
    pickle.dump((dv,lr),f_out)