# In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Lasso, LinearRegression
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor 

# Load the rental records. The code below relies on the columns
# 'rental_date', 'return_date' and 'special_features' being present.
ren = pd.read_csv('rental_info.csv')
# Per-column count of missing values. The result is not stored or printed,
# so it is only visible when evaluated interactively.
ren.isna().sum()

# Parse both timestamps so they can be subtracted from each other.
ren['rental_date'] = pd.to_datetime(ren['rental_date'])
ren['return_date'] = pd.to_datetime(ren['return_date'])

# Rental duration as a timedelta, plus its whole-day component, which is the
# regression target used further down.
ren['rental_length'] = ren['return_date'] - ren['rental_date']
ren['rental_length_days'] = ren['rental_length'].dt.days

# 0/1 flags for two special features.
# na=False: a missing 'special_features' value is flagged 0. Without it the
# match result can be NaN, which np.where treats as true and flags as 1.
# regex=False: the titles are literal substrings, not patterns.
has_deleted_scenes = ren['special_features'].str.contains(
    'Deleted Scenes', regex=False, na=False
)
has_behind_the_scenes = ren['special_features'].str.contains(
    'Behind the Scenes', regex=False, na=False
)
ren['deleted_scenes'] = np.where(has_deleted_scenes, 1, 0)
ren['behind_the_scenes'] = np.where(has_behind_the_scenes, 1, 0)


# Feature matrix: everything except the target itself, the raw timestamps,
# the timedelta the target was derived from, and the free-text feature list.
X = ren.drop(
    columns=[
        'rental_length_days',
        'rental_date',
        'return_date',
        'special_features',
        'rental_length',
    ]
)
# Target: rental duration in whole days.
y = ren['rental_length_days']

# Hold out 20% of the rows for the final comparison; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=9,
)


# Fit a Lasso model (alpha=0.1) on the training split and keep its
# coefficients, one per feature column.
ls = Lasso(alpha=0.1, random_state=9).fit(X_train, y_train)
ls_coeff = ls.coef_

# Bar chart of the coefficients, with the feature names rotated for legibility.
plt.bar(X.columns, ls_coeff)
plt.xticks(rotation=45)
plt.show()


# Candidate models. The tree and the forest are seeded with the same value
# used by the split, the Lasso fit, the folds and the searches in this script,
# so the reported MSEs are repeatable between runs.
lr = LinearRegression()
dtr = DecisionTreeRegressor(random_state=9)
rfr = RandomForestRegressor(random_state=9)


# Seeded, shuffled 5-fold splitter used for cross-validation.
kf = KFold(n_splits=5, shuffle=True, random_state=9)

# Candidate hyper-parameter values sampled by the decision-tree search.
dtr_param_dist = dict(
    max_depth=[None, 5, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomised search: 10 sampled settings, scored by negated MSE on the kf folds.
dtr_cv = RandomizedSearchCV(
    dtr,
    dtr_param_dist,
    n_iter=10,
    cv=kf,
    scoring="neg_mean_squared_error",
    random_state=9,
)

# Run the search on the training split and keep the winning tree.
best_dtr = dtr_cv.fit(X_train, y_train).best_estimator_



# Candidate hyper-parameter values sampled by the random-forest search.
rfr_param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Randomised search: 10 sampled settings, scored by negated MSE on the kf
# folds, with n_jobs=-1 for parallel execution.
rfr_cv = RandomizedSearchCV(
    rfr,
    rfr_param_dist,
    n_iter=10,
    cv=kf,
    scoring="neg_mean_squared_error",
    random_state=9,
    n_jobs=-1,
)

# Run the search on the training split and keep the winning forest.
best_rfr = rfr_cv.fit(X_train, y_train).best_estimator_


# Linear baseline: fit on the training split, then score on the hold-out set.
lr_pred = lr.fit(X_train, y_train).predict(X_test)
lr_mse = mean_squared_error(y_true=y_test, y_pred=lr_pred)

# Tuned decision tree (best_dtr comes from the randomised search).
dtr_pred = best_dtr.predict(X_test)
dtr_mse = mean_squared_error(y_true=y_test, y_pred=dtr_pred)

# Tuned random forest (best_rfr comes from the randomised search).
rfr_pred = best_rfr.predict(X_test)
rfr_mse = mean_squared_error(y_true=y_test, y_pred=rfr_pred)

# Report the hold-out MSE of each model, one line per model.
for _label, _score in (
    ("Linear Regression MSE:", lr_mse),
    ("Decision Tree MSE:", dtr_mse),
    ("Random Forest MSE:", rfr_mse),
):
    print(_label, _score)


# Record the selected model together with its hold-out MSE.
# rfr_mse was computed from best_rfr's predictions, so best_rfr (the tuned
# estimator returned by the search) is the model to keep. `rfr` is only the
# template handed to the search and is never fitted directly in this script.
best_model = best_rfr
best_mse = rfr_mse