![dvd_image](dvd_image.jpg)

A DVD rental company needs your help! They want to predict how many days a customer will rent a DVD for, based on some features, and have approached you for help. They want you to try out some regression models to predict the number of days a customer will rent a DVD for. The company wants a model which yields an MSE of 3 or less on a test set. The model you make will help the company become more efficient in inventory planning.

The data they provided is in the csv file `rental_info.csv`. It has the following features:
- `"rental_date"`: The date (and time) the customer rents the DVD.
- `"return_date"`: The date (and time) the customer returns the DVD.
- `"amount"`: The amount paid by the customer for renting the DVD.
- `"amount_2"`: The square of `"amount"`.
- `"rental_rate"`: The rate at which the DVD is rented for.
- `"rental_rate_2"`: The square of `"rental_rate"`.
- `"release_year"`: The year the movie being rented was released.
- `"length"`: Length of the movie being rented, in minutes.
- `"length_2"`: The square of `"length"`.
- `"replacement_cost"`: The amount it will cost the company to replace the DVD.
- `"special_features"`: Any special features, for example trailers/deleted scenes that the DVD also has.
- `"NC-17"`, `"PG"`, `"PG-13"`, `"R"`: These columns are dummy variables for the rating of the movie. Each takes the value 1 if the movie is rated as the column name and 0 otherwise. For your convenience, the reference dummy has already been dropped.

In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from itertools import product

# Scikit-learn imports
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Regression models
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

# Load dataset: read the rental records from the CSV provided by the company
csv_path = 'rental_info.csv'
data = pd.read_csv(csv_path)
dataset_shape = data.shape
print("Dataset loaded successfully!")
print(f"Dataset shape: {dataset_shape}")
# Preview the first rows of the freshly loaded frame
data.head()

# 1. Data Exploration

In [None]:
# Dataset overview
# info() writes the column names, dtypes and non-null counts straight to
# stdout, so it needs no print() wrapper.
print("Dataset Information:")
data.info()
# describe() returns a summary-statistics table; it is left as the final
# expression so that a notebook front end renders it as the cell output.
print("\nDataset Description:")
data.describe()

In [None]:
# Explore special features column: how often each distinct value occurs,
# followed by the number of distinct values.
print("Special Features Distribution:")
special_features_col = data["special_features"]
print(special_features_col.value_counts())
n_distinct = special_features_col.nunique()
print(f"\nUnique special features: {n_distinct}")

# 2. Data Preprocessing

In [None]:
# Create target variable: rental length in days.
# Both timestamps are parsed once; `.dt.days` keeps only the whole-day
# component of each rental duration.
rental_start = pd.to_datetime(data["rental_date"])
rental_end = pd.to_datetime(data["return_date"])
data["rental_length_days"] = (rental_end - rental_start).dt.days

# Create dummy variables for special_features column.
# Each new column is 1 when its label occurs anywhere in the row's
# special_features text and 0 otherwise.
special_feature_labels = {
    "deleted_scenes": "Deleted Scenes",
    "behind_the_scenes": "Behind the Scenes",
    "commentaries": "Commentaries",
    "trailers": "Trailers",
}
# astype(str) mirrors a plain str() conversion of every value; na=False makes
# any value that is still missing count as "label absent" (0).
special_features_text = data["special_features"].astype(str)
dummies = pd.DataFrame({
    column: special_features_text.str.contains(label, regex=False, na=False).astype("int64")
    for column, label in special_feature_labels.items()
})
# assign() overwrites a column that already exists instead of appending a
# second copy, so re-running this cell does not create duplicate columns
# (which pd.concat would).
data = data.assign(**{column: dummies[column] for column in dummies.columns})

print("Preprocessing completed!")
print(f"New dataset shape: {data.shape}")
print("\nTarget variable (rental_length_days) statistics:")
print(data["rental_length_days"].describe())
data.head()

# 3. Model Training and Evaluation

## 3.1 Data Preparation and Helper Functions

In [None]:
# Helper functions for model evaluation
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    """Fit `model` on the training split and score it on the test split.

    The model is fitted in place; the returned value is the test-set RMSE
    computed by `evaluate_model`.
    """
    model.fit(X_train, y_train)
    test_rmse = evaluate_model(model, X_test, y_test)
    return test_rmse

def evaluate_model(model, X_test, y_test):
    """Return the root mean squared error of `model`'s predictions on the test set.

    `model` must already be fitted; it is only asked to predict.
    """
    predictions = model.predict(X_test)
    # RMSE is the square root of the mean squared error.
    return np.sqrt(mean_squared_error(y_test, predictions))

# Prepare features and target.
# The target itself, the two raw timestamps it was derived from, and the raw
# special_features text are excluded from the feature matrix.
non_feature_columns = ["rental_length_days", "rental_date", "return_date", "special_features"]
y = data["rental_length_days"]
X = data.drop(columns=non_feature_columns)

# Train-test split: hold out 20% of the rows, with a fixed seed so the split
# is reproducible.
SEED = 9
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
)

# Spread of the training target, printed as a yardstick for the RMSE values.
y_train_std = np.std(y_train)
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"Target variable standard deviation: {y_train_std:.2f}")

## 3.2 Baseline Models

In [None]:
# Train and evaluate baseline models
def _with_scaling(estimator):
    """Return a pipeline that standardizes the features before `estimator`."""
    return Pipeline([("scaler", StandardScaler()), ("model", estimator)])


# Regularized linear models (Lasso, Ridge) and distance/kernel-based models
# (SVR, KNN) are sensitive to feature scale, and the features here range from
# 0/1 dummies to squared terms. Those estimators are therefore wrapped in a
# scaling pipeline. The scaler is fitted inside the pipeline on the training
# split only, so no test-set statistics leak in. Plain linear regression and
# the tree ensemble do not depend on feature scale and are left as they are.
models = {
    'Linear Regression': LinearRegression(),
    'Lasso Regression': _with_scaling(Lasso(random_state=SEED)),
    'Ridge Regression': _with_scaling(Ridge(random_state=SEED)),
    'Random Forest': RandomForestRegressor(random_state=SEED),
    'SVR': _with_scaling(SVR()),
    'K-Nearest Neighbors': _with_scaling(KNeighborsRegressor()),
}

# Maps model name -> {'RMSE': test-set RMSE}.
baseline_results = {}
print("Baseline Model Results (RMSE):")
print("-" * 40)
for name, model in models.items():
    rmse = train_and_evaluate(model, X_train, y_train, X_test, y_test)
    baseline_results[name] = {'RMSE': rmse}
    print(f"{name}: {rmse:.4f}")

# Find best baseline model (lowest test RMSE); best_baseline is a
# (name, {'RMSE': value}) pair.
best_baseline = min(baseline_results.items(), key=lambda item: item[1]['RMSE'])
print(f"\nBest baseline model: {best_baseline[0]} (RMSE: {best_baseline[1]['RMSE']:.4f})")

## 3.3 Hyperparameter Tuning

### Gradient Boosting Optimization

In [None]:
# Gradient Boosting Hyperparameter Tuning
print("Tuning Gradient Boosting hyperparameters...")

# Candidate values per hyperparameter; the grid search evaluates every
# combination of them.
gb_param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 0.9, 1.0],
    max_features=[0.8, 0.9, 1.0],
)

# 5-fold cross-validated search on the training split, using all CPU cores.
gb_base_estimator = GradientBoostingRegressor(random_state=SEED)
gb_grid = GridSearchCV(
    estimator=gb_base_estimator,
    param_grid=gb_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
gb_grid.fit(X_train, y_train)

# The scorer reports a negated MSE, so flip the sign before the square root.
gb_best_rmse = np.sqrt(-gb_grid.best_score_)
print(f"Best Gradient Boosting parameters: {gb_grid.best_params_}")
print(f"Best cross-validated RMSE: {gb_best_rmse:.4f}")
print(f"Best cross-validated MSE: {gb_best_rmse**2:.4f}")

### Random Forest Optimization

In [None]:
# Random Forest Hyperparameter Tuning
print("Tuning Random Forest hyperparameters...")

# Candidate values per hyperparameter; the grid search evaluates every
# combination of them.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=["sqrt", "log2"],
)

# 5-fold cross-validated search on the training split, using all CPU cores.
rf_base_estimator = RandomForestRegressor(random_state=SEED)
rf_grid = GridSearchCV(
    estimator=rf_base_estimator,
    param_grid=rf_param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

# The scorer reports a negated MSE, so flip the sign before the square root.
rf_best_rmse = np.sqrt(-rf_grid.best_score_)
print(f"Best Random Forest parameters: {rf_grid.best_params_}")
print(f"Best cross-validated RMSE: {rf_best_rmse:.4f}")
print(f"Best cross-validated MSE: {rf_best_rmse**2:.4f}")

## 3.4 Ensemble Methods

In [None]:
# Voting Regressor with optimized models: combine the best estimator found by
# each grid search into a single ensemble.
tuned_members = [
    ("rf", rf_grid.best_estimator_),
    ("gb", gb_grid.best_estimator_),
]
voting_model_final = VotingRegressor(estimators=tuned_members)

# Fit the ensemble on the training split and score it on the test split.
voting_rmse_final = train_and_evaluate(
    voting_model_final, X_train, y_train, X_test, y_test
)
voting_mse_final = voting_rmse_final**2
print("Final Voting Regressor Results:")
print(f"RMSE: {voting_rmse_final:.4f}")
print(f"MSE: {voting_mse_final:.4f}")

# 4. Results Summary and Conclusion

In [None]:
# Final Model Comparison
# The brief asks for a test-set MSE of "3 or less", so a model that lands
# exactly on the threshold passes. The threshold lives in one constant so the
# printed target and both checks below cannot drift apart.
TARGET_MSE = 3.0

print("=" * 60)
print("FINAL MODEL PERFORMANCE SUMMARY")
print("=" * 60)
print(f"Target: MSE <= {TARGET_MSE}")
print("-" * 60)

# Test the best models on test set; final_results maps model name -> test RMSE.
final_results = {}

# Best Random Forest test performance
rf_test_rmse = evaluate_model(rf_grid.best_estimator_, X_test, y_test)
final_results['Random Forest (Tuned)'] = rf_test_rmse

# Best Gradient Boosting test performance
gb_test_rmse = evaluate_model(gb_grid.best_estimator_, X_test, y_test)
final_results['Gradient Boosting (Tuned)'] = gb_test_rmse

# Final voting regressor test performance (already scored when it was trained)
final_results['Voting Regressor (Final)'] = voting_rmse_final

# Display results
for model_name, rmse in final_results.items():
    mse = rmse ** 2
    status = "✅ MEETS TARGET" if mse <= TARGET_MSE else "❌ ABOVE TARGET"
    print(f"{model_name:<30}: RMSE={rmse:.4f}, MSE={mse:.4f} {status}")

# Best model: the entry with the lowest test RMSE, as a (name, rmse) pair.
# NOTE: the winner is picked using the same test set it is scored on, so its
# reported error is a slightly optimistic estimate.
best_model = min(final_results.items(), key=lambda item: item[1])
best_model_mse = best_model[1] ** 2
print(f"\n🏆 BEST MODEL: {best_model[0]}")
print(f"   Final Test RMSE: {best_model[1]:.4f}")
print(f"   Final Test MSE:  {best_model_mse:.4f}")

if best_model_mse <= TARGET_MSE:
    print(f"✅ SUCCESS: Model meets the company's MSE <= {TARGET_MSE} requirement!")
else:
    print("❌ Target not met - further optimization needed.")