# 1. Define the Real-World Problem &
# 2. Build a Scikit-Learn Pipeline

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer

# Load the dataset 
df = pd.read_csv('hour.csv')

# Drop unnecessary columns
df = df.drop(columns=['instant', 'dteday', 'casual', 'registered'])

# Target: 'cnt' (bike rental count)
X = df.drop(columns=['cnt'])
y = df['cnt']

# Numerical and categorical columns
numerical_features = ['temp', 'atemp', 'hum', 'windspeed']
categorical_features = ['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday', 'weathersit']

# Feature engineering function (create new feature: temp * hum interaction)
def feature_engineering(X):
    X = X.copy()
    X['temp_hum_interaction'] = X['temp'] * X['hum']
    return X

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), 
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combined preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: Preprocessing + Feature Engineering + Model
pipeline = Pipeline(steps=[
    ('feature_eng', FunctionTransformer(feature_engineering)),  # Add new feature before preprocessing
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(n_estimators=100, random_state=42))
])

print("Pipeline built successfully.")

Pipeline built successfully.


# 3. Train, Test, and Evaluate the Pipeline

In [12]:
from sklearn.metrics import mean_squared_error, r2_score

# Split data into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = pipeline.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R² Score: {r2:.2f}")

Mean Squared Error (MSE): 2275.59
R² Score: 0.93
