# Forecasting Honda Accord Sales

## 1. Data Preprocessing and Feature Engineering

In [30]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

# Load the dataset
Sales = pd.read_csv('Accord-242A-Fall24.csv')

# Log transformation for AccordSales to stabilize variance
Sales['AccordSales_log'] = np.log(Sales['AccordSales'])

# Train-test split based on the year
Sales_train = Sales[(Sales['Year'] >= 2014) & (Sales['Year'] <= 2018)]
Sales_test = Sales[(Sales['Year'] >= 2019) & (Sales['Year'] <= 2023)]

# Selected features
features = ['Unemployment', 'AccordQueries', 'CPIEnergy', 'CPIAll', 'MilesTraveled']

X_train = Sales_train[features]
y_train = Sales_train['AccordSales_log']  # Using the log-transformed target
X_test = Sales_test[features]
y_test = Sales_test['AccordSales_log']

# Add an intercept for OLS model
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)


## 2. Ridge and Lasso Regularization

In [31]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Ridge Regression
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_scaled, y_train)
y_pred_ridge = ridge_model.predict(X_test_scaled)

# Lasso Regression
lasso_model = Lasso(alpha=0.1)
lasso_model.fit(X_train_scaled, y_train)
y_pred_lasso = lasso_model.predict(X_test_scaled)

# Performance Metrics
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
rmse_lasso = np.sqrt(mean_squared_error(y_test, y_pred_lasso))
print(f"Ridge RMSE: {rmse_ridge}, Lasso RMSE: {rmse_lasso}")


Ridge RMSE: 0.4125625706282273, Lasso RMSE: 0.5920111906072673


## 3. Cross-Validation

In [32]:
from sklearn.model_selection import cross_val_score

# Cross-validation for Ridge model
ridge_cv_scores = cross_val_score(ridge_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
ridge_cv_rmse = np.sqrt(-ridge_cv_scores)
print(f"Ridge Cross-Validated RMSE: {ridge_cv_rmse.mean()}")

# Cross-validation for Lasso model
lasso_cv_scores = cross_val_score(lasso_model, X_train_scaled, y_train, cv=5, scoring='neg_mean_squared_error')
lasso_cv_rmse = np.sqrt(-lasso_cv_scores)
print(f"Lasso Cross-Validated RMSE: {lasso_cv_rmse.mean()}")


Ridge Cross-Validated RMSE: 0.19914316356171163
Lasso Cross-Validated RMSE: 0.20048102676819343


## 4. Advanced Models: Random Forest and Gradient Boosting

In [33]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)

# Gradient Boosting
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_pred_gb = gb_model.predict(X_test_scaled)

# Performance Metrics
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rmse_gb = np.sqrt(mean_squared_error(y_test, y_pred_gb))
print(f"Random Forest RMSE: {rmse_rf}, Gradient Boosting RMSE: {rmse_gb}")


Random Forest RMSE: 0.5858213269229974, Gradient Boosting RMSE: 0.5576983968818576


## 5. Hyperparameter Tuning with Grid Sear

In [34]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, 30]
}

# Grid Search for Random Forest
grid_search_rf = GridSearchCV(RandomForestRegressor(), param_grid_rf, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train_scaled, y_train)
best_rf_model = grid_search_rf.best_estimator_

print(f"Best Random Forest Parameters: {grid_search_rf.best_params_}")


Best Random Forest Parameters: {'max_depth': 10, 'n_estimators': 200}


## 6. Time Series Forecasting with SARIMA

In [35]:
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Fit a SARIMA model for time series forecasting
sarima_model = SARIMAX(y_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
sarima_results = sarima_model.fit()

# Predict future sales using the SARIMA model
y_pred_sarima = sarima_results.predict(start=len(y_train), end=len(y_train) + len(y_test) - 1)

# Performance Metrics
rmse_sarima = np.sqrt(mean_squared_error(y_test, y_pred_sarima))
print(f"SARIMA RMSE: {rmse_sarima}")


RUNNING THE L-BFGS-B CODE

           * * *

Machine precision = 2.220D-16
 N =            5     M =           10

At X0         0 variables are exactly at the bounds

At iterate    0    f= -5.31870D-01    |proj g|=  9.86681D-01


  warn('Non-invertible starting seasonal moving average'
 This problem is unconstrained.



At iterate    5    f= -5.43884D-01    |proj g|=  7.62342D-01

At iterate   10    f= -5.50492D-01    |proj g|=  1.79596D-01

At iterate   15    f= -5.53460D-01    |proj g|=  9.64905D-02

At iterate   20    f= -5.59983D-01    |proj g|=  7.51962D-01

At iterate   25    f= -5.67243D-01    |proj g|=  7.80823D-02

At iterate   30    f= -5.75016D-01    |proj g|=  1.95834D-02

At iterate   35    f= -5.77294D-01    |proj g|=  2.73446D-02

At iterate   40    f= -5.77848D-01    |proj g|=  4.79211D-04

           * * *

Tit   = total number of iterations
Tnf   = total number of function evaluations
Tnint = total number of segments explored during Cauchy searches
Skip  = number of BFGS updates skipped
Nact  = number of active bounds at final generalized Cauchy point
Projg = norm of the final projected gradient
F     = final function value

           * * *

   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F
    5     43     57      1     0     0   9.228D-04  -5.778D-01
  F = -0.577848699139

## 7. Model Deployment with Flask

In [36]:
!pip install flask

if __name__ == '__main__':
    app.run(debug=True, port=5001)  # 使用不同的端口，如 5001

from flask import Flask, request, jsonify

# Flask app for serving the model predictions
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json()
    features = np.array(data['features']).reshape(1, -1)
    scaled_features = scaler.transform(features)
    
    # Predict using the best model (e.g., Random Forest)
    prediction = best_rf_model.predict(scaled_features)
    return jsonify({'prediction': prediction.tolist()})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5001
Press CTRL+C to quit
 * Restarting with stat
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/srv/conda/lib/python3.11/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/srv/conda/lib/python3.11/site-packages/traitlets/config/application.py", line 1042, in launch_instance
    app.initialize(argv)
  File "/srv/conda/lib/python3.11/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/srv/conda/lib/python3.11/site-packages/ipykernel/kernelapp.py", line 665, in initialize
    self.init_so

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## 8. Evaluation Metrics: RMSE and MAE

In [None]:
# RMSE and MAE for the best-performing model (e.g., Random Forest)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
mae = mean_absolute_error(y_test, y_pred_rf)
print(f"Final Model - RMSE: {rmse}, MAE: {mae}")