In [1]:
# Import required libraries

import numpy as np
import pandas as pd

import statsmodels.api as sm

from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import seasonal_decompose

from statsmodels.tsa.stattools import pacf, grangercausalitytests, adfuller

from statsmodels.tsa import tsatools
# from dmba import regressionSummary

from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
pd.options.display.max_columns = 20 
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80
np.set_printoptions(precision=4, suppress=True) # Displays only upto 4 decimals
%matplotlib inline
warnings.filterwarnings("ignore", category=FutureWarning) # Suppress specific FutureWarnings

In [2]:
# Load the Excel files
df_production = pd.read_excel("./monthly sugar production.xlsx")
df_prices = pd.read_excel("./Historical sugar prices_indexmundi.xlsx")
df_cpi = pd.read_excel('./RBI-CPI-MonthlyStats.xlsx', skiprows=1)

In [3]:
df_production

Unnamed: 0,Month,Year,month Production of sugar \nlakh tonnes,Sugar export by India\nmetric tonnes,Sugar Imports In India\nmeric tonnes
0,October,2023,4.36,462256.0,76719.0
1,November,2023,43.82,669226.0,133.0
2,December,2023,73.02,1586842.0,77946.0
3,January,2023,73.79,1394166.0,73412.0
4,February,2023,63.49,1166624.0,48.0
...,...,...,...,...,...
163,May,2010,4.23,,
164,June,2010,0.69,,
165,July,2010,0.55,,
166,August,2010,1.13,,


In [4]:
df_prices

Unnamed: 0,Date,Price,Change
0,2018-01-01,39.200000,-
1,2018-02-01,39.100000,-0.002551
2,2018-03-01,38.944444,-0.003978
3,2018-04-01,37.703704,-0.031859
4,2018-05-01,35.358423,-0.062203
...,...,...,...
78,2024-07-01,42.695473,-0.009028
79,2024-08-01,43.266667,0.013378
80,2024-09-01,42.675926,-0.013653
81,2024-10-01,42.884444,0.004886


In [5]:
df_cpi

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Index,Inflation (%),Index.1,Inflation (%).1,Index.2,Inflation (%).2
0,OCT-2024,A) General Index,Provisional,199.5,6.68,193.7,5.62,196.8,6.21
1,OCT-2024,A.1) Food and beverages,Provisional,206.7,9.66,214.0,9.80,209.4,9.69
2,OCT-2024,A.1.1) Cereals and products,Provisional,196.3,7.27,194.0,6.19,195.6,6.94
3,OCT-2024,A.1.2) Meat and fish,Provisional,221.6,3.02,230.5,3.41,224.7,3.17
4,OCT-2024,A.1.3) Egg,Provisional,194,4.75,198.8,5.02,195.9,4.87
...,...,...,...,...,...,...,...,...,...
3970,JAN-2013,A.6.2) Health,Final,104,,104.1,,104.0,
3971,JAN-2013,A.6.3) Transport and communication,Final,103.3,,103.2,,103.2,
3972,JAN-2013,A.6.4) Recreation and amusement,Final,103.4,,102.9,,103.1,
3973,JAN-2013,A.6.5) Education,Final,103.8,,103.5,,103.6,


In [6]:
# Rename columns for clarity and consistency
df_production.columns = [
    'Month', 
    'Year', 
    'Monthly Production (lakh tonnes)', 
    'Sugar Export (metric tonnes)', 
    'Sugar Import (metric tonnes)'
]
df_prices.columns = ['Date', 'Sugar Price', 'Price Change']

df_cpi.columns = ['Month', 'Commodity', 'Provisional/Final', 'Rural Index', 'Rural Inflation %', 'Urban Index', 'Urban Inflation %', 'Combined Index', 'Combined Inflation %']

In [7]:
# Combine 'Month' and 'Year' into a single 'Date' column in production_data
df_production['Date'] = pd.to_datetime(df_production['Month'] + ' ' + df_production['Year'].astype(str))

  df_production['Date'] = pd.to_datetime(df_production['Month'] + ' ' + df_production['Year'].astype(str))


In [8]:
# Drop the original 'Month' and 'Year' columns as they are now combined
df_production.drop(columns=['Month', 'Year'], inplace=True)

In [9]:
df_production.head(5)

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date
0,4.36,462256.0,76719.0,2023-10-01
1,43.82,669226.0,133.0,2023-11-01
2,73.02,1586842.0,77946.0,2023-12-01
3,73.79,1394166.0,73412.0,2023-01-01
4,63.49,1166624.0,48.0,2023-02-01


In [10]:
# Ensure the 'Date' column in prices_data is a datetime object
df_prices['Date'] = pd.to_datetime(df_prices['Date'])

In [11]:
df_prices.head(5)

Unnamed: 0,Date,Sugar Price,Price Change
0,2018-01-01,39.2,-
1,2018-02-01,39.1,-0.002551
2,2018-03-01,38.944444,-0.003978
3,2018-04-01,37.703704,-0.031859
4,2018-05-01,35.358423,-0.062203


In [12]:
# Merge the two datasets on the 'Date' column
data = pd.merge(df_production, df_prices, on='Date', how='outer')

# Sort the merged data by date
data.sort_values('Date', inplace=True)

# Reset the index for a clean dataset
data.reset_index(drop=True, inplace=True)

# Ensure 'Date' column is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

In [13]:
data.head(5)

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date,Sugar Price,Price Change
0,40.94,,,2010-01-01,,
1,37.51,,,2010-02-01,,
2,29.49,,,2010-03-01,,
3,14.19,,,2010-04-01,,
4,4.23,,,2010-05-01,,


In [14]:
# Drop the extra columns, as present in the raw dataset
df_cpi.drop(columns=['Rural Index', 'Rural Inflation %', 'Urban Index', 'Urban Inflation %', 'Combined Index'], inplace=True)
df_cpi

Unnamed: 0,Month,Commodity,Provisional/Final,Combined Inflation %
0,OCT-2024,A) General Index,Provisional,6.21
1,OCT-2024,A.1) Food and beverages,Provisional,9.69
2,OCT-2024,A.1.1) Cereals and products,Provisional,6.94
3,OCT-2024,A.1.2) Meat and fish,Provisional,3.17
4,OCT-2024,A.1.3) Egg,Provisional,4.87
...,...,...,...,...
3970,JAN-2013,A.6.2) Health,Final,
3971,JAN-2013,A.6.3) Transport and communication,Final,
3972,JAN-2013,A.6.4) Recreation and amusement,Final,
3973,JAN-2013,A.6.5) Education,Final,


In [15]:
# Filtering the data to only look at the Final(non-provisional) Sugar Commodity CPI numbers
df_cpi['Commodity'] = df_cpi['Commodity'].apply(str.lower)
df_cpi = df_cpi[df_cpi["Commodity"].str.contains("sugar")]
df_cpi
df_cpi = df_cpi[df_cpi["Provisional/Final"].str.lower() == 'final']
df_cpi

Unnamed: 0,Month,Commodity,Provisional/Final,Combined Inflation %
38,SEP-2024,a.1.9) sugar and confectionery,Final,3.46
66,AUG-2024,a.1.9) sugar and confectionery,Final,4.70
94,JUL-2024,a.1.9) sugar and confectionery,Final,5.22
122,JUN-2024,a.1.9) sugar and confectionery,Final,5.83
150,MAY-2024,a.1.9) sugar and confectionery,Final,5.70
...,...,...,...,...
3846,MAY-2013,a.1.9) sugar and confectionery,Final,
3874,APR-2013,a.1.9) sugar and confectionery,Final,
3902,MAR-2013,a.1.9) sugar and confectionery,Final,
3930,FEB-2013,a.1.9) sugar and confectionery,Final,


In [16]:
# Dropping the Commodity and Provisional/Final columns as they are not needed anymore
df_cpi = df_cpi.drop(columns=['Commodity', 'Provisional/Final']).dropna().reset_index(drop=True)
df_cpi

Unnamed: 0,Month,Combined Inflation %
0,SEP-2024,3.46
1,AUG-2024,4.70
2,JUL-2024,5.22
3,JUN-2024,5.83
4,MAY-2024,5.70
...,...,...
124,MAY-2014,-0.29
125,APR-2014,-1.64
126,MAR-2014,-4.41
127,FEB-2014,-5.61


In [17]:
# Converting the existing month column into the correct (python-friendly) datetime format and renaming the columns
df_cpi['Month'] = pd.to_datetime(df_cpi['Month'], format=r"%b-%Y")
df_cpi.columns = ['Date', 'CPI']
df_cpi

Unnamed: 0,Date,CPI
0,2024-09-01,3.46
1,2024-08-01,4.70
2,2024-07-01,5.22
3,2024-06-01,5.83
4,2024-05-01,5.70
...,...,...
124,2014-05-01,-0.29
125,2014-04-01,-1.64
126,2014-03-01,-4.41
127,2014-02-01,-5.61


In [18]:
# Merge the cpi data with the merged dataset of production and prices on the 'Date' column
data = pd.merge(data, df_cpi, on='Date', how='outer')

# Sort the merged data by date
data.sort_values('Date', inplace=True)

# Reset the index for a clean dataset
data.reset_index(drop=True, inplace=True)

# Ensure 'Date' column is in datetime format
data['Date'] = pd.to_datetime(data['Date'])

data.head(5)

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date,Sugar Price,Price Change,CPI
0,40.94,,,2010-01-01,,,
1,37.51,,,2010-02-01,,,
2,29.49,,,2010-03-01,,,
3,14.19,,,2010-04-01,,,
4,4.23,,,2010-05-01,,,


In [19]:
#  Check for Missing Values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
Monthly Production (lakh tonnes)    11
Sugar Export (metric tonnes)        59
Sugar Import (metric tonnes)        59
Date                                 0
Sugar Price                         96
Price Change                        96
CPI                                 50
dtype: int64


In [20]:
# Filter rows where all five columns have non-missing values
complete_data = data.dropna(subset=[
    # 'Monthly Production (lakh tonnes)', 
    # 'Sugar Export (metric tonnes)', 
    # 'Sugar Import (metric tonnes)', 
    'Sugar Price',
    # 'CPI'
])

# Display the dates where all five columns have data
dates_with_all_data = complete_data['Date']

print("Dates with all five attributes available:")
print(dates_with_all_data)

Dates with all five attributes available:
96    2018-01-01
97    2018-02-01
98    2018-03-01
99    2018-04-01
100   2018-05-01
         ...    
174   2024-07-01
175   2024-08-01
176   2024-09-01
177   2024-10-01
178   2024-11-01
Name: Date, Length: 83, dtype: datetime64[ns]


In [21]:
complete_data = complete_data.round(2).reset_index(drop=True)

In [22]:
complete_data

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date,Sugar Price,Price Change,CPI
0,67.67,97131.0,321130.0,2018-01-01,39.20,-,2.77
1,60.54,114127.0,0.0,2018-02-01,39.10,-0.002551,-0.17
2,51.64,134766.0,59110.0,2018-03-01,38.94,-0.003978,-1.61
3,28.48,159955.0,0.0,2018-04-01,37.70,-0.031859,-4.05
4,9.18,218721.0,135470.0,2018-05-01,35.36,-0.062203,-8.21
...,...,...,...,...,...,...,...
78,,,,2024-07-01,42.70,-0.009028,5.22
79,,,,2024-08-01,43.27,0.013378,4.70
80,,,,2024-09-01,42.68,-0.013653,3.46
81,,,,2024-10-01,42.88,0.004886,


In [23]:
complete_data.to_csv('final_sugar_price_forecasting_data.csv', index=False)

PermissionError: [Errno 13] Permission denied: 'final_sugar_price_forecasting_data.csv'

In [None]:
# Select features, target variable, and dates
X = complete_data[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']]
y = complete_data['Sugar Price']
dates = complete_data['Date']


# Split into training (before 2022) and testing (2022-2023) data
train_size = int(len(complete_data) * 0.9)
train_data = complete_data[:train_size]
test_data = complete_data[train_size:]

In [None]:
train_data

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date,Sugar Price,Price Change,CPI
0,67.67,97131.0,321130.0,2018-01-01,39.20,-,2.77
1,60.54,114127.0,0.0,2018-02-01,39.10,-0.002551,-0.17
2,51.64,134766.0,59110.0,2018-03-01,38.94,-0.003978,-1.61
3,28.48,159955.0,0.0,2018-04-01,37.70,-0.031859,-4.05
4,9.18,218721.0,135470.0,2018-05-01,35.36,-0.062203,-8.21
...,...,...,...,...,...,...,...
69,4.36,462256.0,76719.0,2023-10-01,43.85,0.004673,5.50
70,43.82,669226.0,133.0,2023-11-01,44.46,0.013846,6.55
71,73.02,1586842.0,77946.0,2023-12-01,44.33,-0.002959,7.22
72,,,,2024-01-01,44.18,-0.003314,7.43


In [None]:
test_data

Unnamed: 0,Monthly Production (lakh tonnes),Sugar Export (metric tonnes),Sugar Import (metric tonnes),Date,Sugar Price,Price Change,CPI
74,,,,2024-03-01,42.21,0.002753,7.25
75,,,,2024-04-01,43.08,0.020522,6.02
76,,,,2024-05-01,43.56,0.011237,5.7
77,,,,2024-06-01,43.08,-0.01101,5.83
78,,,,2024-07-01,42.7,-0.009028,5.22
79,,,,2024-08-01,43.27,0.013378,4.7
80,,,,2024-09-01,42.68,-0.013653,3.46
81,,,,2024-10-01,42.88,0.004886,
82,,,,2024-11-01,41.78,-0.025806,


Arima Workflow:
1. Run Auto Arima, get a good model through optmization
2. Further use this good model and run optmization manually using for loops for the variables in the vicinity of pdq & PDQ values, Get a better model
3. Use the better model and further fine tune paramters and get the best SARIMA model
4. Add exogenous variables and get the best SARIMAX model.

In [24]:
import pmdarima as pm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import numpy as np

# Fit auto ARIMA model on the training data
model = pm.auto_arima(train_data['Sugar Price'], seasonal=True, m=12, trace=True, error_action='ignore', suppress_warnings=True)

# Print model summary
print(model.summary())

# Forecast the sugar prices for the test period
forecast, conf_int = model.predict(n_periods=len(test_data), return_conf_int=True)

# Add the forecasted values to the test data
test_data['Forecasted Sugar Price'] = forecast

# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(test_data['Sugar Price'], test_data['Forecasted Sugar Price']))
r2 = r2_score(test_data['Sugar Price'], test_data['Forecasted Sugar Price'])
mae = mean_absolute_error(test_data['Sugar Price'], test_data['Forecasted Sugar Price'])
mape = np.mean(np.abs((test_data['Sugar Price'] - test_data['Forecasted Sugar Price']) / test_data['Sugar Price'])) * 100
mad = np.mean(np.abs(test_data['Sugar Price'] - test_data['Forecasted Sugar Price']))

print(f'RMSE: {rmse}')
print(f'R-squared: {r2}')
print(f'MAE: {mae}')
print(f'MAPE: {mape}')
print(f'MAD: {mad}')

# Plot the actual vs forecasted sugar prices
plt.figure(figsize=(12, 6))
plt.plot(train_data['Date'], train_data['Sugar Price'], label='Train')
plt.plot(test_data['Date'], test_data['Sugar Price'], label='Test')
plt.plot(test_data['Date'], test_data['Forecasted Sugar Price'], label='Forecast')
plt.fill_between(test_data['Date'], conf_int[:, 0], conf_int[:, 1], color='pink', alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Sugar Price')
plt.title('Actual vs Forecasted Sugar Prices')
plt.legend()
plt.show()

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [25]:
# Define the exogenous variables
exog_train = train_data[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']]
exog_test = test_data[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']]


# Fit auto SARIMAX model on the training data with exogenous variables
auto_sarimax_model = pm.auto_arima(train_data['Sugar Price'], exogenous=exog_train, seasonal=True, m=12, trace=True, error_action='ignore', suppress_warnings=True)

# Print model summary
print(auto_sarimax_model.summary())

# Forecast the sugar prices for the test period
auto_sarimax_forecast, auto_sarimax_conf_int = auto_sarimax_model.predict(n_periods=len(test_data), exogenous=exog_test, return_conf_int=True)

# Add the forecasted values to the test data
test_data['Auto SARIMAX Forecasted Sugar Price'] = auto_sarimax_forecast

# Calculate evaluation metrics
auto_sarimax_rmse = np.sqrt(mean_squared_error(test_data['Sugar Price'], test_data['Auto SARIMAX Forecasted Sugar Price']))
auto_sarimax_r2 = r2_score(test_data['Sugar Price'], test_data['Auto SARIMAX Forecasted Sugar Price'])
auto_sarimax_mae = mean_absolute_error(test_data['Sugar Price'], test_data['Auto SARIMAX Forecasted Sugar Price'])
auto_sarimax_mape = np.mean(np.abs((test_data['Sugar Price'] - test_data['Auto SARIMAX Forecasted Sugar Price']) / test_data['Sugar Price'])) * 100
auto_sarimax_mad = np.mean(np.abs(test_data['Sugar Price'] - test_data['Auto SARIMAX Forecasted Sugar Price']))

print(f'Auto SARIMAX RMSE: {auto_sarimax_rmse}')
print(f'Auto SARIMAX R-squared: {auto_sarimax_r2}')
print(f'Auto SARIMAX MAE: {auto_sarimax_mae}')
print(f'Auto SARIMAX MAPE: {auto_sarimax_mape}')
print(f'Auto SARIMAX MAD: {auto_sarimax_mad}')

# Plot the actual vs forecasted sugar prices
plt.figure(figsize=(12, 6))
plt.plot(train_data['Date'], train_data['Sugar Price'], label='Train')
plt.plot(test_data['Date'], test_data['Sugar Price'], label='Test')
plt.plot(test_data['Date'], test_data['Auto SARIMAX Forecasted Sugar Price'], label='Auto SARIMAX Forecast')
plt.fill_between(test_data['Date'], auto_sarimax_conf_int[:, 0], auto_sarimax_conf_int[:, 1], color='pink', alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Sugar Price')
plt.title('Actual vs Auto SARIMAX Forecasted Sugar Prices')
plt.legend()
plt.show()

NameError: name 'train_data' is not defined

In [26]:
import itertools
import pmdarima as pm
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Define the ranges for the parameters
p = [0, 1, 2]
d = [0, 1]
q = [0, 1, 2]
P = [0, 1]
D = [0, 1]
Q = [0, 1]

# Define the exogenous variables
exog_train = train_data[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']]
exog_test = test_data[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']]

# Initialize a list to store the results
results = []

# Loop through all combinations of the parameters
for (p_, d_, q_, P_, D_, Q_) in itertools.product(p, d, q, P, D, Q):
    try:
        # Fit the SARIMAX model
        model = pm.ARIMA(order=(p_, d_, q_), seasonal_order=(P_, D_, Q_, 12), suppress_warnings=True)
        model.fit(train_data['Sugar Price'], exogenous=exog_train)
        
        # Forecast the sugar prices for the test period
        forecast, conf_int = model.predict(n_periods=len(test_data), exogenous=exog_test, return_conf_int=True)
        
        # Calculate evaluation metrics
        rmse = np.sqrt(mean_squared_error(test_data['Sugar Price'], forecast))
        r2 = r2_score(test_data['Sugar Price'], forecast)
        mae = mean_absolute_error(test_data['Sugar Price'], forecast)
        mape = np.mean(np.abs((test_data['Sugar Price'] - forecast) / test_data['Sugar Price'])) * 100
        mad = np.mean(np.abs(test_data['Sugar Price'] - forecast))
        
        # Store the results
        results.append({
            'order': (p_, d_, q_),
            'seasonal_order': (P_, D_, Q_),
            'rmse': rmse,
            'r2': r2,
            'mae': mae,
            'mape': mape,
            'mad': mad
        })
    except Exception as e:
        print(f"An error occurred for parameters (p,d,q,P,D,Q) = ({p_},{d_},{q_},{P_},{D_},{Q_}): {e}")

# Convert results to a DataFrame
results_df = pd.DataFrame(results)
results_df.sort_values(by='rmse', ascending=True, inplace=True)
print(results_df)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [27]:
# Extract the best model parameters
best_model_params = results_df.iloc[0]
best_order = tuple(best_model_params['order'])
best_seasonal_order = tuple(best_model_params['seasonal_order']) + (12,)

# Fit the best SARIMAX model on the training data with exogenous variables
best_sarimax_model = pm.ARIMA(order=best_order, seasonal_order=best_seasonal_order, suppress_warnings=True)
best_sarimax_model.fit(train_data['Sugar Price'], exogenous=exog_train)

# Forecast the sugar prices for the test period using the best model
best_sarimax_forecast, best_sarimax_conf_int = best_sarimax_model.predict(n_periods=len(test_data), exogenous=exog_test, return_conf_int=True)

# Add the forecasted values to the test data
test_data['Best SARIMAX Forecasted Sugar Price'] = best_sarimax_forecast

# Plot the actual vs forecasted sugar prices using the best model
plt.figure(figsize=(12, 6))
plt.plot(train_data['Date'], train_data['Sugar Price'], label='Train')
plt.plot(test_data['Date'], test_data['Sugar Price'], label='Test')
plt.plot(test_data['Date'], test_data['Best SARIMAX Forecasted Sugar Price'], label='Best SARIMAX Forecast')
plt.fill_between(test_data['Date'], best_sarimax_conf_int[:, 0], best_sarimax_conf_int[:, 1], color='pink', alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Sugar Price')
plt.title('Actual vs Best SARIMAX Forecasted Sugar Prices')
plt.legend()
plt.show()

NameError: name 'results_df' is not defined

In [28]:
# Define the order and seasonal order
order = (0, 1, 2)
seasonal_order = (2, 0, 2, 12)

# Fit the SARIMAX model on the training data with exogenous variables
sarimax_model = pm.ARIMA(order=order, seasonal_order=seasonal_order, suppress_warnings=True)
sarimax_model.fit(train_data['Sugar Price'], exogenous=exog_train)

# Forecast the sugar prices for the test period using the specified model
sarimax_forecast, sarimax_conf_int = sarimax_model.predict(n_periods=len(test_data), exogenous=False, return_conf_int=True)

# Add the forecasted values to the test data
test_data['SARIMAX Forecasted Sugar Price'] = sarimax_forecast

# Calculate evaluation metrics
sarimax_rmse = np.sqrt(mean_squared_error(test_data['Sugar Price'], test_data['SARIMAX Forecasted Sugar Price']))
sarimax_r2 = r2_score(test_data['Sugar Price'], test_data['SARIMAX Forecasted Sugar Price'])
sarimax_mae = mean_absolute_error(test_data['Sugar Price'], test_data['SARIMAX Forecasted Sugar Price'])
sarimax_mape = np.mean(np.abs((test_data['Sugar Price'] - test_data['SARIMAX Forecasted Sugar Price']) / test_data['Sugar Price'])) * 100
sarimax_mad = np.mean(np.abs(test_data['Sugar Price'] - test_data['SARIMAX Forecasted Sugar Price']))

print(f'SARIMAX RMSE: {sarimax_rmse}')
print(f'SARIMAX R-squared: {sarimax_r2}')
print(f'SARIMAX MAE: {sarimax_mae}')
print(f'SARIMAX MAPE: {sarimax_mape}')
print(f'SARIMAX MAD: {sarimax_mad}')

# Plot the actual vs forecasted sugar prices using the specified model
plt.figure(figsize=(12, 6))
plt.plot(train_data['Date'], train_data['Sugar Price'], label='Train')
plt.plot(test_data['Date'], test_data['Sugar Price'], label='Test')
plt.plot(test_data['Date'], test_data['SARIMAX Forecasted Sugar Price'], label='SARIMAX Forecast')
plt.fill_between(test_data['Date'], sarimax_conf_int[:, 0], sarimax_conf_int[:, 1], color='pink', alpha=0.3)
plt.xlabel('Date')
plt.ylabel('Sugar Price')
plt.title('Actual vs SARIMAX Forecasted Sugar Prices')
plt.legend()
plt.show()

NameError: name 'pm' is not defined

## Regression

In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Drop rows with missing values in the training and test data
train_data_clean = train_data.dropna(subset=['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI', 'Sugar Price'])
test_data_clean = test_data.dropna(subset=['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI', 'Sugar Price'])

# Define the regression model
regression_model = LinearRegression()

# Fit the model on the training data
regression_model.fit(train_data_clean[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']], train_data_clean['Sugar Price'])

# Predict on the training data
train_predictions = regression_model.predict(train_data_clean[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']])

# Predict on the test data
test_predictions = regression_model.predict(test_data_clean[['Monthly Production (lakh tonnes)', 'Sugar Export (metric tonnes)', 'Sugar Import (metric tonnes)', 'CPI']])

# Calculate evaluation metrics for the training data
train_rmse = np.sqrt(mean_squared_error(train_data_clean['Sugar Price'], train_predictions))
train_r2 = r2_score(train_data_clean['Sugar Price'], train_predictions)
train_mae = mean_absolute_error(train_data_clean['Sugar Price'], train_predictions)
train_mape = np.mean(np.abs((train_data_clean['Sugar Price'] - train_predictions) / train_data_clean['Sugar Price'])) * 100

print(f'Training RMSE: {train_rmse}')
print(f'Training R-squared: {train_r2}')
print(f'Training MAE: {train_mae}')
print(f'Training MAPE: {train_mape}')

# Calculate evaluation metrics for the test data
test_rmse = np.sqrt(mean_squared_error(test_data_clean['Sugar Price'], test_predictions))
test_r2 = r2_score(test_data_clean['Sugar Price'], test_predictions)
test_mae = mean_absolute_error(test_data_clean['Sugar Price'], test_predictions)
test_mape = np.mean(np.abs((test_data_clean['Sugar Price'] - test_predictions) / test_data_clean['Sugar Price'])) * 100

print(f'Test RMSE: {test_rmse}')
print(f'Test R-squared: {test_r2}')
print(f'Test MAE: {test_mae}')
print(f'Test MAPE: {test_mape}')

# Add the predictions to the test data
test_data_clean['Regression Forecasted Sugar Price'] = test_predictions

# Plot the actual vs forecasted sugar prices using the regression model
plt.figure(figsize=(12, 6))
plt.plot(train_data_clean['Date'], train_data_clean['Sugar Price'], label='Train')
plt.plot(test_data_clean['Date'], test_data_clean['Sugar Price'], label='Test')
plt.plot(test_data_clean['Date'], test_data_clean['Regression Forecasted Sugar Price'], label='Regression Forecast')
plt.xlabel('Date')
plt.ylabel('Sugar Price')
plt.title('Actual vs Regression Forecasted Sugar Prices')
plt.legend()
plt.show()

NameError: name 'train_data' is not defined

In [38]:
import pickle


pickle.dump(sarimax_model, open('./short-term-forecasting-sarimax-model.pkl', 'wb'))
pickle.dump(sarimax_model, open('./long-term-forecasting-sarimax-model.pkl', 'wb'))