In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import datetime

In [22]:
# Load the data from gas.csv
df = pd.read_csv('Reg_Gas.csv')

In [23]:
df

Unnamed: 0,Location,Month,Prices
0,"St. John's, Newfoundland and Labrador",Jan-90,57.7
1,"St. John's, Newfoundland and Labrador",Feb-90,58.2
2,"St. John's, Newfoundland and Labrador",Mar-90,58.5
3,"St. John's, Newfoundland and Labrador",Apr-90,59.3
4,"St. John's, Newfoundland and Labrador",May-90,59.8
...,...,...,...
7410,"Yellowknife, Northwest Territories",Nov-23,168.7
7411,"Yellowknife, Northwest Territories",Dec-23,163.4
7412,"Yellowknife, Northwest Territories",Jan-24,157.4
7413,"Yellowknife, Northwest Territories",Feb-24,157


In [24]:
# Convert 'Month' column to datetime format
df['Month'] = pd.to_datetime(df['Month'], format='%b-%y')


In [25]:
# Extract year and month as separate numerical features
df['Year'] = df['Month'].dt.year
df['Month_Num'] = df['Month'].dt.month

In [26]:
# Clean the data by removing rows with non-numeric values in 'prices' column
df = df[pd.to_numeric(df['Prices'], errors='coerce').notnull()]


In [27]:
# Convert 'prices' column to float
df['Prices'] = df['Prices'].astype(float)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Prices'] = df['Prices'].astype(float)


In [28]:
# Group the data by 'Location' for predictions
groups = df.groupby('Location')
df

Unnamed: 0,Location,Month,Prices,Year,Month_Num
0,"St. John's, Newfoundland and Labrador",1990-01-01,57.7,1990.0,1.0
1,"St. John's, Newfoundland and Labrador",1990-02-01,58.2,1990.0,2.0
2,"St. John's, Newfoundland and Labrador",1990-03-01,58.5,1990.0,3.0
3,"St. John's, Newfoundland and Labrador",1990-04-01,59.3,1990.0,4.0
4,"St. John's, Newfoundland and Labrador",1990-05-01,59.8,1990.0,5.0
...,...,...,...,...,...
7410,"Yellowknife, Northwest Territories",2023-11-01,168.7,2023.0,11.0
7411,"Yellowknife, Northwest Territories",2023-12-01,163.4,2023.0,12.0
7412,"Yellowknife, Northwest Territories",2024-01-01,157.4,2024.0,1.0
7413,"Yellowknife, Northwest Territories",2024-02-01,157.0,2024.0,2.0


In [29]:
# Initialize an empty DataFrame to store the predictions
predicted_df = pd.DataFrame()

In [30]:
# Iterate through each group and make predictions
for name, group in groups:
    # Splitting the data into features (X) and target variable (y)
    X = group[['Year', 'Month_Num']]
    y = group['Prices']

    # Training a linear regression model
    model = LinearRegression()
    model.fit(X, y)

    # Create a DataFrame for future dates for this location
    future_dates = pd.date_range(start=group['Month'].max(), periods=60, freq='M') + datetime.timedelta(days=30)
    future_df = pd.DataFrame({'Month': future_dates})
    future_df['Year'] = future_df['Month'].dt.year
    future_df['Month_Num'] = future_df['Month'].dt.month

    # Make predictions for this location
    future_predictions = model.predict(future_df[['Year', 'Month_Num']])

    # Add predictions to the predicted DataFrame
    future_df['Predicted Prices'] = future_predictions
    future_df['Location'] = name

    # Append the predictions for this location to the overall predicted DataFrame
    predicted_df = pd.concat([predicted_df, future_df])


In [31]:
# Round predicted prices to 1 decimal place
predicted_df['Predicted Prices'] = predicted_df['Predicted Prices'].round(1)

# Save predicted DataFrame to a CSV file
predicted_df.to_csv('Predicted_Reg_Gas_Prices.csv', index=False)

In [32]:
predicted_df

Unnamed: 0,Month,Year,Month_Num,Predicted Prices,Location
0,2024-04-30,2024,4,133.0,"Calgary, Alberta"
1,2024-05-30,2024,5,133.3,"Calgary, Alberta"
2,2024-06-30,2024,6,133.6,"Calgary, Alberta"
3,2024-07-30,2024,7,134.0,"Calgary, Alberta"
4,2024-08-30,2024,8,134.3,"Calgary, Alberta"
...,...,...,...,...,...
55,2028-11-30,2028,11,170.6,"Yellowknife, Northwest Territories"
56,2028-12-30,2028,12,171.2,"Yellowknife, Northwest Territories"
57,2029-01-30,2029,1,168.0,"Yellowknife, Northwest Territories"
58,2029-03-02,2029,3,169.1,"Yellowknife, Northwest Territories"


In [33]:
# Reset the index of the predicted DataFrame
predicted_df.reset_index(drop=True, inplace=True)


In [34]:
import hvplot.pandas
# Plotting using hvplot
plot = predicted_df.hvplot.line(x='Month', y='Predicted Prices', by='Location', xlabel='Month', ylabel='Price (Cents per Liter)', title='Regular Gas Prices Prediction', hover_cols=['Location'])

# Show the plot
plot