<a href="https://colab.research.google.com/github/Divya110205/Forecasting-Prices-of-Agricultural-Commodities-Using-Machine-Learning/blob/main/Price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the weather and commodity-price CSV files (paths point at the Colab working directory)
weather_data, price_data = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/weather_data.csv', '/content/price_data.csv')
)

In [None]:
# Show the freshly loaded weather table as the cell output for a quick visual check
weather_data

In [None]:
# Show the freshly loaded price table as the cell output for a quick visual check
price_data

In [None]:
# Parse the 'Date' column of both tables into datetimes.
# errors='coerce' turns unparseable entries into NaT instead of raising.
for frame in (weather_data, price_data):
    frame['Date'] = pd.to_datetime(frame['Date'], errors='coerce')

In [None]:
# Inspect the weather table again, now that 'Date' has been parsed
weather_data

In [None]:
# Inspect the price table again, now that 'Date' has been parsed
price_data

Unnamed: 0,State,Date,Rice,Wheat,Wheat Flour,Gram Dal,Arhar Dal,Urad Dal,Moong Dal,Masoor Dal,...,Vanaspati,Soya Oil,Sunflower Oil,Palm Oil,Jaggery,Tea,Salt,Potato,Onion,Tomato
0,Andhra Pradesh,2023-01-01,48.75,41,43,70,110.75,117,109,99.25,...,117.25,175,166.5,123.25,53,284.25,19.5,31.5,26.75,23.75
1,Arunachal pradesh,2023-01-01,38,40,40,90,130,130,130,110,...,140,180,175,138,85,240,18,35,40,40
2,Assam,2023-01-01,35,30.86,36.17,69.5,108.5,95.55,102,92.58,...,143.67,158.08,177.3,120.2,51.92,237.09,11.33,17.75,31.42,37.83
3,Bihar,2023-01-01,36.39,29.36,35.57,67.57,109.32,106.71,101.07,86.68,...,135.22,152.39,185.57,119.83,44.21,300.39,21.43,19.39,22.86,26.04
4,Chhattisgarh,2023-01-01,31.6,30.6,35.2,72.4,102,96.4,103.8,88.8,...,126.8,145,167,130.2,44.6,316.4,20.2,23,26.6,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11622,Telangana,2023-12-31,51.7,41.5,45,75.7,162,123.5,114.6,90,...,118.9,146,117.3,97.7,55.3,304,19.6,27.2,49.9,27.5
11623,Tripura,2023-12-31,39,,39.25,85,142.33,128,111.5,103.75,...,127.67,114.25,144.5,99.5,68.75,219.75,10.5,23,43.75,53.75
11624,Uttar Pradesh,2023-12-31,37.43,26.57,32.97,78.79,148.85,115.57,111.01,94.19,...,129.16,132.3,157.99,111.45,47.33,243.56,26.88,15.97,39.49,29.16
11625,Uttarakhand,2023-12-31,37,28,34.75,88.25,136,110.25,106.5,97.25,...,134.25,133.25,146.5,115.5,51.75,264.25,28.75,15.25,39.75,30.75


In [None]:
# Count missing values per column in the weather table
weather_data.isnull().sum()

In [None]:
# Treat cells that are empty or contain only whitespace as missing values
price_data = price_data.replace(to_replace=r'^\s*$', value=pd.NA, regex=True)
price_data

In [None]:
# Count missing values per column in the price table after blanks were mapped to NA
price_data.isnull().sum()

In [None]:
# Check the column dtypes before the numeric conversion below
price_data.dtypes

In [None]:
# Commodity columns to coerce to numbers. The names are kept exactly as listed
# here; several of them carry a trailing space.
price_columns = [
    'Rice', 'Wheat', 'Wheat Flour', 'Gram Dal', 'Arhar Dal', 'Urad Dal',
    'Moong Dal', 'Masoor Dal', 'Sugar', 'Milk ', 'Groundnut Oil ',
    'Mustard Oil ', 'Vanaspati', 'Soya Oil ', 'Sunflower Oil', 'Palm Oil ',
    'Jaggery', 'Tea ', 'Salt ', 'Potato', 'Onion', 'Tomato',
]

# Convert every commodity column in one pass; values that cannot be parsed become NaN
price_data[price_columns] = price_data[price_columns].apply(pd.to_numeric, errors='coerce')

# Report the resulting dtypes
print("\nData types after conversion:")
print(price_data.dtypes)

In [None]:
# Impute missing values in the price table.
# Only float64 columns are selected for imputation.
numeric_columns = price_data.select_dtypes(include=['float64']).columns

# Replace each missing price with its column mean, then round to 2 decimals
column_means = price_data[numeric_columns].mean()
price_data[numeric_columns] = price_data[numeric_columns].fillna(column_means).round(2)

# Forward-fill missing dates from the previous row.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection operates on a temporary under pandas Copy-on-Write and
# leaves price_data unchanged, and fillna(method=...) is itself deprecated.
price_data['Date'] = price_data['Date'].ffill()

In [None]:
# Re-count missing values per column to verify the imputation above
price_data.isnull().sum()

In [None]:
# Inspect the price table after imputation
price_data

In [None]:
# Attach the weather columns to every price row with the same 'Date'.
# Inner join: only dates present in both tables are kept.
merged_data = price_data.merge(weather_data, how='inner', on='Date')

In [None]:
# Inspect the merged price + weather table
merged_data

In [None]:
# Index the merged table by 'Date' so the plots and resampling below can use the time axis
merged_data = merged_data.set_index('Date')

In [None]:
# 1. Plot price trends over time for specific commodities
def plot_price_trends(commodities):
    """Draw one price line per commodity from ``merged_data`` on a shared axis.

    Parameters
    ----------
    commodities : iterable of str
        Column names of ``merged_data`` to plot against its index.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    for name in commodities:
        ax.plot(merged_data[name], label=name)
    ax.set_title('Price Trends Over Time')
    ax.set_xlabel('Date')
    ax.set_ylabel('Price')
    ax.legend()
    ax.grid()
    plt.show()

# Commodities shown in the trend chart
commodities_to_plot = ['Rice', 'Wheat', 'Urad Dal']
plot_price_trends(commodities_to_plot)

In [None]:
# 2. Analyze the seasonality and volatility of prices
def plot_seasonality(commodity):
    """Plot the month-end-binned average price of one commodity.

    Parameters
    ----------
    commodity : str
        Column of ``merged_data`` to average. ``merged_data`` must be indexed
        by a DatetimeIndex, because the series is resampled on its index.
    """
    # A MonthEnd offset object is used instead of the 'M' string alias: the
    # alias is deprecated since pandas 2.2 (renamed 'ME'), while the offset
    # object is accepted by both older and newer pandas versions.
    monthly_average = merged_data[commodity].resample(pd.offsets.MonthEnd()).mean()

    plt.figure(figsize=(12, 6))
    sns.lineplot(data=monthly_average, label='Monthly Average')
    plt.title(f'Seasonality of {commodity}')
    plt.xlabel('Date')
    plt.ylabel('Average Price')
    plt.grid()
    plt.show()

# Plot seasonality for a specific commodity
plot_seasonality('Rice')

In [None]:
def compare_commodity_prices(commodity):
    """Draw one box plot per state for the given commodity's prices.

    Parameters
    ----------
    commodity : str
        Column of ``merged_data`` to plot on the y-axis; the x-axis uses the
        'State' column.

    Notes
    -----
    The prices are plotted as they are in ``merged_data``; no outlier capping
    is applied, so the title no longer claims that it is.
    """
    plt.figure(figsize=(14, 7))
    sns.boxplot(x='State', y=commodity, data=merged_data)
    plt.title(f'Comparison of {commodity} Prices Across States')
    plt.xticks(rotation=45)  # state names are long; tilt them to avoid overlap
    plt.grid()
    plt.show()

# Compare Rice prices across states
compare_commodity_prices('Rice')

In [None]:
# Feature engineering
# Turn the Date index back into an ordinary column for processing
merged_data.reset_index(inplace=True)

# Convert the datetime to whole seconds since the Unix epoch.
# 'int64' is spelled out because plain `int` can map to a 32-bit type on some
# platforms, which cannot hold nanosecond timestamps.
merged_data['Date'] = merged_data['Date'].astype('int64') // 10**9

# One-hot encode the state. Every state keeps its own State_<name> column
# (drop_first=False): the dummies are used to look rows up by state, and
# dropping the first category would leave that state without a column, making
# it impossible to select by name.
merged_data = pd.get_dummies(merged_data, columns=['State'], drop_first=False)

In [None]:
# Step 1: Feature Selection
# Get commodity from user input
price_col = input("Enter the commodity you want to predict (e.g., Onion, Potato, Rice): ")

# Ensure the input is valid by checking if the commodity is in the merged dataset's columns
if price_col not in merged_data.columns:
    raise ValueError(f"'{price_col}' is not a valid commodity. Please check the dataset.")

# Make sure 'Date' is a datetime column before extracting the month.
if not pd.api.types.is_datetime64_any_dtype(merged_data['Date']):
    if pd.api.types.is_numeric_dtype(merged_data['Date']):
        # Numeric dates are epoch seconds here. pd.to_datetime interprets bare
        # integers as nanoseconds unless told otherwise, which would collapse
        # every date into January 1970 and make 'Month' constant.
        merged_data['Date'] = pd.to_datetime(merged_data['Date'], unit='s', errors='coerce')
    else:
        merged_data['Date'] = pd.to_datetime(merged_data['Date'], errors='coerce')

# Adding month as a feature
merged_data['Month'] = merged_data['Date'].dt.month  # Extract month

# Define weather features
features = ['temp at 2m', 'relativehumid at 2', 'precipitation']  # Weather features

# Output the selected commodity and features for confirmation
print(f"Selected commodity: {price_col}")
print(f"Features selected: {features + ['Month']}")


In [None]:
# Step 2: Data Preparation
# Create lagged features for historical prices.
#
# The table holds many states per date, so a plain row-wise shift would take
# the "previous" price from a different state on the same date. The lags are
# therefore computed within each state, identified from its one-hot columns.
# NOTE(review): assumes rows are in chronological order within each state and
# that there is one row per state and date - confirm against the merged data.
state_columns = [col for col in merged_data.columns if str(col).startswith('State_')]
if state_columns:
    state_flags = merged_data[state_columns].to_numpy(dtype=bool)
    # Key 0 = rows with no dummy set (a dropped baseline state, if any);
    # key k = rows whose k-th dummy column is set.
    state_key = np.where(state_flags.any(axis=1), state_flags.argmax(axis=1) + 1, 0)
    price_history = merged_data[price_col].groupby(state_key)
else:
    # No state information available: fall back to a plain row-wise shift
    price_history = merged_data[price_col]

for lag in range(1, 4):  # lags of 1, 2 and 3 observations
    merged_data[f'{price_col}_lag_{lag}'] = price_history.shift(lag)

# Drop rows with NaN values generated from lagging
cleaned_data = merged_data.dropna()

In [None]:
# Step 3: Split the Data
# Predictors: weather features, the three price lags and the month; target: the chosen price
lag_columns = [f'{price_col}_lag_{k}' for k in range(1, 4)]
predictor_columns = [*features, *lag_columns, 'Month']
X = cleaned_data[predictor_columns]
y = cleaned_data[price_col]

# Hold out half of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.5)

In [None]:
# Step 4: Model Selection
# Random forest regressor with 100 trees; random_state is fixed for repeatable results.
model = RandomForestRegressor(n_estimators=100, random_state=42)

In [None]:
# Step 5: Training the Model
# Fit the regressor on the training half of the split.
model.fit(X_train, y_train)

In [None]:
# Step 6: Model Evaluation
# Predict the held-out rows and summarise the error in price units
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

for metric_name, metric_value in (('RMSE', rmse), ('MAE', mae)):
    print(f'{metric_name}: {metric_value}')


In [None]:
# Feature importance
importance = model.feature_importances_

# Names in the same order as the predictor columns used for training
lag_feature_names = [f'{price_col}_lag_{k}' for k in range(1, 4)]
features_names = [*features, *lag_feature_names, 'Month']

# Pair each feature with its importance, most important first
feature_importance_df = (
    pd.DataFrame({'Feature': features_names, 'Importance': importance})
    .sort_values(by='Importance', ascending=False)
)

# Plot feature importance as horizontal bars
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df, ax=ax)
ax.set_title('Feature Importance')
plt.show()


In [None]:
# Plot actual vs predicted prices
# The test target is re-indexed 0..n-1 so both lines share the same sample axis
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(y_test.reset_index(drop=True), color='blue', label='Actual Prices')
ax.plot(y_pred, color='orange', label='Predicted Prices')
ax.set_title(f'Actual vs Predicted Prices for {price_col}')
ax.set_xlabel('Sample')
ax.set_ylabel('Price')
ax.legend()
ax.grid()
plt.show()


In [None]:
# Step 1: Get User Input
state = input("Enter the state (e.g., Andhra Pradesh, Maharashtra): ")
price_col = input("Enter the commodity you want to predict (e.g., Onion, Potato, Rice): ")
future_date = input("Enter the future date (YYYY-MM-DD): ")

# Convert future_date to datetime
future_date = pd.to_datetime(future_date, errors='coerce')
if pd.isna(future_date):
    raise ValueError("Invalid date format. Please enter a date in YYYY-MM-DD format.")

# The fitted model only knows the lag columns of the commodity it was trained
# on. Predicting another commodity would silently feed it all-zero lags, so
# reject the request instead.
lag_columns = [f'{price_col}_lag_{lag}' for lag in range(1, 4)]
if any(col not in X_train.columns for col in lag_columns):
    raise ValueError(
        f"The model was not trained for '{price_col}'. "
        "Re-run the training cells with this commodity before predicting."
    )

# Locate the one-hot column of the selected state.
# NOTE: if the dummies were built with drop_first=True, the dropped baseline
# state has no column and is rejected here as well.
state_column_name = f'State_{state}'
if state_column_name not in cleaned_data.columns:
    raise ValueError(f"State '{state}' is not present in the dataset.")

# Step 2: Build the single-row feature frame
# Fetch the last prices for the selected state and commodity
last_prices = cleaned_data[(cleaned_data['Date'] < future_date) &
                           (cleaned_data[state_column_name] == 1)][[price_col]].tail(3)

if len(last_prices) < 3:
    raise ValueError(f"Not enough historical data available for {state} and {price_col}.")

feature_row = {'Date': future_date, 'Month': future_date.month}

# One-hot encoding: 1 for the selected state, 0 for every other state column
for col in cleaned_data.columns:
    if str(col).startswith('State_'):
        feature_row[col] = int(col == state_column_name)

# Lagged prices: lag_1 is the most recent observation, lag_3 the oldest.
# tail(3) is ordered oldest -> newest, so it is reversed before numbering.
recent_first = last_prices[price_col].iloc[::-1].to_numpy()
for lag, price in enumerate(recent_first, start=1):
    feature_row[f'{price_col}_lag_{lag}'] = price

# Remaining model inputs (the weather features) are unknown for a future date.
# They are approximated by their historical mean for the same calendar month,
# falling back to the overall mean when that month has no history. Filling
# them with 0 would feed the model an implausible reading for every one.
same_month = cleaned_data[cleaned_data['Month'] == future_date.month]
reference_rows = cleaned_data if same_month.empty else same_month
for col in X_train.columns:
    if col not in feature_row:
        feature_row[col] = reference_rows[col].mean()

# Align columns (and their order) with the model's training data
input_data = pd.DataFrame([feature_row]).reindex(columns=X_train.columns)

# Step 3: Make the Prediction
predicted_price = model.predict(input_data)

print(f"The predicted price for {price_col} in {state} on {future_date.date()} is: {predicted_price[0]:.2f}")



In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Tolerance (in price units) within which a prediction counts as a hit
threshold = 5  # Adjust based on your tolerance for price difference

# Turn the regression output into two binary label vectors:
#   within tolerance          -> true label 1, predicted label 1
#   overestimate > threshold  -> true label 0, predicted label 1
#   anything else             -> true label 0, predicted label 0
# A (true 1, predicted 0) pair is never produced, so recall is 1.0 whenever at
# least one prediction falls within the tolerance.
actual_values = np.asarray(y_test)
predicted_values = np.asarray(y_pred)

within_tolerance = np.abs(actual_values - predicted_values) <= threshold
overestimated = ~within_tolerance & (predicted_values > actual_values + threshold)

y_test_classes = within_tolerance.astype(int).tolist()
y_pred_classes = (within_tolerance | overestimated).astype(int).tolist()

# Calculate metrics
accuracy = accuracy_score(y_test_classes, y_pred_classes)
precision = precision_score(y_test_classes, y_pred_classes)
recall = recall_score(y_test_classes, y_pred_classes)

# Print metrics
for score_name, score in (('Accuracy', accuracy), ('Precision', precision), ('Recall', recall)):
    print(f'{score_name}: {score:.2f}')


In [None]:
# Metric names and their values, in matching order
metrics = ['Accuracy', 'Precision', 'Recall']
values = [accuracy, precision, recall]

# Bar chart of the three scores on a fixed 0-1 scale, with a dashed reference line at 0.5
fig, ax = plt.subplots(figsize=(8, 5))
ax.bar(metrics, values, color=['blue', 'orange', 'green'])
ax.set_ylim(0, 1)
ax.set_ylabel('Score')
ax.set_title('Model Performance Metrics')
ax.axhline(y=0.5, color='red', linestyle='--', label='Baseline (0.5)')
ax.legend()
plt.show()