In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem so files under it can be read by path
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [None]:
import pandas as pd

# Path of the Excel workbook on the mounted Drive (replace with the filename you uploaded)
excel_path = "/content/drive/MyDrive/Hackathon/Didul - Random forest/updated_with_prev_production(Maha).xlsx"

# Read the workbook into the DataFrame used by every later cell
df = pd.read_excel(excel_path)

# Preview the first rows as the cell output
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Hackathon/Didul - Random forest/updated_with_prev_production(Maha).xlsx'

In [None]:
import numpy as np

# Step 1: Clean the 'Year' column (e.g., "1978/1979" → 1979), handling NaNs
def clean_year(value):
    """Return the last year of a season label as an int, or NaN if it cannot be parsed.

    Parameters
    ----------
    value : str | int | float | None
        A raw cell such as "1978/1979", "1979", 1979 or 1979.0.

    Returns
    -------
    int | float
        The part after the final '/' converted to int (the whole value when
        there is no '/'); ``np.nan`` for missing or unparseable input.
    """
    if pd.isna(value):
        return np.nan

    # Keep only the text after the last '/', i.e. the season's ending year.
    last_part = str(value).split('/')[-1].strip()

    try:
        return int(last_part)
    except ValueError:
        pass

    # Numeric cells read as floats (e.g. 1979.0) stringify to "1979.0", which
    # int() rejects; accept them when they represent a whole number.
    try:
        as_float = float(last_part)
    except ValueError:
        return np.nan
    if not np.isfinite(as_float) or not as_float.is_integer():
        return np.nan
    return int(as_float)

# Element-wise conversion of every raw 'Year' cell into its cleaned value
df['Year_cleaned'] = df['Year'].map(clean_year)

In [None]:
# Normalise 'Year' to a numeric value holding the LAST year of the season
# (e.g. "1978/1979" → 1979). 'Year' is used as a model feature below and in the
# later prediction/evaluation cells, so it must be numeric, not a "YYYY/YYYY" string.
# The expression is idempotent: re-running the cell on already-numeric values
# yields the same numbers; unparseable cells become NaN and are dropped later.
df['Year'] = pd.to_numeric(
    df['Year'].astype(str).str.split('/').str[-1].str.strip(),
    errors='coerce',
)

# Encode categorical columns.
# The fitted encoders are kept (instead of being discarded) so the exact
# label → integer mapping is available afterwards through `.classes_`.
season_encoder = LabelEncoder()
district_encoder = LabelEncoder()
df['Season_encoded'] = season_encoder.fit_transform(df['Season'])
df['District_encoded'] = district_encoder.fit_transform(df['District'])

# Define features and target
features = ['Year', 'Season_encoded', 'District_encoded', 'Sown(hect)', 'Previous_Production']
target = 'Total production(mt.)'

In [None]:
# Keep only rows where every model input and the target are present
required_columns = [*features, target]
df_clean = df.dropna(subset=required_columns)

# Separate the predictors from the value being predicted
X, y = df_clean[features], df_clean[target]

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train model
!pip install lightgbm catboost

from catboost import CatBoostRegressor

model_extent = CatBoostRegressor(
    iterations=20000,
    depth=12,
    learning_rate=0.2,
    l2_leaf_reg=12,
    bagging_temperature=1,
    early_stopping_rounds=100,
    verbose=100,
    random_state=42
)

model_extent.fit(X_train, y_train)

In [None]:
import joblib
from google.colab import files

# Serialize the trained regressor to a pickle file in the current working directory
model_filename = "model_production(Maha).pkl"
joblib.dump(model_extent, model_filename)

# Ask Colab to send that file to the browser as a download
files.download(model_filename)

In [None]:
import joblib
import pandas as pd
from google.colab import files

# Load the saved production model from the mounted Drive.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code —
# only load model files you created yourself.
model_path = "/content/drive/MyDrive/Hackathon/final_model/model_production(Maha).pkl"
try:
    model_production = joblib.load(model_path)
except FileNotFoundError as err:
    # Fail here with a clear message instead of continuing with an undefined
    # (or stale, from an earlier run) `model_production`.
    raise FileNotFoundError(
        f"model_production(Maha).pkl not found at '{model_path}'. "
        "Please ensure the model is saved and accessible."
    ) from err

# Static mappings (adjust if needed)
season_map = {'Maha': 0, 'Yala': 1}

# LabelEncoder assigns integer codes in sorted label order, so the lookup list
# must be sorted too; `unique()` alone returns labels in order of appearance,
# which would make `district_list.index(...)` disagree with 'District_encoded'.
district_list = sorted(df['District'].dropna().unique().tolist())

def predict_production1(year, season, district, sown_hect, previous_production):
    """Predict total production for one district/season with `model_production`.

    Parameters
    ----------
    year : int | str
        A single year (e.g. 2024) or a season label such as "2023/2024";
        for a label, the part after the last '/' is used.
    season : str
        A key of `season_map` (matched case-insensitively).
    district : str
        District name; it is upper-cased before being looked up in `district_list`.
    sown_hect : float
        Value for the 'Sown(hect)' feature.
    previous_production : float
        Value for the 'Previous_Production' feature.

    Returns
    -------
    float
        The model's prediction rounded to 2 decimals.

    Raises
    ------
    ValueError
        If `season` is not a key of `season_map` or `district` is not in
        `district_list`.
    """
    feature_columns = ['Year', 'Season_encoded', 'District_encoded', 'Sown(hect)', 'Previous_Production']

    # Encode the season. An unknown season is rejected rather than silently
    # treated as code 0, mirroring how an unknown district is handled below.
    season_lookup = {name.lower(): code for name, code in season_map.items()}
    season_key = str(season).strip().lower()
    if season_key not in season_lookup:
        raise ValueError(
            f"Season '{season}' not recognised. Expected one of {list(season_map)}."
        )
    season_encoded = season_lookup[season_key]

    # Encode the district as its position in district_list
    try:
        district_encoded = district_list.index(str(district).strip().upper())
    except ValueError:
        raise ValueError(f"District '{district}' not found in the dataset.") from None

    # Use the last year of a "YYYY/YYYY" label; otherwise treat the value as a single year
    if isinstance(year, str) and "/" in year:
        last_year = int(year.split("/")[-1])
    else:
        last_year = int(year)

    # Single-row frame whose column names match the feature names used for prediction
    input_df = pd.DataFrame([{
        'Year': last_year,
        'Season_encoded': season_encoded,
        'District_encoded': district_encoded,
        'Sown(hect)': sown_hect,
        'Previous_Production': previous_production
    }])

    # Select the columns explicitly so their order is fixed
    predicted_production = model_production.predict(input_df[feature_columns])[0]

    return round(float(predicted_production), 2)

In [None]:
# Example prediction. The season label must be a string: the bare expression
# 2023/2024 is a division (≈ 0.9995) that int() would truncate to year 0.
production = predict_production1(
    year="2023/2024",
    season='Maha',
    district='VAVUNIYA',
    sown_hect=23412,
    previous_production=2861
)

# The model's target is 'Total production(mt.)', so report metric tons
print(f"Predicted Production: {production} metric tons")

In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Step 1: Predict production for every row and keep it alongside the data
feature_columns = ['Year', 'Season_encoded', 'District_encoded', 'Sown(hect)', 'Previous_Production']
df['Predicted_Production'] = model_production.predict(df[feature_columns])

# Step 2: Score ONLY the held-out test split. Scoring all of `df` would include
# the rows used for training and make the metrics look better than they are.
# (X_test / y_test were built from rows with no missing feature or target.)
y_true = y_test
y_pred = pd.Series(model_production.predict(X_test), index=X_test.index)

# Step 3: Evaluate predictions
mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print(f"Evaluation on Cleaned Test Set:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Scatter of predicted against actual values on a 10x6 inch figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_true, y_pred, alpha=0.7)

# Dashed red diagonal across the range of actual values: points on it are perfect predictions
actual_min, actual_max = y_true.min(), y_true.max()
ax.plot([actual_min, actual_max], [actual_min, actual_max], 'r--')  # perfect line

ax.set_xlabel("Actual Production (Metric Tons)")
ax.set_ylabel("Predicted Production (Metric Tons)")
ax.set_title("Predicted vs. Actual Production")
ax.grid(True)
plt.show()