In [1]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
import xgboost as xgb
!pip install lightgbm catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [2]:
# Upload your cleaned Excel file here (same as you downloaded before)
from google.colab import files
uploaded = files.upload()

# Load it into a DataFrame
df = pd.read_excel(next(iter(uploaded)))
df.head()

Saving cleaned_rice_production_for_colab.xlsx to cleaned_rice_production_for_colab.xlsx


Unnamed: 0,Year,District,Sown(hect),Extent Harvested(hect),Yield(kg per hect),Total production(mt.),Season,Season_encoded,District_encoded
0,1979,COLOMBO,3634.0,3550,2026,6100.0,Yala,1,4
1,1979,GAMPAHA,6890.0,6744,2228,12800.0,Yala,1,6
2,1979,KALUTARA,16690.0,16372,1690,25600.0,Yala,1,9
3,1979,KANDY,15371.0,15155,2795,33500.0,Yala,1,5
4,1979,MATALE,5941.0,5838,2707,14200.0,Yala,1,16


In [3]:
# Encode categorical variables
season_encoder = LabelEncoder()
district_encoder = LabelEncoder()
df['Season_encoded'] = season_encoder.fit_transform(df['Season'])
df['District_encoded'] = district_encoder.fit_transform(df['District'])

# Save mappings for reference
season_mapping = dict(zip(season_encoder.classes_, season_encoder.transform(season_encoder.classes_)))
district_mapping = dict(zip(district_encoder.classes_, district_encoder.transform(district_encoder.classes_)))

# Clean missing values
required_cols = ['Sown(hect)', 'Extent Harvested(hect)', 'Yield(kg per hect)']
df_clean = df.dropna(subset=['Year', 'Season_encoded', 'District_encoded'] + required_cols)

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define input features and targets
features = ['Year', 'Season_encoded', 'District_encoded', 'Sown(hect)']

def select_param(target_column):

    # 1. Create the base model
    rf = RandomForestClassifier(random_state=42)

    # 2. Define the parameter grid
    param_grid = {
        'n_estimators': [100, 200],          # number of trees
        'max_depth': [None, 10, 20],         # tree depth
        'min_samples_split': [2, 5],         # min samples to split an internal node
        'min_samples_leaf': [1, 2],          # min samples at a leaf node
        'max_features': ['sqrt', 'log2'],    # number of features to consider when looking for best split
    }

    # 3. Setup GridSearchCV
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=5,                     # 5-fold cross-validation
        scoring='accuracy',       # or use 'f1', 'roc_auc', etc.
        n_jobs=-1,                # use all processors
        verbose=2
    )

    X = df_clean[features]
    y = df_clean[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Fit to training data
    grid_search.fit(X_train, y_train)

    # 5. Evaluate on test data
    best_rf = grid_search.best_estimator_
    y_pred = best_rf.predict(X_test)

    print("Best Parameters:", grid_search.best_params_)
    print("Classification Report:\n", classification_report(y_test, y_pred))

select_param('Extent Harvested(hect)')
select_param('Yield(kg per hect)')

Fitting 5 folds for each of 48 candidates, totalling 240 fits




KeyboardInterrupt: 

In [5]:
from catboost import CatBoostRegressor

def train_model(target_column):
    X = df_clean[features]
    y = df_clean[target_column]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    model = CatBoostRegressor(
        iterations=100000,       # high upper limit
        early_stopping_rounds=100,  # stop if no improvement for 100 rounds
        verbose=100,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Predict and evaluate
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    print(f"📊 Evaluation for {target_column}")
    print(f"R² Score: {r2:.4f}")
    print(f"MAE     : {mae:,.2f}")
    print(f"MSE     : {mse:,.2f}")
    print("=" * 40)

    return model

# Train 2 models
model_extent = train_model('Extent Harvested(hect)')
model_yield = train_model('Yield(kg per hect)')

Learning rate set to 0.000949
0:	learn: 14885.4929592	total: 47.2ms	remaining: 1h 18m 36s
100:	learn: 13755.1686590	total: 112ms	remaining: 1m 50s
200:	learn: 12722.1298117	total: 182ms	remaining: 1m 30s
300:	learn: 11776.2107223	total: 252ms	remaining: 1m 23s
400:	learn: 10898.2276449	total: 317ms	remaining: 1m 18s
500:	learn: 10096.3862284	total: 385ms	remaining: 1m 16s
600:	learn: 9364.8017188	total: 453ms	remaining: 1m 14s
700:	learn: 8689.4900858	total: 519ms	remaining: 1m 13s
800:	learn: 8072.8174560	total: 586ms	remaining: 1m 12s
900:	learn: 7500.4559775	total: 652ms	remaining: 1m 11s
1000:	learn: 6973.0256587	total: 715ms	remaining: 1m 10s
1100:	learn: 6492.1870177	total: 779ms	remaining: 1m 9s
1200:	learn: 6053.3497228	total: 845ms	remaining: 1m 9s
1300:	learn: 5647.2676132	total: 912ms	remaining: 1m 9s
1400:	learn: 5279.7201625	total: 996ms	remaining: 1m 10s
1500:	learn: 4945.3595260	total: 1.06s	remaining: 1m 9s
1600:	learn: 4636.6209886	total: 1.13s	remaining: 1m 9s
1700:	l

In [6]:
# Save models and mappings
joblib.dump(model_extent, "model_extent.pkl")
joblib.dump(model_yield, "model_yield.pkl")

# Save encodings
# pd.DataFrame.from_dict(district_mapping, orient='index', columns=['District_encoded']).to_csv("district_encoding.csv")
# pd.DataFrame.from_dict(season_mapping, orient='index', columns=['Season_encoded']).to_csv("season_encoding.csv")

# Download from Colab
files.download("model_extent.pkl")
files.download("model_yield.pkl")
#files.download("district_encoding.csv")
#files.download("season_encoding.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
# Example: predict rice production for 2025 Yala season in Polonnaruwa
example_input = pd.DataFrame([{
    'Year': 2023,
    'Season_encoded': 1,  # Assume 1 = Yala
    'District_encoded': df['District'].unique().tolist().index('COLOMBO'),
    'Sown(hect)': 2218
}])

prediction_extent = model_extent.predict(example_input)[0]
prediction_yield = model_yield.predict(example_input)[0]
print(f" Predicted Production for 2023 (COLOMBO - Yala): {int(prediction_extent):,} extent_hect")
print(f" Predicted Production for 2023 (COLOMBO - Yala): {int(prediction_yield):,} yield kg")

 Predicted Production for 2023 (COLOMBO - Yala): 1,984 extent_hect
 Predicted Production for 2023 (COLOMBO - Yala): 4,934 yield kg
