In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostRegressor, Pool

In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
from google.colab import files
uploaded = files.upload()


Saving crop_yield.csv to crop_yield.csv


In [5]:
# Load dataset
data = pd.read_csv("crop_yield.csv")

print("Dataset Shape:", data.shape)
print(data.head())

Dataset Shape: (1000000, 10)
  Region Soil_Type     Crop  Rainfall_mm  Temperature_Celsius  \
0   West     Sandy   Cotton   897.077239            27.676966   
1  South      Clay     Rice   992.673282            18.026142   
2  North      Loam   Barley   147.998025            29.794042   
3  North     Sandy  Soybean   986.866331            16.644190   
4  South      Silt    Wheat   730.379174            31.620687   

   Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
0            False             True            Cloudy              122   
1             True             True             Rainy              140   
2            False            False             Sunny              106   
3            False             True             Rainy              146   
4             True             True            Cloudy              110   

   Yield_tons_per_hectare  
0                6.555816  
1                8.527341  
2                1.127443  
3                6.5175

In [7]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 10 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   Region                  1000000 non-null  object 
 1   Soil_Type               1000000 non-null  object 
 2   Crop                    1000000 non-null  object 
 3   Rainfall_mm             1000000 non-null  float64
 4   Temperature_Celsius     1000000 non-null  float64
 5   Fertilizer_Used         1000000 non-null  bool   
 6   Irrigation_Used         1000000 non-null  bool   
 7   Weather_Condition       1000000 non-null  object 
 8   Days_to_Harvest         1000000 non-null  int64  
 9   Yield_tons_per_hectare  1000000 non-null  float64
dtypes: bool(2), float64(3), int64(1), object(4)
memory usage: 62.9+ MB
None


In [8]:
# Remove duplicate rows
duplicates = data.duplicated().sum()
print("Number of duplicate rows:", duplicates)

if duplicates > 0:
    data = data.drop_duplicates()
    print("After removing duplicates:", data.shape)

# Fill missing values
for col in data.columns:
    if data[col].dtype == 'object':  # categorical
        data[col] = data[col].fillna(data[col].mode()[0])
    else:  # numeric
        data[col] = data[col].fillna(data[col].mean())


Number of duplicate rows: 0


In [9]:
data.isnull().sum()

Unnamed: 0,0
Region,0
Soil_Type,0
Crop,0
Rainfall_mm,0
Temperature_Celsius,0
Fertilizer_Used,0
Irrigation_Used,0
Weather_Condition,0
Days_to_Harvest,0
Yield_tons_per_hectare,0


In [10]:
# Define features and target
X = data.drop('Yield_tons_per_hectare', axis=1)
y = data['Yield_tons_per_hectare']

# Categorical columns for CatBoost
cat_features = ['Region', 'Soil_Type', 'Crop',
                'Fertilizer_Used', 'Irrigation_Used',
                'Weather_Condition']

# Train-test split (80-20)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("\nTraining samples:", X_train.shape[0])
print("Testing samples:", X_test.shape[0])



Training samples: 800000
Testing samples: 200000


In [11]:
# Create CatBoost pools for GPU optimization
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)


In [12]:
# Initialize CatBoost model for GPU
model = CatBoostRegressor(
    iterations=1000,
    depth=10,
    learning_rate=0.05,
    l2_leaf_reg=3,
    loss_function='RMSE',
    task_type='GPU',  # Train on GPU
    devices='0',      # Use first GPU
    random_seed=42,
    verbose=200
)

print("\nTraining model on GPU...")
model.fit(train_pool, eval_set=test_pool, use_best_model=True)


Training model on GPU...
0:	learn: 1.6199448	test: 1.6216745	best: 1.6216745 (0)	total: 137ms	remaining: 2m 17s
200:	learn: 0.4991659	test: 0.5009694	best: 0.5009667 (156)	total: 20.6s	remaining: 1m 21s
400:	learn: 0.4966365	test: 0.5011390	best: 0.5009667 (156)	total: 42.9s	remaining: 1m 4s
600:	learn: 0.4940676	test: 0.5013370	best: 0.5009667 (156)	total: 1m 5s	remaining: 43.2s
800:	learn: 0.4917037	test: 0.5015842	best: 0.5009667 (156)	total: 1m 27s	remaining: 21.7s
999:	learn: 0.4894220	test: 0.5018370	best: 0.5009667 (156)	total: 1m 49s	remaining: 0us
bestTest = 0.5009667412
bestIteration = 156
Shrink model to first 157 iterations.


<catboost.core.CatBoostRegressor at 0x7bd4da4ef010>

In [13]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Predict on test set
y_pred = model.predict(X_test)

# Evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("\nModel Performance:")
print("RMSE:", round(rmse, 4))
print("R² Score:", round(r2, 4))



Model Performance:
RMSE: 0.501
R² Score: 0.9129


In [14]:
# Show which features impact yield most
print("\nFeature Importance:")
feature_importance = model.get_feature_importance()
for name, score in sorted(zip(X.columns, feature_importance), key=lambda x: x[1], reverse=True):
    print(f"{name}: {round(score, 2)}")



Feature Importance:
Fertilizer_Used: 50.24
Irrigation_Used: 32.14
Rainfall_mm: 15.71
Temperature_Celsius: 1.78
Region: 0.03
Days_to_Harvest: 0.03
Crop: 0.03
Weather_Condition: 0.03
Soil_Type: 0.02


In [15]:
# Save model for deployment
model.save_model("crop_yield_catboost_gpu.cbm")
print("\nModel saved as crop_yield_catboost_gpu.cbm")

# Load model
loaded_model = CatBoostRegressor()
loaded_model.load_model("crop_yield_catboost_gpu.cbm")



Model saved as crop_yield_catboost_gpu.cbm


<catboost.core.CatBoostRegressor at 0x7bd4daeb3a90>

In [16]:
import pandas as pd

sample_input = pd.DataFrame({
    'Region': ['North'],
    'Soil_Type': ['Loam'],
    'Crop': ['Wheat'],
    'Rainfall_mm': [500],
    'Temperature_Celsius': [25],
    'Fertilizer_Used': ['True'],
    'Irrigation_Used': ['True'],
    'Weather_Condition': ['Sunny'],
    'Days_to_Harvest': [120]
})

predicted_yield = loaded_model.predict(sample_input)[0]
print("\nPredicted Yield for sample input:", round(predicted_yield, 2), "tons/hectare")



Predicted Yield for sample input: 5.69 tons/hectare


In [2]:
from catboost import CatBoostRegressor

model = CatBoostRegressor()
model.load_model("crop_yield_catboost_gpu.cbm")


ModuleNotFoundError: No module named 'catboost'