In [1]:
import os
import rasterio
from tqdm import tqdm
from joblib import Parallel, delayed
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from shapely import wkt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import f1_score


In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,CTransp,IrriType,IrriSource,IrriCount,WaterCov,ExpYield,Season,geometry,dataset,tif_path
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,25-11-2023,14-04-2024,97,54,...,Transplanting,Flood,Groundwater,4,87,17,Rabi,POLYGON ((78.18079255482755 17.978971746424413...,train,/kaggle/input/sentineltimeseriesdata/SentinelT...
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,13-11-2023,26-04-2024,82,58,...,Transplanting,Flood,Canal,5,94,15,Rabi,POLYGON ((78.17483419891283 17.981508840879556...,train,/kaggle/input/sentineltimeseriesdata/SentinelT...
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,19-12-2023,28-04-2024,92,91,...,Transplanting,Flood,Canal,3,99,20,Rabi,POLYGON ((78.16888476438905 17.976727511659835...,train,/kaggle/input/sentineltimeseriesdata/SentinelT...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11791 entries, 0 to 11790
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   FarmID           11791 non-null  int64 
 1   category         8775 non-null   object
 2   Crop             11791 non-null  object
 3   State            11791 non-null  object
 4   District         11791 non-null  object
 5   Sub-District     11791 non-null  object
 6   SDate            11791 non-null  object
 7   HDate            11791 non-null  object
 8   CropCoveredArea  11791 non-null  int64 
 9   CHeight          11791 non-null  int64 
 10  CNext            11791 non-null  object
 11  CLast            11791 non-null  object
 12  CTransp          11791 non-null  object
 13  IrriType         11791 non-null  object
 14  IrriSource       11791 non-null  object
 15  IrriCount        11791 non-null  int64 
 16  WaterCov         11791 non-null  int64 
 17  ExpYield         11791 non-null

### **CONVERT _SDate_ AND _HDate_ TO DATETIME**

In [5]:
data['SDate'] = pd.to_datetime(data['SDate'], dayfirst=True)
data['HDate'] = pd.to_datetime(data['HDate'], dayfirst=True)

print(data['SDate'].dtype)
print(data['HDate'].dtype)

datetime64[ns]
datetime64[ns]


### **SUBSET DATA TO ONLY HAVE VALUES WITH _SDate_ LOWER THAN _HDate_**

In [6]:
data = data.query('SDate < HDate')

In [8]:
data = data.fillna(method='ffill')

  data = data.fillna(method='ffill')


### **EXTRACTING FEATURES FROM *tif_path***

In [9]:
# Define the local directory where the TIFF files are stored
local_tif_directory = 'C:/Users/Aduragbemi/Documents/TCHC/tif_file/'

# Update the 'tif_path' column to use local paths
data['tif_path'] = data['tif_path'].apply(lambda x: os.path.join(local_tif_directory, os.path.basename(x)) if pd.notnull(x) else None)

In [10]:
# Example feature calculation functions
def calculate_ndvi(nir_band, red_band):
    """Calculate NDVI (Normalized Difference Vegetation Index)."""
    ndvi = (nir_band - red_band) / (nir_band + red_band)
    return np.nanmean(ndvi)

def calculate_evi(nir_band, red_band, blue_band):
    """Calculate EVI (Enhanced Vegetation Index)."""
    evi = 2.5 * (nir_band - red_band) / (nir_band + 6 * red_band - 7.5 * blue_band + 1)
    return np.nanmean(evi)

def calculate_ndwi(nir_band, green_band):
    """Calculate NDWI (Normalized Difference Water Index)."""
    ndwi = (green_band - nir_band) / (green_band + nir_band)
    return np.nanmean(ndwi)

def calculate_gndvi(nir_band, green_band):
    """Calculate GNDVI (Green Normalized Difference Vegetation Index)."""
    gndvi = (nir_band - green_band) / (nir_band + green_band)
    return np.nanmean(gndvi)

def calculate_savi(nir_band, red_band, L=0.5):
    """Calculate SAVI (Soil Adjusted Vegetation Index)."""
    savi = ((nir_band - red_band) / (nir_band + red_band + L)) * (1 + L)
    return np.nanmean(savi)

def calculate_msavi(nir_band, red_band):
    """Calculate MSAVI (Modified Soil Adjusted Vegetation Index)."""
    msavi = (2 * nir_band + 1 - np.sqrt((2 * nir_band + 1)**2 - 8 * (nir_band - red_band))) / 2
    return np.nanmean(msavi)

def calculate_area(src, nir_band):
    """Calculate the area of farmland using the NIR band and pixel resolution."""
    # Get the transform (affine transformation matrix)
    transform = src.transform
    # Get the CRS (coordinate reference system)
    crs = src.crs
    # Get the pixel resolution (in meters or degrees)
    pixel_size_x = transform.a  # Width of a pixel in the x-direction
    pixel_size_y = -transform.e  # Height of a pixel in the y-direction (negative due to rasterio's coordinate system)

    # Calculate the area of a single pixel
    if crs.is_geographic:  # CRS is in degrees (e.g., WGS84)
        bounds = src.bounds
        center_lat = (bounds.top + bounds.bottom) / 2
        pixel_size_x_meters = pixel_size_x * 111320 * np.cos(np.radians(center_lat))
        pixel_size_y_meters = pixel_size_y * 111320
        pixel_area = pixel_size_x_meters * pixel_size_y_meters
    else:  # CRS is in meters (e.g., UTM)
        pixel_area = pixel_size_x * pixel_size_y

    # Create a mask for farmland (e.g., NDVI > 0.3)
    farmland_mask = nir_band > 0.3  # Adjust the threshold as needed

    # Count the number of farmland pixels
    farmland_pixels = np.sum(farmland_mask)

    # Calculate the total area of the farmland
    total_area = farmland_pixels * pixel_area

    return total_area

In [11]:
# Function to calculate features for each row in the data
def process_row_for_features(index, row):
    features = {'index': index}

    # Retrieve the TIFF file path and check if it's valid
    tif_path = row['tif_path']
    if tif_path is None:
        print(f"Skipping entry due to missing tif_path for index {index}")
        # Add NaN for all features if path is missing
        features.update({
            'ndvi': np.nan,
            'evi': np.nan,
            'ndwi': np.nan,
            'gndvi': np.nan,
            'savi': np.nan,
            'msavi': np.nan,
            'area': np.nan
        })
        return features

    # Open the TIFF file and read bands for feature calculation
    with rasterio.open(tif_path) as src:
        red = src.read(3)    # B4 for Red
        green = src.read(2)  # B3 for Green
        blue = src.read(1)   # B2 for Blue
        nir = src.read(4)    # B8 for NIR

        # Calculate each feature
        features['ndvi'] = calculate_ndvi(nir, red)
        features['evi'] = calculate_evi(nir, red, blue)
        features['ndwi'] = calculate_ndwi(nir, green)
        features['gndvi'] = calculate_gndvi(nir, green)
        features['savi'] = calculate_savi(nir, red)
        features['msavi'] = calculate_msavi(nir, red)
        features['area'] = calculate_area(src, nir)

    return features

In [12]:
new_features = Parallel(n_jobs=-1)(delayed(process_row_for_features)(index, row)
                                   for index, row in tqdm(data.iterrows(), total=len(data)))

100%|██████████| 10606/10606 [01:39<00:00, 106.10it/s]


In [13]:
new_features_df = pd.DataFrame(new_features).set_index('index')
data = data.join(new_features_df)

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,geometry,dataset,tif_path,ndvi,evi,ndwi,gndvi,savi,msavi,area
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,POLYGON ((78.18079255482755 17.978971746424413...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.100756,-0.793684,8.488031,0.127153,0.151125,4232.596191,27108.609824
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,POLYGON ((78.17483419891283 17.981508840879556...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.18809,0.564248,11.493762,0.187815,0.28211,3249.392822,24730.359593
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,POLYGON ((78.16888476438905 17.976727511659835...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.206596,-1.456745,10.166884,0.206553,0.309869,3741.956055,15694.697856


### **CONVERT AREA TO HECTARES**

In [14]:
data['AreaHectares'] = data['area'] / 10000

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,dataset,tif_path,ndvi,evi,ndwi,gndvi,savi,msavi,area,AreaHectares
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.100756,-0.793684,8.488031,0.127153,0.151125,4232.596191,27108.609824,2.710861
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.18809,0.564248,11.493762,0.187815,0.28211,3249.392822,24730.359593,2.473036
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,train,C:/Users/Aduragbemi/Documents/TCHC/tif_file/20...,0.206596,-1.456745,10.166884,0.206553,0.309869,3741.956055,15694.697856,1.56947


### **EXTRACT *month* AND *week_of_month* FROM *SDate* AND *HDate***

In [17]:
def week_of_month(dt):
    """Calculate the week of the month for a given datetime object."""
    first_day = dt.replace(day=1)
    dom = dt.day  # Day of the month
    adjusted_dom = dom + first_day.weekday()  # Adjust for the first week's weekday
    return (adjusted_dom - 1) // 7 + 1

In [18]:
data['SMonth'] = data['SDate'].dt.month
data['SWOM'] = data['SDate'].apply(week_of_month)
data['SYear'] = data['SDate'].dt.year

data['HMonth'] = data['HDate'].dt.month
data['HWOM'] = data['HDate'].apply(week_of_month)
data['HYear'] = data['HDate'].dt.year

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,savi,msavi,area,AreaHectares,SMonth,SWOM,SYear,HMonth,HWOM,HYear
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,0.151125,4232.596191,27108.609824,2.710861,11,4,2023,4,2,2024
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,0.28211,3249.392822,24730.359593,2.473036,11,3,2023,4,4,2024
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,0.309869,3741.956055,15694.697856,1.56947,12,4,2023,4,4,2024


### **EXTRACTING FEATURES FROM _geometry_**

In [19]:
data['geometry'] = data['geometry'].astype(str).apply(wkt.loads)
data['Perimeter'] = data['geometry'].apply(lambda x: x.length if x else None)   

# Calculate centroid
data['Centroid'] = data['geometry'].apply(lambda x: x.centroid if x else None)

# Optionally split centroid into latitude and longitude
data['Centroid_Lat'] = data['Centroid'].apply(lambda x: x.y if x else None)
data['Centroid_Lon'] = data['Centroid'].apply(lambda x: x.x if x else None)

### **GROWING SEASON DURATION**

In [20]:
data['SeasonDuration'] = (data['HDate'] - data['SDate']).dt.days

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,SWOM,SYear,HMonth,HWOM,HYear,Perimeter,Centroid,Centroid_Lat,Centroid_Lon,SeasonDuration
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,4,2023,4,2,2024,0.004244,POINT (78.18159243724418 17.978862817857717),17.978863,78.181592,141
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,3,2023,4,4,2024,0.005078,POINT (78.17560616961495 17.980965846626752),17.980966,78.175606,165
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,4,2023,4,4,2024,0.003992,POINT (78.16920661853761 17.97608418454474),17.976084,78.169207,131


### **CROP IRRIGATION TYPE METHOD**

In [21]:
data['CropIrriType'] = data['Crop'].astype(str) + "_" + data['IrriType'].astype(str)

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,SYear,HMonth,HWOM,HYear,Perimeter,Centroid,Centroid_Lat,Centroid_Lon,SeasonDuration,CropIrriType
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,2023,4,2,2024,0.004244,POINT (78.18159243724418 17.978862817857717),17.978863,78.181592,141,Paddy_Flood
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,2023,4,4,2024,0.005078,POINT (78.17560616961495 17.980965846626752),17.980966,78.175606,165,Paddy_Flood
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,2023,4,4,2024,0.003992,POINT (78.16920661853761 17.97608418454474),17.976084,78.169207,131,Paddy_Flood


### **CROP IRRIGATION SOURCE METHOD**

In [22]:
data['CropIrriSource'] = data['Crop'].astype(str) + "_" + data['IrriSource'].astype(str)

data.head(3)

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,HMonth,HWOM,HYear,Perimeter,Centroid,Centroid_Lat,Centroid_Lon,SeasonDuration,CropIrriType,CropIrriSource
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25,2024-04-14,97,54,...,4,2,2024,0.004244,POINT (78.18159243724418 17.978862817857717),17.978863,78.181592,141,Paddy_Flood,Paddy_Groundwater
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13,2024-04-26,82,58,...,4,4,2024,0.005078,POINT (78.17560616961495 17.980965846626752),17.980966,78.175606,165,Paddy_Flood,Paddy_Canal
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19,2024-04-28,92,91,...,4,4,2024,0.003992,POINT (78.16920661853761 17.97608418454474),17.976084,78.169207,131,Paddy_Flood,Paddy_Canal


### **DROP FEATURES**

In [23]:
data = data.drop(['State', 'geometry', 'tif_path', 'area', 'SDate', 'HDate', 'Centroid'], 
                 axis=1)
data.head(3)

Unnamed: 0,FarmID,category,Crop,District,Sub-District,CropCoveredArea,CHeight,CNext,CLast,CTransp,...,SYear,HMonth,HWOM,HYear,Perimeter,Centroid_Lat,Centroid_Lon,SeasonDuration,CropIrriType,CropIrriSource
0,1326576,Healthy,Paddy,Medak,Kulcharam,97,54,Pea,Lentil,Transplanting,...,2023,4,2,2024,0.004244,17.978863,78.181592,141,Paddy_Flood,Paddy_Groundwater
1,1326577,Healthy,Paddy,Medak,Kulcharam,82,58,Pea,Lentil,Transplanting,...,2023,4,4,2024,0.005078,17.980966,78.175606,165,Paddy_Flood,Paddy_Canal
2,1326578,Healthy,Paddy,Medak,Kulcharam,92,91,Pea,Lentil,Transplanting,...,2023,4,4,2024,0.003992,17.976084,78.169207,131,Paddy_Flood,Paddy_Canal


### **SPLIT DATA INTO TEST AND TRAIN**

In [24]:
train = data.query('dataset == "train"')

test = data.query('dataset == "test"')

In [25]:
train = train.drop('dataset', axis=1)

test = test.drop(['category', 'dataset'], axis=1)

In [26]:
train.head(3)

Unnamed: 0,FarmID,category,Crop,District,Sub-District,CropCoveredArea,CHeight,CNext,CLast,CTransp,...,SYear,HMonth,HWOM,HYear,Perimeter,Centroid_Lat,Centroid_Lon,SeasonDuration,CropIrriType,CropIrriSource
0,1326576,Healthy,Paddy,Medak,Kulcharam,97,54,Pea,Lentil,Transplanting,...,2023,4,2,2024,0.004244,17.978863,78.181592,141,Paddy_Flood,Paddy_Groundwater
1,1326577,Healthy,Paddy,Medak,Kulcharam,82,58,Pea,Lentil,Transplanting,...,2023,4,4,2024,0.005078,17.980966,78.175606,165,Paddy_Flood,Paddy_Canal
2,1326578,Healthy,Paddy,Medak,Kulcharam,92,91,Pea,Lentil,Transplanting,...,2023,4,4,2024,0.003992,17.976084,78.169207,131,Paddy_Flood,Paddy_Canal


In [27]:
test.head(3)

Unnamed: 0,FarmID,Crop,District,Sub-District,CropCoveredArea,CHeight,CNext,CLast,CTransp,IrriType,...,SYear,HMonth,HWOM,HYear,Perimeter,Centroid_Lat,Centroid_Lon,SeasonDuration,CropIrriType,CropIrriSource
8775,85197,Paddy,Medak,Nizampet,81,99,Lentil,Pea,Transplanting,Flood,...,2023,5,3,2024,0.006766,18.062054,78.547474,184,Paddy_Flood,Paddy_Groundwater
8776,779677,Paddy,Medak,Nizampet,91,63,Lentil,Pea,Transplanting,Flood,...,2023,5,5,2024,0.010785,18.067696,78.548407,160,Paddy_Flood,Paddy_Canal
8777,1331840,Paddy,Medak,Nizampet,94,89,Pea,Lentil,Transplanting,Flood,...,2023,5,3,2024,0.005455,18.07948,78.560171,171,Paddy_Flood,Paddy_Canal


### **FILL MISSING DATA**

In [28]:
train = train.fillna({'ndvi': train['ndvi'].mean(), 
                      'ndwi': train['ndwi'].mean(), 
                      'gndvi': train['gndvi'].mean()})

test = test.fillna({'ndvi': train['ndvi'].mean(), 
                    'ndwi': train['ndwi'].mean(), 
                    'gndvi': train['gndvi'].mean()})

### **REPLACE INFINITY VALUE WITH MEAN**

In [29]:
train['evi'].describe()

  sqr = _ensure_numeric((avg - values) ** 2)


count    7888.000000
mean             inf
std              NaN
min     -1479.291647
25%        -0.966117
50%         0.696779
75%         2.990868
max              inf
Name: evi, dtype: float64

In [30]:
train = train.replace([np.inf, -np.inf], 0.696779)

In [31]:
test['evi'].describe()

  sqr = _ensure_numeric((avg - values) ** 2)


count    2718.000000
mean             inf
std              NaN
min      -237.955184
25%        -1.082345
50%         0.353147
75%         1.941904
max              inf
Name: evi, dtype: float64

In [32]:
test = test.replace([np.inf, -np.inf], 0.355370)

### **SET _FarmID_ AS THE INDEX**

In [33]:
train.set_index('FarmID', inplace=True)

test.set_index('FarmID', inplace=True)

### **CHANGE DATA TYPE OF NON-NUMERIC FEATURE TO _Category_**

In [34]:
train_cat_cols = train.select_dtypes(include='object').columns

for col in train_cat_cols:
    train[col] = train[col].astype('category')

train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7888 entries, 1326576 to 1330504
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   category         7888 non-null   category
 1   Crop             7888 non-null   category
 2   District         7888 non-null   category
 3   Sub-District     7888 non-null   category
 4   CropCoveredArea  7888 non-null   int64   
 5   CHeight          7888 non-null   int64   
 6   CNext            7888 non-null   category
 7   CLast            7888 non-null   category
 8   CTransp          7888 non-null   category
 9   IrriType         7888 non-null   category
 10  IrriSource       7888 non-null   category
 11  IrriCount        7888 non-null   int64   
 12  WaterCov         7888 non-null   int64   
 13  ExpYield         7888 non-null   int64   
 14  Season           7888 non-null   category
 15  ndvi             7888 non-null   float64 
 16  evi              7888 non-null   float

In [35]:
test_cat_cols = test.select_dtypes(include='object').columns

for col in test_cat_cols:
    test[col] = test[col].astype('category')

test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2718 entries, 85197 to 1326552
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   Crop             2718 non-null   category
 1   District         2718 non-null   category
 2   Sub-District     2718 non-null   category
 3   CropCoveredArea  2718 non-null   int64   
 4   CHeight          2718 non-null   int64   
 5   CNext            2718 non-null   category
 6   CLast            2718 non-null   category
 7   CTransp          2718 non-null   category
 8   IrriType         2718 non-null   category
 9   IrriSource       2718 non-null   category
 10  IrriCount        2718 non-null   int64   
 11  WaterCov         2718 non-null   int64   
 12  ExpYield         2718 non-null   int64   
 13  Season           2718 non-null   category
 14  ndvi             2718 non-null   float64 
 15  evi              2718 non-null   float64 
 16  ndwi             2718 non-null   float64

### **ENCODE _Categorical_ FEATURES**

In [36]:
for col in train_cat_cols:
    train[col] = train[col].cat.codes

In [37]:
for col in test_cat_cols:
    test[col] = test[col].cat.codes

### **MODEL BUILDING**

In [38]:
X = train.drop(['category'], axis=1)
y = train['category']

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### **SCALING**

In [40]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### **SMOTE**

In [41]:
# Oversampling the minority class
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

### **XGBOOST**

In [42]:
xgbmodel = xgb.XGBClassifier(objective='multi:softmax', 
                          booster='gbtree',
                          num_class=4, 
                          enable_categorical=True, 
                          n_estimators=500,
                          random_state=42)

In [43]:
xgbmodel.fit(X_train, y_train)

In [44]:
xgbpred = xgbmodel.predict(X_test)

In [45]:
f1_score(y_test, xgbpred, average='weighted')

0.7476526566634553

### **CATBOOST CLASSIFIER**

In [46]:
from catboost import CatBoostClassifier

In [47]:
catmodel = CatBoostClassifier(iterations=500, 
                              learning_rate=0.3, 
                              depth=6, 
                              loss_function='MultiClass', 
                              random_state=42, 
                              class_weights=[1, 4, 4, 4])

In [48]:
catmodel.fit(X_train, y_train)

0:	learn: 1.3192541	total: 235ms	remaining: 1m 57s
1:	learn: 1.2493498	total: 288ms	remaining: 1m 11s
2:	learn: 1.2008575	total: 340ms	remaining: 56.3s
3:	learn: 1.1577173	total: 387ms	remaining: 47.9s
4:	learn: 1.1203645	total: 440ms	remaining: 43.5s
5:	learn: 1.0918818	total: 499ms	remaining: 41.1s
6:	learn: 1.0687300	total: 549ms	remaining: 38.6s
7:	learn: 1.0575698	total: 602ms	remaining: 37s
8:	learn: 1.0311406	total: 672ms	remaining: 36.7s
9:	learn: 1.0095115	total: 721ms	remaining: 35.3s
10:	learn: 0.9883965	total: 769ms	remaining: 34.2s
11:	learn: 0.9725181	total: 816ms	remaining: 33.2s
12:	learn: 0.9544095	total: 881ms	remaining: 33s
13:	learn: 0.9431700	total: 933ms	remaining: 32.4s
14:	learn: 0.9250700	total: 980ms	remaining: 31.7s
15:	learn: 0.9142711	total: 1.04s	remaining: 31.3s
16:	learn: 0.8947039	total: 1.1s	remaining: 31.2s
17:	learn: 0.8818856	total: 1.16s	remaining: 31s
18:	learn: 0.8682269	total: 1.21s	remaining: 30.6s
19:	learn: 0.8600442	total: 1.26s	remaining: 3

<catboost.core.CatBoostClassifier at 0x2cec3152c90>

In [49]:
catpred = catmodel.predict(X_test) 

In [50]:
f1_score(y_test, catpred, average='weighted')

0.7424433651787711

### **LIGHTGBM CLASSIFIER**

In [51]:
import lightgbm as lgb

In [52]:
light_model = lgb.LGBMClassifier(objective='multiclass', 
                                 random_state=42, 
                                 n_estimators=500,
                                 learning_rate=0.3)

In [53]:
light_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009044 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7797
[LightGBM] [Info] Number of data points in the train set: 20804, number of used features: 33
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294


In [54]:
light_pred = light_model.predict(X_test)

In [55]:
f1_score(y_test, light_pred, average='weighted')

0.7489796301629361

### **RANDOMFOREST CLASSIFIER**

In [56]:
clf = RandomForestClassifier(n_estimators=131, 
                             max_depth=28,
                             min_samples_split=8,
                             min_samples_leaf=1,
                             max_features='log2',
                             random_state=42)

In [57]:
clf.fit(X_train, y_train)

In [58]:
clf_pred = clf.predict(X_test)

In [59]:
f1_score(y_test, clf_pred, average='weighted')

0.7464891337996892

### **PREDICTION ON TEST SET**

In [60]:
test_pred = light_model.predict(test)
test_pred

array([1, 1, 1, ..., 1, 1, 1], dtype=int8)

In [61]:
submission = pd.DataFrame(
    {
        'ID': test.index,
        'Target': test_pred
    }
)

In [62]:
category_map = {0: 'Diseased', 1: 'Healthy', 2: 'Pests', 3: 'Stressed'}

submission['Target'] = submission['Target'].map(category_map)
submission.head(3)

Unnamed: 0,ID,Target
0,85197,Healthy
1,779677,Healthy
2,1331840,Healthy


In [63]:
submission.to_csv('light.csv', index=False)

In [64]:
sub = pd.read_csv('light.csv')
len(sub)

2718

### **OPTIMIZING WITH OPTUNA**

In [60]:
import optuna
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score

In [None]:
def objective(trial):
    # Suggest hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 2, 32)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2'])

    # Initialize the classifier with suggested hyperparameters
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=42
    )

    # Perform cross-validation
    f1_scorer = make_scorer(f1_score, average='weighted')
    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring=f1_scorer)
    f1 = scores.mean()
    return f1

# Create a study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

# Get the best hyperparameters
best_params = study.best_params
best_params

[I 2025-01-27 17:46:08,385] A new study created in memory with name: no-name-3a9fe9d8-ef7f-4f68-817a-199641cbb1ea


[W 2025-01-27 17:46:57,738] Trial 0 failed with parameters: {'n_estimators': 232, 'max_depth': 11, 'min_samples_split': 9, 'min_samples_leaf': 4, 'max_features': 'sqrt'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\Aduragbemi\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\Aduragbemi\AppData\Local\Temp\ipykernel_10940\3112833481.py", line 21, in objective
    scores = cross_val_score(clf, X_train, y_train, cv=3, scoring=f1_scorer)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Aduragbemi\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Aduragbemi\AppData\Local\Programs\Python\Python312\Lib

KeyboardInterrupt: 

In [127]:
# Train the model with the best hyperparameters
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = best_model.predict(X_test)

# Evaluate the model
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 Score with best parameters:", f1)

F1 Score with best parameters: 0.7571966505308745


In [128]:
ram = RandomForestClassifier(n_estimators=150, 
                             class_weight='balanced', 
                             random_state=42)

In [129]:
ram.fit(X_train, y_train)

In [130]:
rampred = ram.predict(X_test)

In [131]:
f1_score(y_test, rampred, average='weighted')

0.7545604656166601