In [1]:
# Supress Warnings
import warnings
warnings.filterwarnings('ignore')

# Visualization
import ipyleaflet
import matplotlib.pyplot as plt
from IPython.display import Image
import seaborn as sns

# Data Science
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Feature Engineering
from sklearn.model_selection import train_test_split

# Machine Learning
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import r2_score


# Planetary Computer Tools
import pystac
import pystac_client
import odc
from pystac_client import Client
from pystac.extensions.eo import EOExtension as eo
from odc.stac import stac_load
import planetary_computer as pc

# Please pass your API key here
pc.settings.set_subscription_key('1b6b6eb7c08a4e58a88ab5e9062492f3')

# Others
import requests
import rich.table
from itertools import cycle
from tqdm import tqdm
tqdm.pandas()
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

In [23]:
crop_yield_data = pd.read_csv("Crop_Yield_Data_challenge_2.csv")
crop_yield_data

Unnamed: 0,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Rice Yield (kg/ha)
0,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.40,5500
1,Chau_Phu,10.509150,105.265098,SA,T,15-07-2022,2.43,6000
2,Chau_Phu,10.467721,105.192464,SA,D,15-07-2022,1.95,6400
3,Chau_Phu,10.494453,105.241281,SA,T,15-07-2022,4.30,6000
4,Chau_Phu,10.535058,105.252744,SA,D,14-07-2022,3.30,6400
...,...,...,...,...,...,...,...,...
552,Thoai_Son,10.364419,105.164984,WS,T,12-04-2022,7.80,6640
553,Thoai_Son,10.358094,105.189541,WS,T,12-04-2022,2.00,7200
554,Thoai_Son,10.368014,105.238516,WS,T,12-04-2022,6.20,7200
555,Thoai_Son,10.275419,105.234563,WS,T,20-04-2022,3.00,6400


In [24]:
vh_vv_data = pd.read_csv("combined_vv_vh_L2.csv")
vh_vv_data = vh_vv_data.loc[:, ['vv_list', 'vh_list','vv/vh_list']]
vh_vv_data

Unnamed: 0,vv_list,vh_list,vv/vh_list
0,"[0.130476713180542, 0.35729560256004333, 0.269...","[0.00524574751034379, 0.02901931293308735, 0.0...","[24.872854235409235, 12.31233845487295, 9.3034..."
1,"[0.2941001057624817, 0.2624836564064026, 0.285...","[0.054993245750665665, 0.05291072651743889, 0....","[5.347931400446968, 4.960877948252901, 5.81369..."
2,"[0.4241330027580261, 0.03778503090143204, 0.19...","[0.11418552696704865, 0.031229600310325623, 0....","[3.7144199796916584, 1.209910806605455, 3.5922..."
3,"[0.33549439907073975, 0.12846091389656067, 0.0...","[0.023823164403438568, 0.027058769017457962, 0...","[14.082696714392629, 4.747478121184278, 7.1595..."
4,"[0.13732333481311798, 0.2205643355846405, 0.15...","[0.03822758048772812, 0.08936841040849686, 0.0...","[3.592258078096304, 2.468034673286188, 6.26108..."
...,...,...,...
552,"[0.014901062473654747, 0.2726139724254608, 0.4...","[0.002414182759821415, 0.011826231144368649, 0...","[6.172300921723519, 23.05163573225716, 37.1923..."
553,"[0.12398772686719894, 0.22288843989372253, 0.1...","[0.003157197032123804, 0.08050604164600372, 0....","[39.27145680350335, 2.7685927085298028, 7.4771..."
554,"[0.1457187384366989, 0.19863098859786987, 0.24...","[0.01400761678814888, 0.0244789756834507, 0.05...","[10.402821596317798, 8.114350500873151, 4.8945..."
555,"[0.11684748530387878, 0.03332364931702614, 0.0...","[0.017325101420283318, 0.012673512101173401, 0...","[6.744404114545609, 2.629393419203885, 2.71715..."


In [25]:
vh_vv_data.dtypes

vv_list       object
vh_list       object
vv/vh_list    object
dtype: object

In [26]:
import ast
# convert string to list
vh_vv_data['vv_list'] = vh_vv_data['vv_list'].apply(ast.literal_eval)
vh_vv_data['vh_list'] = vh_vv_data['vh_list'].apply(ast.literal_eval)
vh_vv_data['vv/vh_list'] = vh_vv_data['vv/vh_list'].apply(ast.literal_eval)
print(vh_vv_data)

                                               vv_list  \
0    [0.130476713180542, 0.35729560256004333, 0.269...   
1    [0.2941001057624817, 0.2624836564064026, 0.285...   
2    [0.4241330027580261, 0.03778503090143204, 0.19...   
3    [0.33549439907073975, 0.12846091389656067, 0.0...   
4    [0.13732333481311798, 0.2205643355846405, 0.15...   
..                                                 ...   
552  [0.014901062473654747, 0.2726139724254608, 0.4...   
553  [0.12398772686719894, 0.22288843989372253, 0.1...   
554  [0.1457187384366989, 0.19863098859786987, 0.24...   
555  [0.11684748530387878, 0.03332364931702614, 0.0...   
556  [0.2840534448623657, 0.3372976779937744, 0.639...   

                                               vh_list  \
0    [0.00524574751034379, 0.02901931293308735, 0.0...   
1    [0.054993245750665665, 0.05291072651743889, 0....   
2    [0.11418552696704865, 0.031229600310325623, 0....   
3    [0.023823164403438568, 0.027058769017457962, 0...   
4    [0.03822

In [27]:
def ordinal_distribution(data, dx=3, dy=1, taux=1, tauy=1, return_missing=False, tie_precision=None):
    '''
    Returns
    -------
     : tuple
       Tuple containing two arrays, one with the ordinal patterns occurring in data 
       and another with their corresponding probabilities.
       
    Attributes
    ---------
    data : array 
           Array object in the format :math:`[x_{1}, x_{2}, x_{3}, \\ldots ,x_{n}]`
           or  :math:`[[x_{11}, x_{12}, x_{13}, \\ldots, x_{1m}],
           \\ldots, [x_{n1}, x_{n2}, x_{n3}, \\ldots, x_{nm}]]`.
    dx : int
         Embedding dimension (horizontal axis) (default: 3).
    dy : int
         Embedding dimension (vertical axis); it must be 1 for time series 
         (default: 1).
    taux : int
           Embedding delay (horizontal axis) (default: 1).
    tauy : int
           Embedding delay (vertical axis) (default: 1).
    return_missing: boolean
                    If `True`, it returns ordinal patterns not appearing in the 
                    symbolic sequence obtained from **data** are shown. If `False`,
                    these missing patterns (permutations) are omitted 
                    (default: `False`).
    tie_precision : int
                    If not `None`, **data** is rounded with `tie_precision`
                    number of decimals (default: `None`).
   
    '''
    def setdiff(a, b):
        '''
        Returns
        -------
        : array
            An array containing the elements in `a` that are not contained in `b`.
            
        Parameters
        ----------    
        a : tuples, lists or arrays
            Array in the format :math:`[[x_{21}, x_{22}, x_{23}, \\ldots, x_{2m}], 
            \\ldots, [x_{n1}, x_{n2}, x_{n3}, ..., x_{nm}]]`.
        b : tuples, lists or arrays
            Array in the format :math:`[[x_{21}, x_{22}, x_{23}, \\ldots, x_{2m}], 
            \\ldots, [x_{n1}, x_{n2}, x_{n3}, ..., x_{nm}]]`.
        '''

        a = np.asarray(a).astype('int64')
        b = np.asarray(b).astype('int64')

        _, ncols = a.shape

        dtype={'names':['f{}'.format(i) for i in range(ncols)],
            'formats':ncols * [a.dtype]}

        C = np.setdiff1d(a.view(dtype), b.view(dtype))
        C = C.view(a.dtype).reshape(-1, ncols)

        return(C)

    try:
        ny, nx = np.shape(data)
        data   = np.array(data)
    except:
        nx     = np.shape(data)[0]
        ny     = 1
        data   = np.array([data])

    if tie_precision is not None:
        data = np.round(data, tie_precision)

    partitions = np.concatenate(
        [
            [np.concatenate(data[j:j+dy*tauy:tauy,i:i+dx*taux:taux]) for i in range(nx-(dx-1)*taux)] 
            for j in range(ny-(dy-1)*tauy)
        ]
    )

    symbols = np.apply_along_axis(np.argsort, 1, partitions)
    symbols, symbols_count = np.unique(symbols, return_counts=True, axis=0)

    probabilities = symbols_count/len(partitions)

    if return_missing==False:
        return symbols, probabilities
    
    else:
        all_symbols   = list(map(list,list(itertools.permutations(np.arange(dx*dy)))))
        miss_symbols  = setdiff(all_symbols, symbols)
        symbols       = np.concatenate((symbols, miss_symbols))
        probabilities = np.concatenate((probabilities, np.zeros(miss_symbols.__len__())))
        
        return symbols, probabilities

In [28]:
def permutation_entropy(data, dx=3, dy=1, taux=1, tauy=1, base=2, normalized=True, probs=False, tie_precision=None):
    '''
    Returns Permutation Entropy
    Attributes:
    data : array
           Array object in the format :math:`[x_{1}, x_{2}, x_{3}, \\ldots ,x_{n}]`
           or  :math:`[[x_{11}, x_{12}, x_{13}, \\ldots, x_{1m}],
           \\ldots, [x_{n1}, x_{n2}, x_{n3}, \\ldots, x_{nm}]]`
           or an ordinal probability distribution (such as the ones returned by :func:`ordpy.ordinal_distribution`).
    dx :   int
           Embedding dimension (horizontal axis) (default: 3).
    dy :   int
           Embedding dimension (vertical axis); it must be 1 for time series (default: 1).
    taux : int
           Embedding delay (horizontal axis) (default: 1).
    tauy : int
           Embedding delay (vertical axis) (default: 1).
    base : str, int
           Logarithm base in Shannon's entropy. Either 'e' or 2 (default: 2).
    normalized: boolean
                If `True`, permutation entropy is normalized by its maximum value 
                (default: `True`). If `False`, it is not.
    probs : boolean
            If `True`, assumes **data** is an ordinal probability distribution. If 
            `False`, **data** is expected to be a one- or two-dimensional 
            array (default: `False`). 
    tie_precision : int
                    If not `None`, **data** is rounded with `tie_precision`
                    number of decimals (default: `None`).
    '''
    if not probs:
        _, probabilities = ordinal_distribution(data, dx, dy, taux, tauy, return_missing=False, tie_precision=tie_precision)
    else:
        probabilities = np.asarray(data)
        probabilities = probabilities[probabilities>0]

    if normalized==True and base in [2, '2']:        
        smax = np.log2(float(np.math.factorial(dx*dy)))
        s    = -np.sum(probabilities*np.log2(probabilities))
        return s/smax
         
    elif normalized==True and base=='e':        
        smax = np.log(float(np.math.factorial(dx*dy)))
        s    = -np.sum(probabilities*np.log(probabilities))
        return s/smax
    
    elif normalized==False and base in [2, '2']:
        return -np.sum(probabilities*np.log2(probabilities))
    else:
        return -np.sum(probabilities*np.log(probabilities))

In [29]:
def generate_stastical_features(dataframe):
    '''
    Returns a  list of statistical features such as min,max,range,mean,auto-correlation,permutation entropy for each of the features
    Attributes:
    dataframe - DataFrame consisting of VV,VH and VV/VH for a time period
    '''
    features_list = []
    for index, row in dataframe.iterrows():
        min_vv =min(row[0])
        max_vv = max(row[0])
        range_vv = max_vv - min_vv
        mean_vv = np.mean(row[0])
        correlation_vv = sm.tsa.acf(row[0])[1]
        permutation_entropy_vv = permutation_entropy(row[0], dx=6,base=2, normalized=True) 
        
    
        min_vh = min(row[1])
        max_vh = max(row[1])
        range_vh = max_vh - min_vh
        mean_vh = np.mean(row[1])
        correlation_vh = sm.tsa.acf(row[1])[1]
        permutation_entropy_vh = permutation_entropy(row[1], dx=6, base=2, normalized=True)
    
        min_vv_by_vh = min(row[2])
        max_vv_by_vh = max(row[2])
        range_vv_by_vh = max_vv_by_vh - min_vv_by_vh
        mean_vv_by_vh = np.mean(row[2])
        correlation_vv_by_vh = sm.tsa.acf(row[2])[1]
        permutation_entropy_vv_by_vh = permutation_entropy(row[2], dx=6, base=2, normalized=True)
        
        
    
        features_list.append([min_vv, max_vv, range_vv, mean_vv, correlation_vv, permutation_entropy_vv,
                          min_vh, max_vh, range_vh,  mean_vh, correlation_vh, permutation_entropy_vh,
                          min_vv_by_vh,  max_vv_by_vh, range_vv_by_vh, mean_vv_by_vh, correlation_vv_by_vh, permutation_entropy_vv_by_vh])
    return features_list

In [30]:
# Generating Statistical Features for VV,VH and VV/VH and creating a dataframe
features = generate_stastical_features(vh_vv_data)
features_data = pd.DataFrame(features ,columns = ['min_vv', 'max_vv', 'range_vv', 'mean_vv', 'correlation_vv', 'permutation_entropy_vv',
                          'min_vh', 'max_vh', 'range_vh', 'mean_vh', 'correlation_vh', 'permutation_entropy_vh',
                          'min_vv_by_vh',  'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh', 'permutation_entropy_vv_by_vh'] )

In [31]:
features_data

Unnamed: 0,min_vv,max_vv,range_vv,mean_vv,correlation_vv,permutation_entropy_vv,min_vh,max_vh,range_vh,mean_vh,correlation_vh,permutation_entropy_vh,min_vv_by_vh,max_vv_by_vh,range_vv_by_vh,mean_vv_by_vh,correlation_vv_by_vh,permutation_entropy_vv_by_vh
0,0.017408,0.357296,0.339888,0.111277,0.621781,0.333963,0.002698,0.051663,0.048966,0.021944,0.042602,0.333963,1.130426,24.872854,23.742429,6.721199,0.342336,0.333963
1,0.053835,0.294100,0.240265,0.166436,0.186905,0.333963,0.016737,0.072512,0.055774,0.038232,0.201433,0.333963,1.580250,7.805157,6.224906,4.583483,-0.212126,0.310551
2,0.016936,0.627878,0.610943,0.214448,0.200432,0.333963,0.008219,0.114186,0.105967,0.036390,0.026166,0.333963,0.994616,46.363490,45.368874,9.063073,0.171531,0.333963
3,0.023625,0.335494,0.311869,0.116675,0.219432,0.333963,0.001772,0.082565,0.080793,0.025339,0.192489,0.333963,1.664036,28.349087,26.685050,7.416280,0.502445,0.333963
4,0.053390,0.859671,0.806281,0.169892,-0.002048,0.389854,0.007482,0.098161,0.090679,0.033238,-0.039257,0.389854,1.338847,29.232963,27.894115,6.989110,0.520591,0.389854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,0.014901,0.435711,0.420810,0.133584,0.249518,0.349977,0.002414,0.048116,0.045702,0.025570,0.073995,0.349977,1.005684,37.192317,36.186633,7.713123,0.356639,0.349977
553,0.020668,0.222888,0.202220,0.093267,0.438803,0.349977,0.003157,0.080506,0.077349,0.026299,-0.053630,0.349977,1.310433,39.271457,37.961023,7.178713,-0.027531,0.349977
554,0.017418,0.401048,0.383630,0.144286,0.273595,0.349977,0.003225,0.114566,0.111341,0.036457,0.350006,0.349977,1.365598,10.402822,9.037224,4.790258,0.326386,0.349977
555,0.019068,0.321706,0.302638,0.110304,0.327969,0.349977,0.002706,0.063610,0.060904,0.026320,0.501659,0.349977,0.852697,22.231642,21.378945,5.848876,0.194271,0.349977


In [32]:
def combine_two_datasets(dataset1,dataset2):
    '''
    Returns a  vertically concatenated dataset.
    Attributes:
    dataset1 - Dataset 1 to be combined 
    dataset2 - Dataset 2 to be combined
    '''
    data = pd.concat([dataset1,dataset2], axis=1)
    return data

In [33]:
crop_data = combine_two_datasets(crop_yield_data,features_data)
crop_data

Unnamed: 0,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Rice Yield (kg/ha),min_vv,max_vv,...,range_vh,mean_vh,correlation_vh,permutation_entropy_vh,min_vv_by_vh,max_vv_by_vh,range_vv_by_vh,mean_vv_by_vh,correlation_vv_by_vh,permutation_entropy_vv_by_vh
0,Chau_Phu,10.510542,105.248554,SA,T,15-07-2022,3.40,5500,0.017408,0.357296,...,0.048966,0.021944,0.042602,0.333963,1.130426,24.872854,23.742429,6.721199,0.342336,0.333963
1,Chau_Phu,10.509150,105.265098,SA,T,15-07-2022,2.43,6000,0.053835,0.294100,...,0.055774,0.038232,0.201433,0.333963,1.580250,7.805157,6.224906,4.583483,-0.212126,0.310551
2,Chau_Phu,10.467721,105.192464,SA,D,15-07-2022,1.95,6400,0.016936,0.627878,...,0.105967,0.036390,0.026166,0.333963,0.994616,46.363490,45.368874,9.063073,0.171531,0.333963
3,Chau_Phu,10.494453,105.241281,SA,T,15-07-2022,4.30,6000,0.023625,0.335494,...,0.080793,0.025339,0.192489,0.333963,1.664036,28.349087,26.685050,7.416280,0.502445,0.333963
4,Chau_Phu,10.535058,105.252744,SA,D,14-07-2022,3.30,6400,0.053390,0.859671,...,0.090679,0.033238,-0.039257,0.389854,1.338847,29.232963,27.894115,6.989110,0.520591,0.389854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,Thoai_Son,10.364419,105.164984,WS,T,12-04-2022,7.80,6640,0.014901,0.435711,...,0.045702,0.025570,0.073995,0.349977,1.005684,37.192317,36.186633,7.713123,0.356639,0.349977
553,Thoai_Son,10.358094,105.189541,WS,T,12-04-2022,2.00,7200,0.020668,0.222888,...,0.077349,0.026299,-0.053630,0.349977,1.310433,39.271457,37.961023,7.178713,-0.027531,0.349977
554,Thoai_Son,10.368014,105.238516,WS,T,12-04-2022,6.20,7200,0.017418,0.401048,...,0.111341,0.036457,0.350006,0.349977,1.365598,10.402822,9.037224,4.790258,0.326386,0.349977
555,Thoai_Son,10.275419,105.234563,WS,T,20-04-2022,3.00,6400,0.019068,0.321706,...,0.060904,0.026320,0.501659,0.349977,0.852697,22.231642,21.378945,5.848876,0.194271,0.349977


##Model Building

In [34]:
crop_data = crop_data[['min_vv', 'max_vv', 'range_vv', 'mean_vv', 'correlation_vv', 'permutation_entropy_vv',
                          'min_vh', 'max_vh', 'range_vh', 'mean_vh', 'correlation_vh', 'permutation_entropy_vh',
                          'min_vv_by_vh',  'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh', 'permutation_entropy_vv_by_vh','Rice Yield (kg/ha)']]

In [35]:
crop_data

Unnamed: 0,min_vv,max_vv,range_vv,mean_vv,correlation_vv,permutation_entropy_vv,min_vh,max_vh,range_vh,mean_vh,correlation_vh,permutation_entropy_vh,min_vv_by_vh,max_vv_by_vh,range_vv_by_vh,mean_vv_by_vh,correlation_vv_by_vh,permutation_entropy_vv_by_vh,Rice Yield (kg/ha)
0,0.017408,0.357296,0.339888,0.111277,0.621781,0.333963,0.002698,0.051663,0.048966,0.021944,0.042602,0.333963,1.130426,24.872854,23.742429,6.721199,0.342336,0.333963,5500
1,0.053835,0.294100,0.240265,0.166436,0.186905,0.333963,0.016737,0.072512,0.055774,0.038232,0.201433,0.333963,1.580250,7.805157,6.224906,4.583483,-0.212126,0.310551,6000
2,0.016936,0.627878,0.610943,0.214448,0.200432,0.333963,0.008219,0.114186,0.105967,0.036390,0.026166,0.333963,0.994616,46.363490,45.368874,9.063073,0.171531,0.333963,6400
3,0.023625,0.335494,0.311869,0.116675,0.219432,0.333963,0.001772,0.082565,0.080793,0.025339,0.192489,0.333963,1.664036,28.349087,26.685050,7.416280,0.502445,0.333963,6000
4,0.053390,0.859671,0.806281,0.169892,-0.002048,0.389854,0.007482,0.098161,0.090679,0.033238,-0.039257,0.389854,1.338847,29.232963,27.894115,6.989110,0.520591,0.389854,6400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
552,0.014901,0.435711,0.420810,0.133584,0.249518,0.349977,0.002414,0.048116,0.045702,0.025570,0.073995,0.349977,1.005684,37.192317,36.186633,7.713123,0.356639,0.349977,6640
553,0.020668,0.222888,0.202220,0.093267,0.438803,0.349977,0.003157,0.080506,0.077349,0.026299,-0.053630,0.349977,1.310433,39.271457,37.961023,7.178713,-0.027531,0.349977,7200
554,0.017418,0.401048,0.383630,0.144286,0.273595,0.349977,0.003225,0.114566,0.111341,0.036457,0.350006,0.349977,1.365598,10.402822,9.037224,4.790258,0.326386,0.349977,7200
555,0.019068,0.321706,0.302638,0.110304,0.327969,0.349977,0.002706,0.063610,0.060904,0.026320,0.501659,0.349977,0.852697,22.231642,21.378945,5.848876,0.194271,0.349977,6400


In [36]:
from sklearn.ensemble import AdaBoostRegressor

In [37]:
X = crop_data.drop(columns=['Rice Yield (kg/ha)']).values
y = crop_data ['Rice Yield (kg/ha)'].values
# Choose any random state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=21)

In [38]:
# Create an AdaBoost regressor
regressor = AdaBoostRegressor()

# Train the model on the training set
regressor.fit(X_train, y_train)



AdaBoostRegressor()

#Model Evaluation

In [39]:
insample_predictions = regressor.predict(X_train)
print("Insample R2 Score: {0:.2f}".format(r2_score(y_train,insample_predictions)))

Insample R2 Score: 0.69


In [40]:
outsample_predictions = regressor.predict(X_test)
print("Outsample R2 Score: {0:.2f}".format(r2_score(y_test,outsample_predictions)))

Outsample R2 Score: 0.50


#Submission

In [41]:
test_file = pd.read_csv('challenge_2_submission_template.csv')
test_file

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Predicted Rice Yield (kg/ha)
0,1,Chau_Phu,10.542192,105.187920,WS,T,10-04-2022,1.40,
1,2,Chau_Thanh,10.400189,105.331053,SA,T,15-07-2022,1.32,
2,3,Chau_Phu,10.505489,105.203926,SA,D,14-07-2022,1.40,
3,4,Chau_Phu,10.523520,105.138274,WS,D,10-04-2022,1.80,
4,5,Thoai_Son,10.294660,105.248528,SA,T,20-07-2022,2.20,
...,...,...,...,...,...,...,...,...,...
95,96,Chau_Thanh,10.435839,105.132981,SA,D,26-07-2022,1.21,
96,97,Chau_Phu,10.529357,105.147388,WS,T,10-04-2022,2.00,
97,98,Chau_Thanh,10.452537,105.205118,SA,T,20-07-2022,5.50,
98,99,Chau_Thanh,10.394341,105.126836,SA,T,14-07-2022,4.40,


In [42]:
submission_vh_vv_data=pd.read_csv("submission_vh_vv_L2.csv")
submission_vh_vv_data

Unnamed: 0,vv_list,vh_list,vv/vh_list
0,"[0.023064468055963516, 0.26543858647346497, 0....","[0.006820742506533861, 0.006359191611409187, 0...","[3.3815186592763387, 41.740932290392834, 17.31..."
1,"[0.23529212176799774, 0.10290589183568954, 0.0...","[0.03147885203361511, 0.053898051381111145, 0....","[7.474609350961653, 1.9092692444119317, 4.3450..."
2,"[0.06630066782236099, 0.3092383146286011, 0.72...","[0.006729983724653721, 0.02353968843817711, 0....","[9.851534644799216, 13.13689072141977, 35.1443..."
3,"[0.3715355694293976, 0.04534188285470009, 0.12...","[0.01624951884150505, 0.019233113154768944, 0....","[22.864404358879188, 2.357490567950896, 6.3275..."
4,"[0.28587397933006287, 0.1200699508190155, 0.22...","[0.013317599892616272, 0.017110824584960938, 0...","[21.465878359099907, 7.017192550997659, 14.476..."
...,...,...,...
95,"[0.15140028297901154, 0.1215878576040268, 0.20...","[0.0481574721634388, 0.019104361534118652, 0.0...","[3.1438585992467187, 6.364403091246046, 10.545..."
96,"[0.03678244724869728, 0.2085420787334442, 0.08...","[0.0031968739349395037, 0.011263172142207623, ...","[11.505754683252261, 18.51539478402833, 11.077..."
97,"[0.15271368622779846, 0.034302275627851486, 0....","[0.04119793325662613, 0.013300511054694653, 0....","[3.706828817759603, 2.579019369014688, 2.17305..."
98,"[0.0686613917350769, 0.33833763003349304, 0.25...","[0.0037127695977687836, 0.05765503644943237, 0...","[18.493308008215614, 5.868310053541282, 14.554..."


In [43]:
import ast
# convert string to list
submission_vh_vv_data['vv_list'] = submission_vh_vv_data['vv_list'].apply(ast.literal_eval)
submission_vh_vv_data['vh_list'] = submission_vh_vv_data['vh_list'].apply(ast.literal_eval)
submission_vh_vv_data['vv/vh_list'] = submission_vh_vv_data['vv/vh_list'].apply(ast.literal_eval)
print(submission_vh_vv_data)

                                              vv_list  \
0   [0.023064468055963516, 0.26543858647346497, 0....   
1   [0.23529212176799774, 0.10290589183568954, 0.0...   
2   [0.06630066782236099, 0.3092383146286011, 0.72...   
3   [0.3715355694293976, 0.04534188285470009, 0.12...   
4   [0.28587397933006287, 0.1200699508190155, 0.22...   
..                                                ...   
95  [0.15140028297901154, 0.1215878576040268, 0.20...   
96  [0.03678244724869728, 0.2085420787334442, 0.08...   
97  [0.15271368622779846, 0.034302275627851486, 0....   
98  [0.0686613917350769, 0.33833763003349304, 0.25...   
99  [0.166156604886055, 0.05066028982400894, 0.099...   

                                              vh_list  \
0   [0.006820742506533861, 0.006359191611409187, 0...   
1   [0.03147885203361511, 0.053898051381111145, 0....   
2   [0.006729983724653721, 0.02353968843817711, 0....   
3   [0.01624951884150505, 0.019233113154768944, 0....   
4   [0.013317599892616272, 0.0

In [44]:
# Generating Statistical Features for VV,VH and VV/VH and creating a dataframe
features = generate_stastical_features(submission_vh_vv_data)
submission_features_data = pd.DataFrame(features ,columns = ['min_vv', 'max_vv', 'range_vv', 'mean_vv', 'correlation_vv', 'permutation_entropy_vv',
                          'min_vh', 'max_vh', 'range_vh', 'mean_vh', 'correlation_vh', 'permutation_entropy_vh',
                          'min_vv_by_vh',  'max_vv_by_vh', 'range_vv_by_vh', 'mean_vv_by_vh', 'correlation_vv_by_vh', 'permutation_entropy_vv_by_vh'] )

In [45]:
#Making predictions
final_predictions = regressor.predict(submission_features_data.values)
final_prediction_series = pd.Series(final_predictions)

In [46]:
#Combining the results into dataframe
test_file['Predicted Rice Yield (kg/ha)']=list(final_prediction_series)
test_file

Unnamed: 0,ID No,District,Latitude,Longitude,"Season(SA = Summer Autumn, WS = Winter Spring)","Rice Crop Intensity(D=Double, T=Triple)",Date of Harvest,Field size (ha),Predicted Rice Yield (kg/ha)
0,1,Chau_Phu,10.542192,105.187920,WS,T,10-04-2022,1.40,7209.172932
1,2,Chau_Thanh,10.400189,105.331053,SA,T,15-07-2022,1.32,6009.090909
2,3,Chau_Phu,10.505489,105.203926,SA,D,14-07-2022,1.40,6015.294118
3,4,Chau_Phu,10.523520,105.138274,WS,D,10-04-2022,1.80,7385.486726
4,5,Thoai_Son,10.294660,105.248528,SA,T,20-07-2022,2.20,5906.666667
...,...,...,...,...,...,...,...,...,...
95,96,Chau_Thanh,10.435839,105.132981,SA,D,26-07-2022,1.21,5857.391304
96,97,Chau_Phu,10.529357,105.147388,WS,T,10-04-2022,2.00,6993.333333
97,98,Chau_Thanh,10.452537,105.205118,SA,T,20-07-2022,5.50,5957.777778
98,99,Chau_Thanh,10.394341,105.126836,SA,T,14-07-2022,4.40,6030.800000


In [47]:
#Dumping the predictions into a csv file.
test_file.to_csv("challenge_2_submission_rice_crop_yield_prediction_adaboost.csv",index = False)