In [None]:
# Colab environment setup: mount Google Drive, authenticate and initialise
# Earth Engine, then import the plotting / modelling stack and the two
# project-local helper modules.
import ee
from google.colab import drive
#drive.mount('data')
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library.
ee.Initialize(project='abharajan')

import sys
# Extend the module search path with a Drive folder before the project-local
# imports at the bottom of this cell.
# NOTE(review): presumably `utilceda` and `coordinates` live in this folder — confirm.
sys.path.append('/content/drive/My Drive/BiomassApril2024')

import geemap

import matplotlib.pyplot as plt
import seaborn as sns

# PdfPages is used later to collect several figures into one multi-page PDF.
from matplotlib.backends.backend_pdf import PdfPages

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.metrics import r2_score

import numpy as np
import pandas as pd

# Project-local helpers: `uc` supplies the cleaning / EDA / model / plotting
# functions called in the cells below; `cdn` is only referenced in
# commented-out configuration lines in this notebook.
import utilceda as uc
import coordinates as cdn


Mounted at /content/drive


In [None]:
# Notebook configuration: study site, projection labels and I/O folders.

# --- Study site -------------------------------------------------------
# Alternative (India) site lists kept for reference:
#coordinatesList = cdn.clIndia
#placeList = cdn.plIndia

placeList = ['Africa']
coordinatesList = [[38.99458221563848, 6.463386197712386]]

# Number of places configured above
npl = len(placeList)

# --- Projection labels ------------------------------------------------
# Every label is combined with every other one in the cells below.
projNameAr = ['S2', 'CCI', 'general']
nproj = len(projNameAr)

# --- Folders (relative to the working directory) -----------------------
plotFolder = 'drive/MyDrive/projectionStudy/'
dataFolder = 'drive/MyDrive/projectionStudy/dataProjectionStudy/'


# EDA and model fits - write fit results to a file and save EDA plots

In [None]:
# ****************************************************************
# EDA and Models - Saving the figures to one file for each projection
# ****************************************************************
#
# For every (CCI, S2, All) combination of projection labels this cell:
#   1. builds dfC / dfCS from the matching CSV via uc.cleanDF,
#   2. runs uc.modelPred on dfC,
#   3. writes the EDA table, correlation matrix and model results as
#      <plotFolder><place><projectionType>_*.dat,
#   4. saves all figures into <plotFolder><place><projectionType>.pdf.
# Only the first entry of placeList is processed.

for cciProjName in projNameAr:
  for s2ProjName in projNameAr:
    for allProjName in projNameAr:

      projectionType = 'S2' + s2ProjName + 'CCI' + cciProjName + 'All' + allProjName

      # String to use in plot titles
      placeProjection = placeList[0] + ' ' + projectionType

      # make dataframe
      filename = dataFolder + placeList[0] + '_' + projectionType + '.csv'
      dfC, dfCS = uc.cleanDF(filename)

      # Model predictions; entries 2, 3 and 4 of the result are used as the
      # R^2 values, the RMSE values and the model names respectively.
      pred = uc.modelPred(dfC)
      r2Ar = pred[2]
      rmseAr = pred[3]
      modelNames = pred[4]

      # Write results to file; f is the common path prefix of all outputs
      f = plotFolder + placeList[0] + projectionType
      uc.EDAdf(dfC).to_csv(f + '_EDA.dat', mode='w')  # EDA results
      dfC.corr().to_csv(f + '_corr.dat', mode='w')    # correlation matrix
      uc.modelResultsToDF(r2Ar, rmseAr, modelNames).to_csv(f + '_ModelResults.dat', mode='w')  # model results

      # Create plots, placeProjection is string that will be used in plot titles
      figHL, figTL = uc.EDAplotsL(placeProjection, dfC, dfCS)
      figTA, figHA = uc.EDAplotsAll(placeProjection, dfC, dfCS)
      figCorrA, figCorrL = uc.corrPlots(placeProjection, dfC)
      figModels = uc.modelPlots(placeProjection, pred)

      # Figures in the page order of the PDF
      figures = [figTL, figHL, figCorrL, figModels, figHA, figTA, figCorrA]

      # Save figures to file. The context manager closes the PDF even when a
      # savefig call fails, and the finally clause releases every figure so a
      # failure part-way through the 27 combinations does not leak memory.
      pdfFile = f + '.pdf'
      try:
        with PdfPages(pdfFile) as p:
          for fig in figures:
            fig.savefig(p, format='pdf')
      finally:
        for fig in figures:
          plt.close(fig)

# Find the best projections - start here if the result files already exist

In [None]:
# Collect the saved model-fit metrics of every projection combination.
# One entry is appended to each list per combination, so all five lists stay
# aligned index-by-index.

rsqLRAr, rsqXGAr = [], []
rmseLRAr, rmseXGAr = [], []
projTypeAr = []

# (destination list, row label, column label) for each metric pulled from the
# results table, in the order they are read.
metricTargets = (
  (rsqLRAr, 'rsq', 'Linear Regression'),
  (rsqXGAr, 'rsq', 'XGBoost'),
  (rmseLRAr, 'rmse', 'Linear Regression'),
  (rmseXGAr, 'rmse', 'XGBoost'),
)

for cciProjName in projNameAr:
  for s2ProjName in projNameAr:
    for allProjName in projNameAr:
      projectionType = ''.join(['S2', s2ProjName, 'CCI', cciProjName, 'All', allProjName])

      # Path of the results file written by the EDA / model cell
      filePrefix = plotFolder + placeList[0] + projectionType
      resultsFile = filePrefix + '_ModelResults.dat'

      modelDF = uc.EDAModelFilestoDF(resultsFile)

      for targetList, rowLabel, modelLabel in metricTargets:
        targetList.append(modelDF.loc[rowLabel, modelLabel])
      projTypeAr.append(projectionType)

In [None]:
# Combine the per-projection metric lists into a single summary table
resultsAr = dict(
  projType=projTypeAr,
  rsqLR=rsqLRAr,
  rmseLR=rmseLRAr,
  rsqXG=rsqXGAr,
  rmseXG=rmseXGAr,
)

resultsDF = pd.DataFrame(resultsAr)

cols = resultsDF.columns.tolist()
ncol = len(cols)


# Show the six best projections per metric. Column 0 is the projection label
# and is skipped. Odd positions hold R^2 (sorted largest first), even
# positions hold RMSE (sorted smallest first).
for i, metric in enumerate(cols[1:], start=1):
  print(metric)
  sortAscending = (i % 2 == 0)
  ranked = resultsDF.sort_values(metric, ascending=sortAscending)
  print(ranked.head(6), '\n\n')

rsqLR
                     projType     rsqLR     rmseLR     rsqXG     rmseXG
11       S2S2CCICCIAllgeneral  0.682509  28.748846  0.730198  26.501902
2         S2S2CCIS2Allgeneral  0.676108  28.862888  0.726612  26.517293
14      S2CCICCICCIAllgeneral  0.673766  29.183200  0.733052  26.398632
19       S2S2CCIgeneralAllCCI  0.672896  29.155071  0.722098  26.873051
25  S2generalCCIgeneralAllCCI  0.672766  29.300639  0.730266  26.602098
5        S2CCICCIS2Allgeneral  0.671741  29.155244  0.732729  26.307767 


rmseLR
                     projType     rsqLR     rmseLR     rsqXG     rmseXG
11       S2S2CCICCIAllgeneral  0.682509  28.748846  0.730198  26.501902
2         S2S2CCIS2Allgeneral  0.676108  28.862888  0.726612  26.517293
19       S2S2CCIgeneralAllCCI  0.672896  29.155071  0.722098  26.873051
5        S2CCICCIS2Allgeneral  0.671741  29.155244  0.732729  26.307767
14      S2CCICCICCIAllgeneral  0.673766  29.183200  0.733052  26.398632
25  S2generalCCIgeneralAllCCI  0.672766  29.3006

# Write to file!

In [None]:
# Write the summary dataframe comparing different projection types to a file

# Unsorted results table, as CSV
bestProjFileName = plotFolder + placeList[0] + '_bestProj.dat'
resultsDF.to_csv(bestProjFileName, mode='w')

# Full results table sorted by each metric in turn, as plain text.
# The with-block closes the file even if sorting or writing raises.
with open(plotFolder + placeList[0] + '_bestProjSorted.dat', 'w') as bestProjSortFile:
  for i in range(1, ncol):
    print(cols[i], file=bestProjSortFile)
    # R^2 columns are ranked largest-first; every other metric (RMSE) is
    # ranked smallest-first. Deciding by name keeps this correct if the
    # column order changes.
    sortAscending = not cols[i].startswith('rsq')
    ranked = resultsDF.sort_values(cols[i], ascending=sortAscending)
    # to_string() writes every row and column regardless of the pandas
    # display options, which would otherwise truncate or wrap the table.
    print(ranked.to_string(), '\n\n', file=bestProjSortFile)