In [23]:
# mount drive
# Attach Google Drive to the Colab runtime at /drive; the weather CSV that is
# read further down is addressed through this mount point.
from google.colab import drive
drive.mount("/drive") 

Mounted at /drive


In [24]:
# imports
import numpy as np
import pandas as pd

In [25]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
import plotly.io as pio

pio.templates.default = "plotly_white"

In [26]:
# load data
# Per the author's note, the weather files of all locations hold the same
# records, so a single location (Sidi Slimane) is read.
weather_csv_path = "/drive/My Drive/Colab Notebooks/AgriEdge/datasets/weather_SidiSLIMAN.csv"
data = pd.read_csv(weather_csv_path)

In [27]:
# function to classify crop years

# d: anything pd.Timestamp accepts (e.g. a "YYYY-MM-DD" string)
def classify_date(d):
  """Return the wheat crop year that the date ``d`` belongs to.

  The wheat season runs from 15 October of one calendar year to 10 July of
  the following one (both days included) and is labelled with the calendar
  year in which it ends.

  Parameters
  ----------
  d : str or datetime-like
    Any value accepted by ``pd.Timestamp``.

  Returns
  -------
  str
    The crop year as a string (e.g. ``"1982"`` for 1981-10-15), or
    ``"not classified"`` for dates that fall between two seasons
    (11 July to 14 October) and for missing dates.
  """
  d = pd.Timestamp(d)

  # A missing date cannot belong to any season; report it with the same
  # sentinel as out-of-season dates instead of failing on "nan-10-15".
  if pd.isna(d):
    return "not classified"

  # wheat calendar as (month, day); the season wraps over the new year
  SEASON_START = (10, 15)
  SEASON_END = (7, 10)

  # Comparing (month, day) tuples avoids building four timestamps per call
  # and ignores the time of day, so e.g. "2000-07-10 12:00" still counts as
  # the last day of the season.
  month_day = (d.month, d.day)

  # on or after 15 October: first part of the season that ends next year
  if month_day >= SEASON_START:
    return str(d.year + 1)

  # on or before 10 July: second part of the season that ends this year
  if month_day <= SEASON_END:
    return str(d.year)

  return "not classified"

In [28]:
# Label every row with its wheat-calendar crop year (new column "crop_year")
crop_year_labels = data["DATE"].map(classify_date)
data["crop_year"] = crop_year_labels

In [29]:
# inspect the frame; the new crop_year column is the last one
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year
0,1981-01-01,-99.00,0.01,60.26,8.33,0.88,17.79,2.72,1.56,1981
1,1981-01-02,-99.00,0.00,63.23,8.95,1.98,16.00,4.81,1.06,1981
2,1981-01-03,-99.00,0.02,63.74,9.07,2.28,16.37,4.38,1.01,1981
3,1981-01-04,-99.00,0.07,71.77,9.02,3.96,15.47,4.96,0.94,1981
4,1981-01-05,-99.00,8.27,82.58,8.56,5.62,14.20,5.79,0.83,1981
...,...,...,...,...,...,...,...,...,...,...
14636,2021-01-27,13.29,0.03,84.69,15.97,13.40,22.84,11.84,1.10,2021
14637,2021-01-28,13.48,0.03,76.27,16.82,12.67,25.10,11.92,1.17,2021
14638,2021-01-29,11.68,0.07,81.22,15.60,12.41,22.60,11.78,1.38,2021
14639,2021-01-30,6.85,1.14,87.00,14.16,12.02,19.42,10.39,2.94,2021


In [30]:
# remove ****-02-29 from crop years
# Leap days are dropped so that every crop year ends up with the same number
# of rows. Matching on the "-02-29" suffix of the DATE strings catches every
# leap day present in the file rather than only those inside a hard-coded
# 1981-2020 range; na=False keeps rows with a missing DATE out of the match.
# select index
is_leap_day = data["DATE"].str.endswith("-02-29", na=False)
index_to_drop = data.loc[is_leap_day].index

# drop
data.drop(index_to_drop, inplace=True)

In [31]:
# discard the rows that cannot be used:
#   - crop years 1981 and 2021, which the data covers only partially
#   - rows labelled "not classified" (dates outside the wheat season)
labels_to_discard = ["1981", "2021", "not classified"]

# index labels of the affected rows
has_discarded_label = data["crop_year"].isin(labels_to_discard)
index_to_drop = data.loc[has_discarded_label].index

# remove them from the frame in place
data.drop(index=index_to_drop, inplace=True)

In [32]:
# sanity check: number of rows left per crop year
# (the next step relies on every crop year having the same count)
data["crop_year"].value_counts()

2007    269
2000    269
1989    269
2005    269
2003    269
2019    269
1994    269
1984    269
2010    269
2008    269
2016    269
1986    269
2009    269
2006    269
2001    269
2011    269
2017    269
1990    269
1987    269
1988    269
1993    269
1999    269
1998    269
1982    269
2015    269
1985    269
2018    269
1983    269
1992    269
2013    269
1995    269
1991    269
2014    269
2002    269
2004    269
1996    269
2012    269
2020    269
1997    269
Name: crop_year, dtype: int64

In [33]:
# create col of days
# for each crop year
#
# Number the rows of each crop year 1, 2, 3, ... in their current order
# (assumed chronological, as loaded from the file).
# cumcount() counts within each crop_year group and aligns on the index, so
# the result does not depend on every crop year having the same length, on the
# rows of a year being contiguous, or on a positional lookup into
# value_counts() (deprecated for label-indexed Series in pandas 2.x).
data["day"] = data.groupby("crop_year").cumcount() + 1

In [34]:
# daily average temperature: midpoint between the daily maximum and minimum
daily_max_plus_min = data["T2M_MAX"].add(data["T2M_MIN"])
data["T2M_AVG"] = daily_max_plus_min.div(2)

In [35]:
# create GDD
# Running total of the daily average temperature, restarted at the first row
# of every crop year.
# groupby(...).cumsum() returns a Series aligned on the frame's index, so each
# value lands on its own row even if the rows are not grouped in sorted
# crop-year order (a positional write of concatenated per-year sums is not).
# It also avoids re-filtering the whole frame once per crop year.
# NOTE(review): growing degree days are usually accumulated above a
# crop-specific base temperature; none is subtracted here - confirm intended.
data["GDD"] = data.groupby("crop_year")["T2M_AVG"].cumsum()

In [36]:
# inspect the frame after adding the day, T2M_AVG and GDD columns
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year,day,T2M_AVG,GDD
287,1981-10-15,-99.00,0.20,27.83,25.73,5.07,35.07,19.40,1.93,1982,1,27.235,27.235
288,1981-10-16,-99.00,0.14,46.26,23.85,11.38,30.86,17.82,2.78,1982,2,24.340,51.575
289,1981-10-17,-99.00,0.06,52.67,22.69,12.19,30.74,16.74,1.72,1982,3,23.740,75.315
290,1981-10-18,-99.00,0.28,37.36,25.51,9.55,32.46,21.01,3.36,1982,4,26.735,102.050
291,1981-10-19,-99.00,0.02,28.48,27.13,6.88,35.23,21.87,1.90,1982,5,28.550,130.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14431,2020-07-06,28.56,0.00,23.36,32.98,9.31,43.61,23.06,1.75,2020,265,33.335,4798.835
14432,2020-07-07,28.53,0.00,17.79,35.64,7.50,46.11,25.73,1.62,2020,266,35.920,4834.755
14433,2020-07-08,27.94,0.00,35.64,29.69,12.81,37.05,21.06,3.17,2020,267,29.055,4863.810
14434,2020-07-09,28.70,0.00,46.51,25.46,13.18,35.16,18.38,2.49,2020,268,26.770,4890.580


In [37]:
# create cumulative precipitation
# Running total of PRECTOT, restarted at the first row of every crop year.
# groupby(...).cumsum() returns a Series aligned on the frame's index, so each
# value lands on its own row even if the rows are not grouped in sorted
# crop-year order, and the frame is not re-filtered once per crop year.
data["cumulative_PREC"] = data.groupby("crop_year")["PRECTOT"].cumsum()

In [38]:
# inspect the frame after adding the cumulative_PREC column
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year,day,T2M_AVG,GDD,cumulative_PREC
287,1981-10-15,-99.00,0.20,27.83,25.73,5.07,35.07,19.40,1.93,1982,1,27.235,27.235,0.20
288,1981-10-16,-99.00,0.14,46.26,23.85,11.38,30.86,17.82,2.78,1982,2,24.340,51.575,0.34
289,1981-10-17,-99.00,0.06,52.67,22.69,12.19,30.74,16.74,1.72,1982,3,23.740,75.315,0.40
290,1981-10-18,-99.00,0.28,37.36,25.51,9.55,32.46,21.01,3.36,1982,4,26.735,102.050,0.68
291,1981-10-19,-99.00,0.02,28.48,27.13,6.88,35.23,21.87,1.90,1982,5,28.550,130.600,0.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14431,2020-07-06,28.56,0.00,23.36,32.98,9.31,43.61,23.06,1.75,2020,265,33.335,4798.835,431.47
14432,2020-07-07,28.53,0.00,17.79,35.64,7.50,46.11,25.73,1.62,2020,266,35.920,4834.755,431.47
14433,2020-07-08,27.94,0.00,35.64,29.69,12.81,37.05,21.06,3.17,2020,267,29.055,4863.810,431.47
14434,2020-07-09,28.70,0.00,46.51,25.46,13.18,35.16,18.38,2.49,2020,268,26.770,4890.580,431.47


In [42]:
# visualize

# column drawn on the y axis
col = "GDD"

# keep only a handful of crop years so the chart stays readable
years_to_show = ["1985", "1990", "1995", "2000", "2005", "2010"]
is_shown_year = data["crop_year"].isin(years_to_show)
data_to_plot = data.loc[is_shown_year]

# one line per crop year, plotted against the day within the season
fig = px.line(
  data_to_plot,
  x="day",
  y=col,
  color="crop_year",
)

fig.show()

In [40]:
# # save dataset

# path = "/drive/My Drive/Colab Notebooks/AgriEdge/datasets/processed_weather.csv"
# data.to_csv(path, index=False)