In [2]:
# Mount Google Drive at /drive (Colab only); the weather CSV is read from there.
from google.colab import drive
drive.mount("/drive") 

Mounted at /drive


In [3]:
# imports
import numpy as np
import pandas as pd

In [4]:
# imports for plots
from plotly import graph_objects as go
import plotly.express as px
import plotly.io as pio

# make "plotly_white" the default plotly template
pio.templates.default = "plotly_white"

In [5]:
# Load the weather table.
# All weather locations are the same, so the file of a single location is read.
weather_csv_path = "/drive/My Drive/Colab Notebooks/AgriEdge/weather_dataset/34.269355;-5.938411.csv"
data = pd.read_csv(weather_csv_path)

In [6]:
# display the loaded table (the notebook shows a truncated preview)
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M
0,1981-01-01,-99.00,0.01,60.26,8.33,0.88,17.79,2.72,1.56
1,1981-01-02,-99.00,0.00,63.23,8.95,1.98,16.00,4.81,1.06
2,1981-01-03,-99.00,0.02,63.74,9.07,2.28,16.37,4.38,1.01
3,1981-01-04,-99.00,0.07,71.77,9.02,3.96,15.47,4.96,0.94
4,1981-01-05,-99.00,8.27,82.58,8.56,5.62,14.20,5.79,0.83
...,...,...,...,...,...,...,...,...,...
14605,2020-12-27,11.50,0.12,71.61,11.77,6.86,19.50,6.95,1.85
14606,2020-12-28,7.39,15.52,82.65,11.16,8.32,15.01,8.38,3.78
14607,2020-12-29,5.37,70.87,86.82,11.39,9.26,13.33,9.94,2.36
14608,2020-12-30,11.45,6.48,72.77,9.91,5.29,15.26,5.96,1.52


In [7]:
# function to classify crop years

# d as a string
def classify_date(d):
  """Return the wheat crop year that the date `d` belongs to.

  A crop year runs from 15 October of one calendar year to 10 July of the
  next one (both days included) and is named after the calendar year in
  which it ends.

  Parameters
  ----------
  d : str or datetime-like
      Anything `pd.Timestamp` accepts, e.g. "1999-10-15". A time-of-day
      component is ignored.

  Returns
  -------
  str
      The crop year as a string (e.g. "2000"), or "not classified" when the
      date falls between two seasons (11 July - 14 October) or is missing.
  """
  d = pd.Timestamp(d)

  # a missing date (NaT) cannot belong to any season
  if pd.isna(d):
    return "not classified"

  # wheat calendar, as (month, day)
  START_DATE = (10, 15)
  END_DATE = (7, 10)

  # Comparing on (month, day) avoids building and parsing date strings on
  # every call, and keeps a timestamp such as "2000-07-10 18:30" inside the
  # season: compared against midnight of 10 July it would fall just outside.
  month_day = (d.month, d.day)

  # 15 Oct .. 31 Dec: the season that ends in the following calendar year
  if month_day >= START_DATE:
    return str(d.year + 1)

  # 1 Jan .. 10 Jul: the season that ends in this calendar year
  if month_day <= END_DATE:
    return str(d.year)

  return "not classified"

In [8]:
# quick check: 14 August lies between the 10 July end and the 15 October start
# of the wheat calendar, so this date should come back as "not classified"
da = pd.Timestamp("1981-08-14")

classify_date(da)

'not classified'

In [9]:
# Label every row with the crop year its DATE falls in
# (classify_date answers "not classified" for dates outside the season).
crop_year_labels = data["DATE"].map(classify_date)
data["crop_year"] = crop_year_labels

In [10]:
# check the new crop_year column
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year
0,1981-01-01,-99.00,0.01,60.26,8.33,0.88,17.79,2.72,1.56,1981
1,1981-01-02,-99.00,0.00,63.23,8.95,1.98,16.00,4.81,1.06,1981
2,1981-01-03,-99.00,0.02,63.74,9.07,2.28,16.37,4.38,1.01,1981
3,1981-01-04,-99.00,0.07,71.77,9.02,3.96,15.47,4.96,0.94,1981
4,1981-01-05,-99.00,8.27,82.58,8.56,5.62,14.20,5.79,0.83,1981
...,...,...,...,...,...,...,...,...,...,...
14605,2020-12-27,11.50,0.12,71.61,11.77,6.86,19.50,6.95,1.85,2021
14606,2020-12-28,7.39,15.52,82.65,11.16,8.32,15.01,8.38,3.78,2021
14607,2020-12-29,5.37,70.87,86.82,11.39,9.26,13.33,9.94,2.36,2021
14608,2020-12-30,11.45,6.48,72.77,9.91,5.29,15.26,5.96,1.52,2021


In [11]:
# # add GDD := growing degrees days
# # cumulative temperature

# # create avr temp col then cumul
# data["T2M_AVG"] = (data["T2M_MAX"] + data["T2M_MIN"]) / 2
# data["GDD"] = data["T2M_AVG"].cumsum()

In [12]:
# data

In [13]:
# create col for month and day

In [14]:
# Map each crop year label to an independent copy of its rows.
# groupby walks the labels in sorted order and hands back the matching rows,
# so the frame is split in one pass instead of being filtered once per label.
dict_crop_years = {
  label: rows.copy()
  for label, rows in data.groupby("crop_year")
}



In [15]:
# Plot T2M_MAX against the calendar date for a few crop years
# (one line per crop year).

arr_years = ["1990", "1995", "2000"]

# keep only the rows whose crop year is one of the selected ones
is_selected_year = data["crop_year"].isin(arr_years)
data_to_plot = data[is_selected_year]

fig = px.line(data_to_plot, x="DATE", y="T2M_MAX", color="crop_year")
fig.show()

In [16]:
# inspect the rows of crop year 1999: number of entries and column dtypes
data[data["crop_year"] == "1999"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 269 entries, 6496 to 6764
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DATE               269 non-null    object 
 1   ALLSKY_SFC_SW_DWN  269 non-null    float64
 2   PRECTOT            269 non-null    float64
 3   RH2M               269 non-null    float64
 4   T2M                269 non-null    float64
 5   T2MDEW             269 non-null    float64
 6   T2M_MAX            269 non-null    float64
 7   T2M_MIN            269 non-null    float64
 8   WS2M               269 non-null    float64
 9   crop_year          269 non-null    object 
dtypes: float64(8), object(2)
memory usage: 23.1+ KB


In [17]:
# inspect the rows of crop year 2000: number of entries and column dtypes
data[data["crop_year"] == "2000"].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 6861 to 7130
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DATE               270 non-null    object 
 1   ALLSKY_SFC_SW_DWN  270 non-null    float64
 2   PRECTOT            270 non-null    float64
 3   RH2M               270 non-null    float64
 4   T2M                270 non-null    float64
 5   T2MDEW             270 non-null    float64
 6   T2M_MAX            270 non-null    float64
 7   T2M_MIN            270 non-null    float64
 8   WS2M               270 non-null    float64
 9   crop_year          270 non-null    object 
dtypes: float64(8), object(2)
memory usage: 23.2+ KB


In [18]:
# number of rows per crop_year label
data["crop_year"].value_counts()

not classified    3840
1992               270
1984               270
1988               270
2008               270
2016               270
1996               270
2012               270
2004               270
2000               270
2020               270
1986               269
2007               269
2006               269
1989               269
1987               269
1997               269
1991               269
2011               269
1985               269
2013               269
1995               269
2014               269
1982               269
2009               269
2015               269
1983               269
2003               269
2017               269
2019               269
2018               269
2005               269
2001               269
1994               269
2010               269
1993               269
1990               269
1999               269
2002               269
1998               269
1981               191
2021                78
Name: crop_year, dtype: int64

In [19]:
# Drop every 29 February row from the frame.
# candidate dates, one per calendar year from 1981 to 2020
arr_date_to_remove = [str(year) + "-02-29" for year in range(1981, 2021)]

# index labels of the rows whose DATE is one of those candidates
is_leap_day = data["DATE"].isin(arr_date_to_remove)
index_to_drop = data.loc[is_leap_day].index

# remove those rows from the frame itself
data.drop(index=index_to_drop, inplace=True)

In [20]:
# number of rows per crop_year label after removing the 29 February rows
data["crop_year"].value_counts()

not classified    3840
1986               269
2020               269
1991               269
2014               269
1988               269
2007               269
2006               269
1989               269
1992               269
1997               269
2015               269
1987               269
2011               269
1985               269
2013               269
2016               269
1995               269
1984               269
2009               269
1982               269
2002               269
2001               269
1983               269
2003               269
2017               269
2008               269
2019               269
2018               269
1996               269
2005               269
1994               269
2004               269
2010               269
1993               269
1990               269
2000               269
2012               269
1999               269
1998               269
1981               191
2021                78
Name: crop_year, dtype: int64

In [21]:
# Keep only complete crop years:
#   - 1981 and 2021 are not complete
#   - "not classified" rows belong to no crop year
labels_to_remove = ["1981", "2021", "not classified"]

# index labels of the rows carrying one of those labels
has_removed_label = data["crop_year"].isin(labels_to_remove)
index_to_drop = data.loc[has_removed_label].index

# remove those rows from the frame itself
data.drop(index=index_to_drop, inplace=True)

In [22]:
# number of rows per crop_year label after dropping the incomplete and unclassified rows
data["crop_year"].value_counts()

1986    269
2020    269
1991    269
2014    269
1988    269
2007    269
2006    269
1989    269
1992    269
1997    269
1982    269
1987    269
2011    269
1985    269
2013    269
2016    269
1995    269
1984    269
2009    269
2002    269
2015    269
2005    269
1983    269
2003    269
2017    269
2008    269
2019    269
2018    269
1996    269
2001    269
1999    269
1994    269
2004    269
2010    269
1993    269
1990    269
2000    269
2012    269
1998    269
Name: crop_year, dtype: int64

In [23]:
# Build the values of the "day" column: the position of a row inside its
# crop year. The list is the sequence 1..nb_days repeated once per crop year.

val_counts = data["crop_year"].value_counts()

# the repetition below is only valid when every crop year has the same length
assert val_counts.nunique() == 1, "crop years have different numbers of rows"

nb_years = len(val_counts)
# .iloc[0] reads the first count by position; val_counts[0] is an integer key
# on a string-labelled Series and only works through pandas' positional
# fallback, which is deprecated since pandas 2.1
nb_days = val_counts.iloc[0]

# days of one crop year: 1 --> nb_days
li_days = list(range(1, nb_days + 1))

# complete column for the data frame: li_days repeated nb_years times
complet_li_days = li_days * nb_years


In [24]:
# add col to data frame
# Number the rows of each crop year 1, 2, 3, ... in their current order.
# cumcount counts inside each group and returns a Series on data's own index,
# so the result does not depend on every crop year having the same number of
# rows, nor on the rows of one year being stored next to each other.
data["day"] = data.groupby("crop_year").cumcount() + 1

In [25]:
# check the day numbering on one crop year (2000)
data[data["crop_year"] == "2000"]

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year,day
6861,1999-10-15,5.47,4.82,77.75,20.47,16.43,25.33,17.40,1.06,2000,1
6862,1999-10-16,14.00,8.58,70.96,21.63,16.11,27.64,17.57,2.07,2000,2
6863,1999-10-17,10.22,1.01,74.75,18.62,14.01,23.85,14.41,2.67,2000,3
6864,1999-10-18,9.36,0.97,74.38,17.90,13.27,23.55,13.54,2.48,2000,4
6865,1999-10-19,10.69,0.78,72.92,18.68,13.67,24.65,14.55,2.03,2000,5
...,...,...,...,...,...,...,...,...,...,...,...
7126,2000-07-06,30.67,0.00,35.25,29.37,11.53,40.45,19.72,1.95,2000,265
7127,2000-07-07,25.96,0.00,36.14,30.19,12.96,39.91,22.40,2.75,2000,266
7128,2000-07-08,26.21,0.00,43.33,27.58,13.70,35.70,21.38,3.18,2000,267
7129,2000-07-09,29.41,0.00,48.05,25.58,13.54,34.70,18.66,2.66,2000,268


In [38]:
# average temperature column: midpoint between T2M_MAX and T2M_MIN
temp_max = data["T2M_MAX"]
temp_min = data["T2M_MIN"]
data["T2M_AVG"] = temp_max.add(temp_min).div(2)


In [66]:
# create GDD: the running sum of T2M_AVG, restarted at the first row of every
# crop year.
# groupby(...).cumsum() returns a Series on data's own index, so every value is
# stored on the row it was computed from. Building one cumulative Series per
# year, concatenating them and assigning the raw values by position is only
# correct while the rows are ordered exactly like the sorted crop year labels,
# and it filters the whole frame once per year.
data["GDD"] = data.groupby("crop_year")["T2M_AVG"].cumsum()

In [67]:
# inspect the frame with the new day, T2M_AVG and GDD columns
data

Unnamed: 0,DATE,ALLSKY_SFC_SW_DWN,PRECTOT,RH2M,T2M,T2MDEW,T2M_MAX,T2M_MIN,WS2M,crop_year,day,T2M_AVG,GDD
287,1981-10-15,-99.00,0.20,27.83,25.73,5.07,35.07,19.40,1.93,1982,1,27.235,27.235
288,1981-10-16,-99.00,0.14,46.26,23.85,11.38,30.86,17.82,2.78,1982,2,24.340,51.575
289,1981-10-17,-99.00,0.06,52.67,22.69,12.19,30.74,16.74,1.72,1982,3,23.740,75.315
290,1981-10-18,-99.00,0.28,37.36,25.51,9.55,32.46,21.01,3.36,1982,4,26.735,102.050
291,1981-10-19,-99.00,0.02,28.48,27.13,6.88,35.23,21.87,1.90,1982,5,28.550,130.600
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14431,2020-07-06,28.56,0.00,23.36,32.98,9.31,43.61,23.06,1.75,2020,265,33.335,4798.835
14432,2020-07-07,28.53,0.00,17.79,35.64,7.50,46.11,25.73,1.62,2020,266,35.920,4834.755
14433,2020-07-08,27.94,0.00,35.64,29.69,12.81,37.05,21.06,3.17,2020,267,29.055,4863.810
14434,2020-07-09,28.70,0.00,46.51,25.46,13.18,35.16,18.38,2.49,2020,268,26.770,4890.580


In [77]:
# Compare how the chosen column evolves over the days of a few crop years
# (one line per crop year).

col = "GDD"

# crop years drawn on the figure
years_to_plot = ["1985", "1990", "1995", "2000", "2005", "2010"]
is_plotted_year = data["crop_year"].isin(years_to_plot)
data_to_plot = data[is_plotted_year]

fig = px.line(data_to_plot, x="day", y=col, color="crop_year")
fig.show()