# User End Pipeline 
### An in-depth look at the full preprocessing and data engineering that happens in the backend

### Importing Libraries

In [1]:
import os
from netCDF4 import Dataset
import numpy as np
import pandas as pd 
from datetime import datetime, timedelta
import pickle

### NC Unpacker

In [2]:
def unpack(path, lat_location, lon_location):
    """Extract one variable's daily series at the grid cell nearest to a point.

    The file is expected to expose 1-D ``lat`` and ``lon`` coordinate
    variables, a ``day`` variable holding day offsets, and a data variable
    indexed as ``[day, lat, lon]``. The data variable is taken to be the last
    entry of ``data.variables``.
    NOTE(review): "last variable is the data variable" and the 1900-01-01
    epoch are assumptions baked into this function -- confirm against the
    files being read.

    Parameters
    ----------
    path : str
        Path of the NetCDF file to read.
    lat_location, lon_location : float
        Coordinates of the point of interest, in the same units as the
        file's ``lat`` / ``lon`` variables.

    Returns
    -------
    pandas.DataFrame or None
        Two columns: ``'Date'`` (datetimes) and one column named after the
        data variable (floats, with masked/missing cells as NaN). ``None`` is
        returned when the file cannot be opened; the error is printed.
    """
    try:
        data = Dataset(path, 'r')
    except Exception as e:
        print(f"An error occurred while opening the file: {str(e)}")
        return None  # Return None to indicate failure

    try:
        variable_name = list(data.variables.keys())[-1]

        # Coordinate axes of the grid.
        lat = data.variables['lat'][:]
        lon = data.variables['lon'][:]

        # Nearest grid cell: index of the smallest squared distance on each axis.
        min_index_lat = int(((lat - lat_location) ** 2).argmin())
        min_index_lon = int(((lon - lon_location) ** 2).argmin())

        # 'day' stores offsets in days from the start date of the 1900 system.
        start_date = datetime(1900, 1, 1)
        dates = [start_date + timedelta(days=int(day))
                 for day in data.variables['day'][:]]

        # Read the whole time series for that cell in a single slice instead of
        # one element per loop iteration, then replace masked (missing) cells
        # with NaN so the column is plain float rather than object dtype.
        series = data.variables[variable_name][:, min_index_lat, min_index_lon]
        values = np.ma.asarray(series, dtype=float).filled(np.nan)
    finally:
        # Always release the file handle, even if reading fails part-way.
        data.close()

    return pd.DataFrame({'Date': dates, variable_name: values})

### Reading the coordinates for the desired state

In [3]:
# Load the per-state reference coordinates. The lookup cell further down reads
# the 'state', 'Latitude' and 'Longitude' columns of this frame.
# NOTE(review): absolute, user-specific path -- the notebook only runs on a
# machine that has this exact file location.
states = pd.read_csv("/Users/ananyagiliyal/Desktop/tester/state_coordinates.csv")
states

Unnamed: 0,ID,state,Latitude,Longitude
0,0,Illinois,40.6331,-89.3985
1,1,Indiana,40.2672,-86.1349
2,2,Iowa,41.878,-93.0977
3,3,Kansas,38.5266,-96.7265
4,4,Minnesota,46.7296,-94.6859
5,5,Missouri,38.5739,-92.6032
6,6,Nebraska,41.4925,-99.9018
7,7,Ohio,40.4173,-82.9071
8,8,South Dakota,44.3683,-100.3509
9,9,Wisconsin,43.7844,-88.7879


In [4]:
# List every state name available for lookup.
states['state'].tolist()

['Illinois',
 'Indiana',
 'Iowa',
 'Kansas',
 'Minnesota',
 'Missouri',
 'Nebraska',
 'Ohio',
 'South Dakota',
 'Wisconsin',
 'North Dakota',
 'Michigan',
 'Kentucky']

In [5]:
user_state = input("Enter the state: ").strip()

# Search for the row whose 'state' matches the user input, ignoring case and
# surrounding whitespace so that e.g. "illinois " still finds "Illinois".
normalized_states = states['state'].str.strip().str.lower()
result = states[normalized_states == user_state.lower()]

# Stop here when nothing matches: continuing would leave `lat` / `lon`
# undefined (or stale from a previous run) and the later cells would either
# crash obscurely or silently use the wrong coordinates.
if result.empty:
    raise ValueError(
        f"State {user_state!r} not found in the DataFrame. "
        f"Available states: {', '.join(states['state'].astype(str))}"
    )

# Extract the coordinates from the first matching row.
lat = float(result['Latitude'].iloc[0])
lon = float(result['Longitude'].iloc[0])
print(f"Latitude: {lat}, Longitude: {lon}")

Latitude: 40.6331, Longitude: -89.3985


In [6]:
from functools import reduce
import os


In [7]:
# Single point of configuration for the yearly NetCDF weather files: change the
# directory or the year here instead of editing nine separate paths.
WEATHER_DIR = "/Users/ananyagiliyal/Desktop/tester/tester_files"
WEATHER_YEAR = 2020


def _weather_file(prefix):
    """Return the path of the ``<prefix>_<WEATHER_YEAR>.nc`` file in WEATHER_DIR."""
    return f"{WEATHER_DIR}/{prefix}_{WEATHER_YEAR}.nc"


max_temp = _weather_file("tmmx")
min_temp = _weather_file("tmmn")
precipitation = _weather_file("pr")
max_humidity = _weather_file("rmax")
min_humidity = _weather_file("rmin")
near_surface_specific_humidity = _weather_file("sph")
vapor_pressure_deficit = _weather_file("vpd")
solar_radiation = _weather_file("srad")
wind_speed = _weather_file("vs")

### Creating a dictionary with our file paths

In [8]:
# Map each output column name to the NetCDF file that supplies it. Insertion
# order is the order in which the files are unpacked and merged.
path_dict = dict(
    max_temp=max_temp,
    min_temp=min_temp,
    precipitation=precipitation,
    max_humidity=max_humidity,
    min_humidity=min_humidity,
    near_surface_specific_humidity=near_surface_specific_humidity,
    vapor_pressure_deficit=vapor_pressure_deficit,
    solar_radiation=solar_radiation,
    wind_speed=wind_speed,
)

### Unpack and Concatenate to get daily weather dataframe

In [9]:
def merge_dataframes(df1, df2):
    """Outer-join two per-variable frames on their shared 'Date' column."""
    return pd.merge(df1, df2, on="Date", how="outer")


# Unpack every file at the selected coordinates and rename its value column
# (the second column returned by `unpack`) to the friendly name from path_dict.
dfs = []
for variable, filepath in path_dict.items():
    df = unpack(filepath, lat, lon)
    if df is None:
        # `unpack` signals an unreadable file with None; fail with the file
        # name instead of an AttributeError on `None.rename`.
        raise RuntimeError(
            f"Could not read '{variable}' data from {filepath}; "
            "see the error printed above."
        )
    df = df.rename(columns={df.columns[1]: variable})
    dfs.append(df)

# Fold the per-variable frames into one daily frame keyed by 'Date'.
combined_df = reduce(merge_dataframes, dfs)

In [10]:
# Show the merged daily weather frame: a 'Date' column plus one column per variable.
combined_df

Unnamed: 0,Date,max_temp,min_temp,precipitation,max_humidity,min_humidity,near_surface_specific_humidity,vapor_pressure_deficit,solar_radiation,wind_speed
0,2020-01-01,281.9,268.8,0.0,95.9,39.7,0.0029,0.33,101.2,6.6
1,2020-01-02,283.1,273.8,0.0,97.5,54.9,0.00425,0.26,94.2,5.4
2,2020-01-03,276.2,272.9,2.5,93.0,81.0,0.0038,0.08,74.4,2.5
3,2020-01-04,274.5,267.7,0.0,100.0,75.6,0.00315,0.04,56.0,4.3
4,2020-01-05,282.4,269.3,0.0,91.8,37.9,0.00288,0.36,91.6,6.4
...,...,...,...,...,...,...,...,...,...,...
361,2020-12-27,283.8,269.4,0.0,100.0,34.2,0.00312,0.38,84.2,6.1
362,2020-12-28,272.4,263.7,0.0,100.0,57.4,0.00219,0.09,72.6,4.4
363,2020-12-29,276.8,266.3,11.4,87.2,37.9,0.0021,0.25,44.9,5.3
364,2020-12-30,276.2,267.0,0.0,100.0,61.1,0.00305,0.09,79.1,6.1


In [11]:
# List the merged frame's column names to confirm every variable was joined in.
combined_df.columns

Index(['Date', 'max_temp', 'min_temp', 'precipitation', 'max_humidity',
       'min_humidity', 'near_surface_specific_humidity',
       'vapor_pressure_deficit', 'solar_radiation', 'wind_speed'],
      dtype='object')

In [12]:
# Check the dtype and non-null count of 'Date'; the weekly transformation uses
# its `.dt` accessor, which needs a datetime column.
combined_df.Date.info()

<class 'pandas.core.series.Series'>
Int64Index: 366 entries, 0 to 365
Series name: Date
Non-Null Count  Dtype         
--------------  -----         
366 non-null    datetime64[ns]
dtypes: datetime64[ns](1)
memory usage: 5.7 KB


### Transforming our daily data into weekly feature columns

In [13]:
# Columns averaged per week (same order as used for the aggregation and pivot).
weekly_mean_columns = [
    'min_humidity', 'max_humidity',
    'min_temp', 'max_temp',
    'vapor_pressure_deficit', 'near_surface_specific_humidity',
    'precipitation', 'solar_radiation',
    'wind_speed',
]

combined_df['Year'] = combined_df['Date'].dt.year
# '%W' is the Monday-based week number of the year (days before the first
# Monday fall in week 0); adding 1 makes it 1-based. The week is stored as a
# string so it can be joined into the flattened column names below, which is
# also why the pivoted columns sort as text ("1", "10", "11", ...).
combined_df['Week'] = (combined_df['Date'].dt.strftime('%W').astype(int) + 1).astype(str)

# Weekly mean of every weather variable.
df_grouped = (
    combined_df
    .groupby(['Year', 'Week'])
    .agg({column: 'mean' for column in weekly_mean_columns})
    .reset_index()
)

# One row per year, one column per (variable, week) pair.
pivot_table = df_grouped.pivot_table(
    index=['Year'],
    columns='Week',
    values=weekly_mean_columns,
).reset_index()
# Flatten the two-level column index into "<variable> <week>" names.
pivot_table.columns = [' '.join(col).strip() for col in pivot_table.columns.values]


In [14]:
# Show the wide weekly frame: one row per year, one "<variable> <week>" column per weekly mean.
pivot_table

Unnamed: 0,Year,max_humidity 1,max_humidity 10,max_humidity 11,max_humidity 12,max_humidity 13,max_humidity 14,max_humidity 15,max_humidity 16,max_humidity 17,...,wind_speed 49,wind_speed 5,wind_speed 50,wind_speed 51,wind_speed 52,wind_speed 53,wind_speed 6,wind_speed 7,wind_speed 8,wind_speed 9
0,2020,95.64,81.471429,87.514286,88.2,99.0,86.542857,86.6,79.342857,83.342857,...,3.657143,2.942857,3.1,3.914286,6.571429,4.575,4.785714,4.185714,4.785714,5.671429


In [15]:
# Spot-check a single weekly feature by its flattened "<variable> <week>" name.
pivot_table['max_humidity 1']

0    95.64
Name: max_humidity 1, dtype: float64

In [16]:
variables = ['min_humidity', 'max_humidity', 'min_temp', 'max_temp', 'vapor_pressure_deficit',
             'near_surface_specific_humidity', 'precipitation', 'solar_radiation', 'wind_speed']

# Inclusive range of week numbers kept as features; weeks 1-21 and 53 were the
# ones removed by the original hard-coded drop list.
FIRST_FEATURE_WEEK = 22
LAST_FEATURE_WEEK = 52

feature_columns = {
    '{} {}'.format(var, week)
    for var in variables
    for week in range(FIRST_FEATURE_WEEK, LAST_FEATURE_WEEK + 1)
}

# Fail early with a readable message if the weather data does not cover every
# required week, rather than handing the model a short feature vector.
missing_columns = sorted(feature_columns - set(pivot_table.columns))
if missing_columns:
    raise ValueError(f"Weekly feature columns missing from pivot_table: {missing_columns}")

# Drop everything that is neither 'Year' nor a wanted feature. Deriving the
# list from the columns actually present avoids a KeyError when a week column
# does not exist, and also removes unexpected extras: with the '%W' + 1
# numbering, a year that starts on a Monday has no week 1 and gains a week 54.
# The surviving columns keep pivot_table's original order.
columns_to_drop = [
    col for col in pivot_table.columns
    if col != 'Year' and col not in feature_columns
]
df_corn = pivot_table.drop(columns=columns_to_drop)
df_corn


Unnamed: 0,Year,max_humidity 22,max_humidity 23,max_humidity 24,max_humidity 25,max_humidity 26,max_humidity 27,max_humidity 28,max_humidity 29,max_humidity 30,...,wind_speed 43,wind_speed 44,wind_speed 45,wind_speed 46,wind_speed 47,wind_speed 48,wind_speed 49,wind_speed 50,wind_speed 51,wind_speed 52
0,2020,86.042857,85.171429,83.4,84.7,87.357143,94.028571,96.371429,92.985714,92.414286,...,4.7,4.542857,5.171429,6.285714,5.528571,4.028571,3.657143,3.1,3.914286,6.571429


In [17]:
# Remove the 'Year' column so that only the weekly weather columns remain.
features = df_corn.drop(columns='Year')
features

Unnamed: 0,max_humidity 22,max_humidity 23,max_humidity 24,max_humidity 25,max_humidity 26,max_humidity 27,max_humidity 28,max_humidity 29,max_humidity 30,max_humidity 31,...,wind_speed 43,wind_speed 44,wind_speed 45,wind_speed 46,wind_speed 47,wind_speed 48,wind_speed 49,wind_speed 50,wind_speed 51,wind_speed 52
0,86.042857,85.171429,83.4,84.7,87.357143,94.028571,96.371429,92.985714,92.414286,93.385714,...,4.7,4.542857,5.171429,6.285714,5.528571,4.028571,3.657143,3.1,3.914286,6.571429


### Now the data is ready to be sent as input to the model!

In [20]:
import joblib

In [21]:
# Load the trained regressor from disk.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
# NOTE(review): absolute, user-specific path; the file name suggests a CatBoost
# regressor for corn -- confirm.
model = joblib.load('/Users/ananyagiliyal/Desktop/tester/Corn_Models/CatBoostRegressorCorn_best_model.joblib')


In [22]:
# Convert the feature frame to a plain NumPy matrix (rows x features); column
# names are dropped, so the model relies purely on column order.
weather_data = features.to_numpy()

In [23]:
# Sanity check: (number of rows, number of feature columns) handed to the model.
weather_data.shape

(1, 279)

In [24]:
# Run the loaded model on the feature matrix; the result is indexed per input
# row (the report cell reads element 0).
predicted_yield = model.predict(weather_data)


In [25]:
# Report the prediction for the single input row. (The bare `predicted_yield`
# expression that preceded this line displayed nothing because it was not the
# last statement of the cell, so it has been removed.)
print(f"Predicted corn yield: {predicted_yield[0]}")

Predicted corn yield: 190.56151023039934
