In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv('./data/dailyclimate.csv')

In [3]:
df = df.drop('Unnamed: 0', axis=1)

In [4]:
df.head()

Unnamed: 0,Date,District,Latitude,Longitude,Precip,Pressure,Humidity_2m,RH_2m,Temp_2m,WetBulbTemp_2m,...,TempRange_2m,EarthSkinTemp,WindSpeed_10m,MaxWindSpeed_10m,MinWindSpeed_10m,WindSpeedRange_10m,WindSpeed_50m,MaxWindSpeed_50m,MinWindSpeed_50m,WindSpeedRange_50m
0,1981-01-01,Arghakhanchi,27.9,83.2,0.0,93.51,4.81,45.41,13.89,2.15,...,10.89,11.32,1.89,3.83,0.69,3.14,2.41,4.12,0.73,3.39
1,1981-01-02,Arghakhanchi,27.9,83.2,0.0,93.59,4.94,46.78,13.84,2.54,...,11.17,11.44,1.72,2.6,1.09,1.5,2.25,3.3,0.96,2.34
2,1981-01-03,Arghakhanchi,27.9,83.2,0.03,93.55,5.22,47.91,14.33,3.32,...,9.93,12.24,1.8,2.8,0.48,2.32,2.32,3.54,0.39,3.15
3,1981-01-04,Arghakhanchi,27.9,83.2,0.02,93.49,5.36,50.83,13.82,3.73,...,10.41,12.17,2.18,3.54,1.06,2.49,2.9,4.05,0.93,3.12
4,1981-01-05,Arghakhanchi,27.9,83.2,1.84,93.49,5.84,55.55,13.76,4.93,...,10.53,12.32,1.96,2.7,0.69,2.02,2.74,4.64,0.96,3.68


In [5]:
df['year'] = pd.to_datetime(df['Date']).dt.year
df['month'] = pd.to_datetime(df['Date']).dt.month
df['day'] = pd.to_datetime(df['Date']).dt.day

# convert int_type data into float_type
df['year'] = df['year'].astype(float) 
df['month'] = df['month'].astype(float) 
df['day'] = df['day'].astype(float)

In [6]:
df['Date'] = pd.to_datetime(df['Date'])
df.set_index('Date', inplace=True)

In [7]:
encoder = LabelEncoder()
df['Location'] = encoder.fit_transform(df['District'])

In [8]:
place = df[['District', 'year', 'month', 'day']]
place

Unnamed: 0_level_0,District,year,month,day
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1981-01-01,Arghakhanchi,1981.0,1.0,1.0
1981-01-02,Arghakhanchi,1981.0,1.0,2.0
1981-01-03,Arghakhanchi,1981.0,1.0,3.0
1981-01-04,Arghakhanchi,1981.0,1.0,4.0
1981-01-05,Arghakhanchi,1981.0,1.0,5.0
...,...,...,...,...
2019-12-27,Udayapur,2019.0,12.0,27.0
2019-12-28,Udayapur,2019.0,12.0,28.0
2019-12-29,Udayapur,2019.0,12.0,29.0
2019-12-30,Udayapur,2019.0,12.0,30.0


In [9]:
df = df.drop('District', axis=1)
df

Unnamed: 0_level_0,Latitude,Longitude,Precip,Pressure,Humidity_2m,RH_2m,Temp_2m,WetBulbTemp_2m,MaxTemp_2m,MinTemp_2m,...,MinWindSpeed_10m,WindSpeedRange_10m,WindSpeed_50m,MaxWindSpeed_50m,MinWindSpeed_50m,WindSpeedRange_50m,year,month,day,Location
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-01-01,27.9,83.2,0.00,93.51,4.81,45.41,13.89,2.15,20.82,9.94,...,0.69,3.14,2.41,4.12,0.73,3.39,1981.0,1.0,1.0,0
1981-01-02,27.9,83.2,0.00,93.59,4.94,46.78,13.84,2.54,20.70,9.54,...,1.09,1.50,2.25,3.30,0.96,2.34,1981.0,1.0,2.0,0
1981-01-03,27.9,83.2,0.03,93.55,5.22,47.91,14.33,3.32,20.71,10.78,...,0.48,2.32,2.32,3.54,0.39,3.15,1981.0,1.0,3.0,0
1981-01-04,27.9,83.2,0.02,93.49,5.36,50.83,13.82,3.73,20.43,10.02,...,1.06,2.49,2.90,4.05,0.93,3.12,1981.0,1.0,4.0,0
1981-01-05,27.9,83.2,1.84,93.49,5.84,55.55,13.76,4.93,19.62,9.08,...,0.69,2.02,2.74,4.64,0.96,3.68,1981.0,1.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,26.9,86.5,0.00,95.52,4.72,54.60,11.13,2.17,17.98,6.97,...,1.25,2.94,3.40,4.57,1.37,3.20,2019.0,12.0,27.0,61
2019-12-28,26.9,86.5,0.00,95.73,4.23,50.90,10.56,0.60,18.52,5.34,...,1.07,1.45,3.05,4.51,1.44,3.07,2019.0,12.0,28.0,61
2019-12-29,26.9,86.5,0.00,95.76,4.08,46.09,11.51,0.18,19.34,7.04,...,0.85,2.17,2.97,4.12,0.94,3.18,2019.0,12.0,29.0,61
2019-12-30,26.9,86.5,0.00,95.84,4.44,47.07,12.47,1.32,20.03,7.83,...,0.27,1.67,2.00,3.22,0.15,3.06,2019.0,12.0,30.0,61


In [10]:
got_para = ['Longitude', 'Latitude', 'Temp_2m', 'MaxTemp_2m', 'MinTemp_2m', 'Pressure', 'Humidity_2m', 'WindSpeed_10m']


In [11]:
df['pressure'] = df['Pressure']
df = df.assign()
df.loc[:,'Pressure'] = df['pressure']*100
df = df.drop('pressure', axis=1)
df

Unnamed: 0_level_0,Latitude,Longitude,Precip,Pressure,Humidity_2m,RH_2m,Temp_2m,WetBulbTemp_2m,MaxTemp_2m,MinTemp_2m,...,MinWindSpeed_10m,WindSpeedRange_10m,WindSpeed_50m,MaxWindSpeed_50m,MinWindSpeed_50m,WindSpeedRange_50m,year,month,day,Location
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-01-01,27.9,83.2,0.00,9351.0,4.81,45.41,13.89,2.15,20.82,9.94,...,0.69,3.14,2.41,4.12,0.73,3.39,1981.0,1.0,1.0,0
1981-01-02,27.9,83.2,0.00,9359.0,4.94,46.78,13.84,2.54,20.70,9.54,...,1.09,1.50,2.25,3.30,0.96,2.34,1981.0,1.0,2.0,0
1981-01-03,27.9,83.2,0.03,9355.0,5.22,47.91,14.33,3.32,20.71,10.78,...,0.48,2.32,2.32,3.54,0.39,3.15,1981.0,1.0,3.0,0
1981-01-04,27.9,83.2,0.02,9349.0,5.36,50.83,13.82,3.73,20.43,10.02,...,1.06,2.49,2.90,4.05,0.93,3.12,1981.0,1.0,4.0,0
1981-01-05,27.9,83.2,1.84,9349.0,5.84,55.55,13.76,4.93,19.62,9.08,...,0.69,2.02,2.74,4.64,0.96,3.68,1981.0,1.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,26.9,86.5,0.00,9552.0,4.72,54.60,11.13,2.17,17.98,6.97,...,1.25,2.94,3.40,4.57,1.37,3.20,2019.0,12.0,27.0,61
2019-12-28,26.9,86.5,0.00,9573.0,4.23,50.90,10.56,0.60,18.52,5.34,...,1.07,1.45,3.05,4.51,1.44,3.07,2019.0,12.0,28.0,61
2019-12-29,26.9,86.5,0.00,9576.0,4.08,46.09,11.51,0.18,19.34,7.04,...,0.85,2.17,2.97,4.12,0.94,3.18,2019.0,12.0,29.0,61
2019-12-30,26.9,86.5,0.00,9584.0,4.44,47.07,12.47,1.32,20.03,7.83,...,0.27,1.67,2.00,3.22,0.15,3.06,2019.0,12.0,30.0,61


In [12]:
use_df = df[got_para]
use_df

Unnamed: 0_level_0,Longitude,Latitude,Temp_2m,MaxTemp_2m,MinTemp_2m,Pressure,Humidity_2m,WindSpeed_10m
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-01-01,83.2,27.9,13.89,20.82,9.94,9351.0,4.81,1.89
1981-01-02,83.2,27.9,13.84,20.70,9.54,9359.0,4.94,1.72
1981-01-03,83.2,27.9,14.33,20.71,10.78,9355.0,5.22,1.80
1981-01-04,83.2,27.9,13.82,20.43,10.02,9349.0,5.36,2.18
1981-01-05,83.2,27.9,13.76,19.62,9.08,9349.0,5.84,1.96
...,...,...,...,...,...,...,...,...
2019-12-27,86.5,26.9,11.13,17.98,6.97,9552.0,4.72,2.38
2019-12-28,86.5,26.9,10.56,18.52,5.34,9573.0,4.23,2.03
2019-12-29,86.5,26.9,11.51,19.34,7.04,9576.0,4.08,2.01
2019-12-30,86.5,26.9,12.47,20.03,7.83,9584.0,4.44,1.44


In [13]:
use_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 883128 entries, 1981-01-01 to 2019-12-31
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Longitude      883128 non-null  float64
 1   Latitude       883128 non-null  float64
 2   Temp_2m        883128 non-null  float64
 3   MaxTemp_2m     883128 non-null  float64
 4   MinTemp_2m     883128 non-null  float64
 5   Pressure       883128 non-null  float64
 6   Humidity_2m    883128 non-null  float64
 7   WindSpeed_10m  883128 non-null  float64
dtypes: float64(8)
memory usage: 60.6 MB


In [14]:
df['pressure'] = df['Pressure']
df = df.assign()
df.loc[:,'Pressure'] = df['pressure']*100
df = df.drop('pressure', axis=1)
df

Unnamed: 0_level_0,Latitude,Longitude,Precip,Pressure,Humidity_2m,RH_2m,Temp_2m,WetBulbTemp_2m,MaxTemp_2m,MinTemp_2m,...,MinWindSpeed_10m,WindSpeedRange_10m,WindSpeed_50m,MaxWindSpeed_50m,MinWindSpeed_50m,WindSpeedRange_50m,year,month,day,Location
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-01-01,27.9,83.2,0.00,935100.0,4.81,45.41,13.89,2.15,20.82,9.94,...,0.69,3.14,2.41,4.12,0.73,3.39,1981.0,1.0,1.0,0
1981-01-02,27.9,83.2,0.00,935900.0,4.94,46.78,13.84,2.54,20.70,9.54,...,1.09,1.50,2.25,3.30,0.96,2.34,1981.0,1.0,2.0,0
1981-01-03,27.9,83.2,0.03,935500.0,5.22,47.91,14.33,3.32,20.71,10.78,...,0.48,2.32,2.32,3.54,0.39,3.15,1981.0,1.0,3.0,0
1981-01-04,27.9,83.2,0.02,934900.0,5.36,50.83,13.82,3.73,20.43,10.02,...,1.06,2.49,2.90,4.05,0.93,3.12,1981.0,1.0,4.0,0
1981-01-05,27.9,83.2,1.84,934900.0,5.84,55.55,13.76,4.93,19.62,9.08,...,0.69,2.02,2.74,4.64,0.96,3.68,1981.0,1.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-27,26.9,86.5,0.00,955200.0,4.72,54.60,11.13,2.17,17.98,6.97,...,1.25,2.94,3.40,4.57,1.37,3.20,2019.0,12.0,27.0,61
2019-12-28,26.9,86.5,0.00,957300.0,4.23,50.90,10.56,0.60,18.52,5.34,...,1.07,1.45,3.05,4.51,1.44,3.07,2019.0,12.0,28.0,61
2019-12-29,26.9,86.5,0.00,957600.0,4.08,46.09,11.51,0.18,19.34,7.04,...,0.85,2.17,2.97,4.12,0.94,3.18,2019.0,12.0,29.0,61
2019-12-30,26.9,86.5,0.00,958400.0,4.44,47.07,12.47,1.32,20.03,7.83,...,0.27,1.67,2.00,3.22,0.15,3.06,2019.0,12.0,30.0,61


In [15]:
# splitting data in 2 parts using array.split it uses location to split
# cannot split datetime into two samples so resetting index and spliting the table
# use_1, use_2 = np.array_split(use_df, 2)

useful_df = use_df.reset_index()
use_1 = useful_df.loc[:441563,:]
use_2 = useful_df.loc[441564:,:]

use_1.set_index('Date', inplace=True)
use_2.set_index('Date', inplace=True)

In [16]:
use_1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 441564 entries, 1981-01-01 to 2019-12-31
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Longitude      441564 non-null  float64
 1   Latitude       441564 non-null  float64
 2   Temp_2m        441564 non-null  float64
 3   MaxTemp_2m     441564 non-null  float64
 4   MinTemp_2m     441564 non-null  float64
 5   Pressure       441564 non-null  float64
 6   Humidity_2m    441564 non-null  float64
 7   WindSpeed_10m  441564 non-null  float64
dtypes: float64(8)
memory usage: 30.3 MB


In [17]:
X = use_1.drop(['Temp_2m'], axis=1)
y = use_1['Temp_2m']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Initialize Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
mse_1 = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse_1}")

Mean Squared Error: 0.07076477848119754


In [19]:
mae_1 = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae_1}")
r2 = r2_score(y_test, y_pred)
print(f"R-squared Score: {r2}")

Mean Absolute Error: 0.19231054431397415
R-squared Score: 0.9990842827223281


In [20]:
X2 = use_2.drop(['Temp_2m'], axis=1)
y2 = use_2['Temp_2m']
# Split into training and testing sets
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, random_state=42)

# Initialize Random Forest Regressor
model_2 = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model_2.fit(X2_train, y2_train)

# Make predictions
y2_pred = model_2.predict(X2_test)

# Evaluate the model
mse_2 = mean_squared_error(y2_test, y2_pred)
print(f"Mean Squared Error: {mse_2}")

Mean Squared Error: 0.06198272219922323


In [21]:
mae_2 = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae_2}")
r2_2 = r2_score(y_test, y_pred)
print(f"R-squared Score: {r2_2}")

Mean Absolute Error: 0.19231054431397415
R-squared Score: 0.9990842827223281


In [22]:
useful_pred = use_2.reset_index()
compare = useful_pred.drop(['Date', 'Longitude', 'Latitude', 'MaxTemp_2m', 'MinTemp_2m', 'Pressure', 'Humidity_2m', 'WindSpeed_10m'], axis=1)
use2_pred = useful_pred.drop(['Temp_2m','Date'], axis=1)
use2_pred

Unnamed: 0,Longitude,Latitude,MaxTemp_2m,MinTemp_2m,Pressure,Humidity_2m,WindSpeed_10m
0,84.4,28.3,12.15,0.25,7759.0,3.47,1.76
1,84.4,28.3,11.10,1.71,7765.0,3.44,1.65
2,84.4,28.3,9.79,0.76,7760.0,3.87,1.63
3,84.4,28.3,8.74,-0.41,7749.0,3.95,1.81
4,84.4,28.3,7.51,-0.06,7748.0,4.13,1.65
...,...,...,...,...,...,...,...
441559,86.5,26.9,17.98,6.97,9552.0,4.72,2.38
441560,86.5,26.9,18.52,5.34,9573.0,4.23,2.03
441561,86.5,26.9,19.34,7.04,9576.0,4.08,2.01
441562,86.5,26.9,20.03,7.83,9584.0,4.44,1.44


In [23]:
compare.rename(columns={'Temp_2m':'Actual'}, inplace=True)
compare

Unnamed: 0,Actual
0,5.11
1,5.15
2,4.55
3,3.06
4,2.87
...,...
441559,11.13
441560,10.56
441561,11.51
441562,12.47


In [24]:
temp1 = model.predict(pd.DataFrame(use2_pred.loc[2]).T)
print(temp1)

[4.2763]


In [25]:
import random
no = []
for i in range(0,25):
    no.append(random.randrange(0,441564))

x=[]
for i in no:
    x.append(model.predict(pd.DataFrame(use2_pred.loc[i]).T))
    
predicted_temp = [round(float(x),2) for array in x for x in array]
predicted_temp

[17.08,
 11.57,
 26.35,
 6.63,
 10.87,
 18.43,
 12.2,
 25.53,
 17.27,
 9.52,
 18.24,
 19.15,
 19.88,
 26.17,
 28.27,
 28.33,
 16.68,
 7.94,
 8.09,
 27.0,
 17.23,
 11.54,
 18.06,
 28.64,
 28.66]

In [26]:
result = compare.loc[no]
result.reset_index()
result['Predicted'] = predicted_temp
result

Unnamed: 0,Actual,Predicted
182717,17.17,17.08
25921,11.42,11.57
25103,26.31,26.35
146097,6.8,6.63
207333,11.0,10.87
269481,18.63,18.43
386903,12.32,12.2
297214,25.24,25.53
270906,17.21,17.27
319598,10.02,9.52


In [28]:
result = pd.DataFrame(result)

In [None]:
result['Relative_Error'] = abs(result['Predicted'] - result['Actual']) / result['Actual'] * 100

In [31]:
threshold = 5.0
success_count = (result["Relative_Error"] <= threshold).sum()
total = len(result)
success_rate = (success_count / total) * 100

In [33]:
print(f"Success Rate: {success_rate:.2f}%")

Success Rate: 96.00%
