**Importing all the necessary Libraries for the task**

In [2]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
import seaborn as sns
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import make_scorer

**Starting the Exploratory Data Analysis phase**

Importing the data file in CSV format

In [3]:
#the variable data_ad holds the dataframe for Adamawa state data
data_ad = pd.read_csv('data_ad.csv')
print(data_ad.shape)


(33, 12)


Part of the Preprocessing phase(the other part was carried out using MS Excel)

In [4]:
#converting year column to date and index
data_ad['Year'] = pd.to_datetime(data_ad['Year'], format='%Y')
data_ad.set_index(data_ad['Year'].dt.strftime('%Y'), inplace=True)

In [5]:
#seperating features from target variable
#X = data_ad.drop(['yield'], axis=1)  # Features
X = data_ad[['all_sky_par', 're_humidity', 't_max', 'w_speed', 's_pressure']]
#X = data_ad.drop(['yield'], axis=1)
y = data_ad['yield']
#scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# spliting the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.20, random_state=244)

In [6]:
#for training without Feature selection
X1 = data_ad.drop(['yield' , 'Year'], axis=1)
scaler = StandardScaler()
X1_scaled = scaler.fit_transform(X1)
# spliting the data into 80% training and 20% testing
X1_train, X1_test, y_train, y_test = train_test_split(X1_scaled, y, test_size=0.20, random_state=244)

Viewing the training and testing set

In [7]:
#A funtion for calculating the MAPE value
def calculateMape(test_y, pred_y):
    # Convert to NumPy array for element-wise operations
    test_y = np.array(test_y)
    pred_y = np.array(pred_y)

    # Calculate absolute percentage errors
    absolute_percentage_errors = np.abs((test_y - pred_y) / test_y)

    # Replace any potential division by zero with NaN
    absolute_percentage_errors[np.isnan(absolute_percentage_errors)] = 0

    # Calculate Mean Absolute Percentage Error (MAPE)
    mape = np.mean(absolute_percentage_errors) * 100

    return mape

**Training the Linear Rigression**

In [8]:
# Training the model
model = LinearRegression()

model.fit(X_train, y_train)

# Testing the model
pred = model.predict(X_test)

mse = mean_squared_error(y_test, pred)
print("Mean Squared Error:", mse)

#Creating a dataframe 'Model_perf' to tabulate the values for each of the matrices used
Model_perf = pd.DataFrame(columns=['Model_Name','MSE','R2_Score', 'MAE', 'RMSE', 'MAPE'])

# Evaluating the model
LR_mse = mean_squared_error(y_test, pred)
LR_R2 = r2_score(y_test, pred)
LR_mae = mean_absolute_error(y_test, pred)
LR_rmse = sqrt(LR_mse)
LR_mape = calculateMape(y_test, pred)

#Adding a row for the evaluation of Linear regression model
new_row = {'Model_Name':'Linear Regression','MSE':LR_mse , 'R2_Score': LR_R2, 'MAE' :LR_mae, 'RMSE' :LR_rmse, 'MAPE':LR_mape}
Model_perf.loc[0] = new_row

Mean Squared Error: 57816496.03888893


In [9]:
#WITHOUT FEATURE SELECTION
model = LinearRegression()

# Training the model
model.fit(X1_train, y_train)

# Making predictions on the testing set
pred1 = model.predict(X1_test)

# Evaluating the model
mse = mean_squared_error(y_test, pred1)
print("Mean Squared Error:", mse)

#Creating a dataframe 'Model_perf' to tabulate the values for each of the matrices used
Model_perf2 = pd.DataFrame(columns=['Model_Name','MSE','R2_Score', 'MAE', 'RMSE', 'MAPE'])

#Evaluating the model
LR_mse = mean_squared_error(y_test, pred1)
LR_R2 = r2_score(y_test, pred1)
LR_mae = mean_absolute_error(y_test, pred1)
LR_rmse = sqrt(LR_mse)
LR_mape = calculateMape(y_test, pred1)

#Adding a row for the evaluation of Linear regression model
new_row = {'Model_Name':'Linear Regression','MSE':LR_mse , 'R2_Score': LR_R2, 'MAE' : LR_mae, 'RMSE' :LR_rmse, 'MAPE':LR_mape}
Model_perf2.loc[0] = new_row

Mean Squared Error: 325212023.72414726


In [34]:
# Performing 5-fold cross-validation using Linear Regression
k = 3 
kf = KFold(n_splits=k, shuffle=True, random_state=50)

#for MSE metric
cv_scores = cross_val_score(model, X1_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_mse = -cv_scores
print(f"Cross-validation RMSE scores: {cv_mse}")
print(f"Average MSE: {np.mean(cv_mse)}")


#for R2 Metric
cv_r2_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='r2')
average_r2 = np.mean(cv_r2_scores) 
print(f"Cross validation R² Score : {[round(score, 4) for score in cv_r2_scores]}")
print(f"Average R² : {average_r2:.2f}")

#for MAE
cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_mean_absolute_error')

mae_scores = -cv_scores  # Converting scores to positive

print("Cross-validation MAE scores:", mae_scores)
print("Average MAE:", np.mean(mae_scores))

#for Mape
mape_scorer = make_scorer(calculateMape, greater_is_better=False)
cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring=mape_scorer)
mape_scores = -cv_scores

print("Cross-validation MAPE scores:", cv_scores)
print("Average MAPE:", np.mean(mape_scores))

#for RMSE
cv_scores = cross_val_score(model, X_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE scores: {cv_rmse}")
print(f"Average RMSE: {np.mean(cv_rmse)}")

Cross_val_perf = pd.DataFrame(columns=['Model_Name','MSE','R2_Score', 'MAE', 'RMSE', 'MAPE'])

new_row = {'Model_Name':'Linear Regression','MSE':np.mean(cv_mse) , 'R2_Score': average_r2, 'MAE' : np.mean(mae_scores), 'RMSE' :np.mean(cv_rmse), 'MAPE':np.mean(mape_scores)}
Cross_val_perf.loc[0] = new_row

Cross-validation RMSE scores: [5.26885369e+08 4.53958094e+08 2.95247971e+08]
Average MSE: 425363811.5005476
Cross validation R² Score : [0.6473, 0.3879, 0.635]
Average R² : 0.56
Cross-validation MAE scores: [ 9256.54785184 12107.81930453 11374.07185139]
Average MAE: 10912.813002588446
Cross-validation MAPE scores: [-4.26771143 -5.66954444 -4.96658082]
Average MAPE: 4.967945563854138
Cross-validation RMSE scores: [11177.01007448 17671.63153315 16565.8710261 ]
Average RMSE: 15138.170877909033


In [11]:
### XGBoost WITHOUT FEATURE SELECTION 

from xgboost import XGBRegressor
XG_boost = XGBRegressor(max_depth = 3,n_estimators = 300 )

XG_boost.fit(X1_train , y_train)
y1_pred = XG_boost.predict(X1_test)
XG_mse = mean_squared_error(y_test,y1_pred)
XG_R2 = r2_score(y_test,y1_pred)
XG_mae = mean_absolute_error(y_test, y1_pred)
XG_rmse = mean_squared_error(y_test, y1_pred, squared=False)
XG_mape = calculateMape(y_test, y1_pred)


new_row1 = {'Model_Name':'XGBoost','MSE':XG_mse , 'R2_Score': XG_R2, 'MAE' :XG_mae, 'RMSE' :XG_rmse, 'MAPE' :XG_mape}
Model_perf2.loc[1] = new_row1

"XGBoost Regressor"

In [10]:
### Trying the XGB regressor now

from xgboost import XGBRegressor
XG_boost = XGBRegressor(max_depth = 3,n_estimators = 300 )

XG_boost.fit(X_train , y_train)
y_pred = XG_boost.predict(X_test)
XG_mse = mean_squared_error(y_test,y_pred)
XG_R2 = r2_score(y_test,y_pred)
XG_mae = mean_absolute_error(y_test, y_pred)
XG_rmse = mean_squared_error(y_test, y_pred, squared=False)
XG_mape = calculateMape(y_test, y_pred)


new_row1 = {'Model_Name':'XGBoost','MSE':XG_mse , 'R2_Score': XG_R2, 'MAE' :XG_mae, 'RMSE' :XG_rmse, 'MAPE' :XG_mape}
Model_perf.loc[1] = new_row1

In [35]:
# Performing 3-fold cross-validation using XGBoost
k = 3 
kf = KFold(n_splits=k, shuffle=True, random_state=50)

#for MSE metric
cv_scores = cross_val_score(XG_boost, X_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_mse = -cv_scores
print(f"Cross-validation MSE scores: {cv_mse}")
print(f"Average MSE: {np.mean(cv_mse)}")


#for R2 Metric
cv_r2_scores = cross_val_score(XG_boost, X_scaled, y, cv=kf, scoring='r2')
average_r2 = np.mean(cv_r2_scores) 
print(f"Cross validation R² Score : {[round(score, 4) for score in cv_r2_scores]}")
print(f"Average R² : {average_r2:.2f}")

#for MAE
cv_scores = cross_val_score(XG_boost, X_scaled, y, cv=kf, scoring='neg_mean_absolute_error')

mae_scores = -cv_scores  # Converting scores to positive

print("Cross-validation MAE scores:", mae_scores)
print("Average MAE:", np.mean(mae_scores))
#print("Standard deviation of MAE:", np.std(mae_scores))


#for Mape
mape_scorer = make_scorer(calculateMape, greater_is_better=False)
cv_scores = cross_val_score(XG_boost, X_scaled, y, cv=kf, scoring=mape_scorer)
mape_scores = -cv_scores

print("Cross-validation MAPE scores:", cv_scores)
print("Average MAPE:", np.mean(mape_scores))


#for RMSE
cv_scores = cross_val_score(XG_boost, X_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE scores: {cv_rmse}")
print(f"Average RMSE: {np.mean(cv_rmse)}")

new_row2 = {'Model_Name':'XGBoost','MSE':np.mean(cv_mse) , 'R2_Score': average_r2, 'MAE' : np.mean(mae_scores), 'RMSE' :np.mean(cv_rmse), 'MAPE':np.mean(mape_scores)}
Cross_val_perf.loc[1] = new_row2

Cross-validation MSE scores: [2.13770846e+08 7.23013756e+08 4.85279134e+08]
Average MSE: 474021245.3029347
Cross validation R² Score : [0.3965, -0.4172, 0.3546]
Average R² : 0.11
Cross-validation MAE scores: [ 9789.44681818 19199.26153409 12779.51181818]
Average MAE: 13922.740056818182
Cross-validation MAPE scores: [-4.38941596 -9.10723379 -5.23365739]
Average MAPE: 6.24343571267525
Cross-validation RMSE scores: [14620.90440447 26888.91511741 22029.05204753]
Average RMSE: 21179.62385647095


In [13]:
# WTIHOUT FEATURE SELECTION Initialize KNN Regressor
knn = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

# Train the model
knn.fit(X1_train, y_train)

# Make predictions
y1_predict = knn.predict(X1_test)

#Evaluate 
KNN_mse = mean_squared_error(y_test,y1_predict)
KNN_R2 = r2_score(y_test,y1_predict)
KNN_mae = mean_absolute_error(y_test, y1_predict)
KNN_rmse = mean_squared_error(y_test, y1_predict, squared=False)
KNN_mape = calculateMape(y_test, y1_predict)


new_row2 = {'Model_Name':'KNN','MSE':KNN_mse , 'R2_Score': KNN_R2, 'MAE' :KNN_mae, 'RMSE' :KNN_rmse, 'MAPE' :KNN_mape}
Model_perf2.loc[2] = new_row2

In [12]:
# Initialize KNN Regressor
knn = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors

# Train the model
knn.fit(X_train, y_train)

# Make predictions
y_predict = knn.predict(X_test)

#Evaluate 
KNN_mse = mean_squared_error(y_test,y_predict)
KNN_R2 = r2_score(y_test,y_predict)
KNN_mae = mean_absolute_error(y_test, y_predict)
KNN_rmse = mean_squared_error(y_test, y_predict, squared=False)
KNN_mape = calculateMape(y_test, y_predict)


new_row2 = {'Model_Name':'KNN','MSE':KNN_mse , 'R2_Score': KNN_R2, 'MAE' :KNN_mae, 'RMSE' :KNN_rmse, 'MAPE' :KNN_mape}
Model_perf.loc[2] = new_row2

In [36]:
# Performing 5-fold cross-validation using K Nearest Neighbours
k = 3 
kf = KFold(n_splits=k, shuffle=True, random_state=50)

#for MSE metric
cv_scores = cross_val_score(knn, X1_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_mse = -cv_scores
print(f"Cross-validation RMSE scores: {cv_mse}")
print(f"Average MSE: {np.mean(cv_mse)}")


#for R2 Metric
cv_r2_scores = cross_val_score(knn, X1_scaled, y, cv=kf, scoring='r2')
average_r2 = np.mean(cv_r2_scores) 
print(f"Cross validation R² Score : {[round(score, 4) for score in cv_r2_scores]}")
print(f"Average R² : {average_r2:.2f}")

#for MAE
cv_scores = cross_val_score(knn, X1_scaled, y, cv=kf, scoring='neg_mean_absolute_error')

mae_scores = -cv_scores  # Converting scores to positive

print("Cross-validation MAE scores:", mae_scores)
print("Average MAE:", np.mean(mae_scores))
#print("Standard deviation of MAE:", np.std(mae_scores))


#for Mape
mape_scorer = make_scorer(calculateMape, greater_is_better=False)
cv_scores = cross_val_score(knn, X1_scaled, y, cv=kf, scoring=mape_scorer)
mape_scores = -cv_scores

print("Cross-validation MAPE scores:", cv_scores)
print("Average MAPE:", np.mean(mape_scores))


#for RMSE
cv_scores = cross_val_score(knn, X1_scaled, y, cv=kf, scoring='neg_mean_squared_error')
cv_rmse = np.sqrt(-cv_scores)
print(f"Cross-validation RMSE scores: {cv_rmse}")
print(f"Average RMSE: {np.mean(cv_rmse)}")

new_row3 = {'Model_Name':'KNN','MSE':np.mean(cv_mse) , 'R2_Score': average_r2, 'MAE' : np.mean(mae_scores), 'RMSE' :np.mean(cv_rmse), 'MAPE':np.mean(mape_scores)}
Cross_val_perf.loc[2] = new_row3

Cross-validation RMSE scores: [1.88815365e+08 3.25514404e+08 3.90163933e+08]
Average MSE: 301497900.91303104
Cross validation R² Score : [0.467, 0.3619, 0.4811]
Average R² : 0.44
Cross-validation MAE scores: [10994.13945455 13632.92090909 15426.76181818]
Average MAE: 13351.274060606062
Cross-validation MAPE scores: [-5.03676603 -6.23899604 -6.66860251]
Average MAPE: 5.981454859098181
Cross-validation RMSE scores: [13741.0103493  18042.01773581 19752.56776574]
Average RMSE: 17178.53195028377


In [37]:
Cross_val_perf

Unnamed: 0,Model_Name,MSE,R2_Score,MAE,RMSE,MAPE
0,Linear Regression,425363800.0,0.556743,10912.813003,15138.170878,4.967946
1,XGBoost,474021200.0,0.111303,13922.740057,21179.623856,6.243436
2,KNN,301497900.0,0.436671,13351.274061,17178.53195,5.981455


In [14]:
Model_perf2

Unnamed: 0,Model_Name,MSE,R2_Score,MAE,RMSE,MAPE
0,Linear Regression,325212000.0,0.455188,16748.488682,18033.635899,7.648347
1,XGBoost,289833900.0,0.514455,11570.315714,17024.50823,4.983372
2,KNN,211392700.0,0.645864,10719.874286,14539.349596,4.57997


In [15]:
Model_perf

Unnamed: 0,Model_Name,MSE,R2_Score,MAE,RMSE,MAPE
0,Linear Regression,57816500.0,0.903143,6211.029384,7603.715936,2.804602
1,XGBoost,341611900.0,0.427714,11623.070179,18482.746754,4.965539
2,KNN,88816110.0,0.851211,6534.076571,9424.22978,2.819843
