## Reading and visualizing the data

In [1]:
import pandas as pd
# Load the daily summaries; the DATE column is parsed as datetimes and used as the row index
daily_summaries_csv = "1981-2024Daily summaries.csv"
climatedata = pd.read_csv(
    daily_summaries_csv,
    index_col="DATE",
    parse_dates=["DATE"],
)
climatedata

ModuleNotFoundError: No module named 'pandas'

In [None]:
# Summarise the column dtypes and non-null counts of the raw data.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
climatedata.info()

In [None]:
# Work on an independent copy restricted to the variables used in this analysis
core_columns = ["T2M_MAX", "T2M_MIN", "PRECTOTCORR", "QV2M", "WS2M", "GWETTOP"]
Core_weather = climatedata.loc[:, core_columns].copy()

In [None]:
# Give the selected columns readable names (same order as they were selected)
readable_names = [
    "temp_max",
    "temp_min",
    "precip",
    "Humidity",
    "w_speed",
    "s_wetness",
]
Core_weather.columns = readable_names

In [None]:
# Display the working frame to confirm the new column labels
Core_weather

In [None]:
# Fraction of NaN entries in each column.
# According to the documentation, missing readings are stored as the sentinel -999;
# that is an ordinary number, so it is not counted as null by this check.
Core_weather.isna().mean()

In [None]:
# Inspect the dtype of every column
Core_weather.dtypes

# We find that all the data types are numerical

In [None]:
# Check the index to make sure it is the right type
Core_weather.index

In [None]:
# Make sure the index is a pandas DatetimeIndex, which makes date-based manipulation
# and subsetting easier. read_csv was already asked to parse DATE (parse_dates=["DATE"]),
# so this conversion is a safeguard in case that parsing left the index as strings.
Core_weather.index = pd.to_datetime(Core_weather.index)


In [None]:
# Display the index again after the datetime conversion
Core_weather.index


In [None]:
# Subsetting - helps with average monthly or yearly analysis
# Inspect the year component of every index entry
Core_weather.index.year


In [None]:
# Inspect the month component of every index entry
Core_weather.index.month


In [None]:
# Line plot of the daily maximum and minimum temperature series
temperature_columns = ["temp_max", "temp_min"]
Core_weather.loc[:, temperature_columns].plot()

# We find that there are outliers with temperature values -999

In [None]:
# The plot above shows temperatures close to -999, i.e. readings that were missing
# or not recorded on that day. Count how many cells in each column hold that value.
(Core_weather == -999).sum()

In [None]:
import numpy as np
# Turn the -999 "missing" sentinel into NaN so pandas treats it as missing data
Core_weather = Core_weather.replace(to_replace=-999, value=np.nan)

In [None]:
# Fill each gap with the most recent earlier observation (forward fill), then
# back-fill anything still missing at the very start of the record, where no
# earlier value exists. Forward fill alone would leave those leading NaNs in place.
Core_weather = Core_weather.ffill().bfill()

In [None]:
# Re-count the -999 cells per column to confirm the sentinel was replaced
(Core_weather == -999).sum()

In [None]:
# Plot the temperature series again to see the effect of the cleaning
cleaned_temperature_columns = ["temp_max", "temp_min"]
Core_weather.loc[:, cleaned_temperature_columns].plot()

# The plots appear uniform without outliers

In [None]:
# Daily precipitation over the whole record
Core_weather.loc[:, ["precip"]].plot()

In [None]:
# Daily humidity over the whole record
Core_weather.loc[:, ["Humidity"]].plot()

In [None]:
# Daily wind speed over the whole record
Core_weather.loc[:, ["w_speed"]].plot()

In [None]:
# Daily soil wetness over the whole record
Core_weather.loc[:, ["s_wetness"]].plot()

In [None]:
# Yearly summary.
# Adding up a year of daily temperatures, humidity, wind speed or soil wetness does
# not give an interpretable number, so those variables are averaged per year.
# Daily precipitation is kept as a yearly total.
yearly_aggregations = {
    "temp_max": "mean",
    "temp_min": "mean",
    "precip": "sum",
    "Humidity": "mean",
    "w_speed": "mean",
    "s_wetness": "mean",
}
Core_weather.groupby(Core_weather.index.year).agg(yearly_aggregations)

In [None]:
# Seasonal analysis by calendar month.
# Rows from every year are pooled per month, so the mean gives a typical daily value
# for that month. A sum would grow with the number of days and years in the record
# and is not meaningful for temperature, humidity, wind speed or soil wetness.
Core_weather.groupby(Core_weather.index.month).mean()

In [None]:
# Display the working frame again
Core_weather

In [None]:
# Print the column dtypes and non-null counts of the working frame
Core_weather.info()

In [None]:

Core_weather.index = pd.to_datetime(Core_weather.index)

# Variables for which a day-of-year long-term average is computed
climate_variables = ['s_wetness', 'w_speed', 'precip', 'Humidity', 'temp_max', 'temp_min']

# Define thresholds for drought
precipitation_threshold = 0.75  # 75% of the long-term average
soil_moisture_threshold = 0.75  # 75% of the long-term average
temp_max_threshold = 2  # 2°C above the long-term average


def determine_drought(row):
    """Flag drought: 1 when all three drought conditions hold, otherwise 0.

    The conditions are:
      * precipitation below ``precipitation_threshold`` times its long-term average,
      * maximum temperature more than ``temp_max_threshold`` above its long-term average,
      * soil wetness below ``soil_moisture_threshold`` times its long-term average.

    Parameters
    ----------
    row : pandas.Series or pandas.DataFrame
        A single row or a whole frame holding 'precip', 'temp_max' and 's_wetness'
        together with their '<name>_long_term' counterparts.

    Returns
    -------
    Integer 0/1 for a single row, or an integer Series of 0/1 for a frame.
    Comparisons involving NaN are False, so such rows are labelled 0.
    """
    precip_drought = row['precip'] < (precipitation_threshold * row['precip_long_term'])
    temp_max_drought = row['temp_max'] > (row['temp_max_long_term'] + temp_max_threshold)
    soil_moisture_drought = row['s_wetness'] < (soil_moisture_threshold * row['s_wetness_long_term'])

    # Element-wise AND of the three conditions, converted from bool to 0/1
    return (precip_drought & temp_max_drought & soil_moisture_drought).astype(int)


# Compute the long-term average for each variable by day of the year
Core_weather['day_of_year'] = Core_weather.index.dayofyear

# Group by day of year to calculate long-term averages (e.g., average precipitation per day of year across all years)
long_term_avg = Core_weather.groupby('day_of_year')[climate_variables].mean()
long_term_columns = long_term_avg.add_suffix('_long_term')

# Remove long-term columns left over from a previous run so the cell can be re-executed
Core_weather = Core_weather.drop(columns=long_term_columns.columns, errors='ignore')

# Attach the averages with a join on day_of_year. Unlike pd.merge on a column, this
# keeps the original date index and row order of Core_weather.
Core_weather = Core_weather.join(long_term_columns, on='day_of_year')

# Label every row at once (vectorised) instead of calling the function row by row
Core_weather['drought_occurred'] = determine_drought(Core_weather)

# Optional: Check the distribution of drought occurrences
print(Core_weather['drought_occurred'].value_counts())


In [None]:
# Split the labelled frame into model inputs (X) and the binary target (y)
feature_columns = ["temp_max", "temp_min", "precip", "Humidity", "w_speed", "s_wetness"]
X = Core_weather.loc[:, feature_columns]  # Features
y = Core_weather.loc[:, 'drought_occurred']  # Target: 0 = No Drought, 1 = Drought

In [None]:
# Standardize the six weather features (zero mean, unit variance per column)
from sklearn.preprocessing import StandardScaler

columns_to_scale = ["temp_max", "temp_min", "precip", "Humidity", "w_speed", "s_wetness"]
weather_features = Core_weather[columns_to_scale]

scaler = StandardScaler()
scaler.fit(weather_features)
X_transformed = scaler.transform(weather_features)

In [None]:
# Split the data into training and test sets BEFORE scaling.
# Fitting the scaler on all rows lets the mean/variance of the test rows leak into
# training, so the raw features are split first and the scaler is fitted on the
# training rows only.
from sklearn.model_selection import train_test_split
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Re-fit the scaler on the training rows and apply that same transform to both splits.
# `scaler` now holds the training-only fit, which is the one to use for new inputs.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

#To display the size of the training and testing sets
print(f"Training data size: {X_train.shape}")
print(f"Test data size: {X_test.shape}")

In [None]:
# Put the standardized features back into a DataFrame.
# The column names and the row labels of X are both reused so every standardized
# row can be matched to the original observation it came from.
Standardised_data = pd.DataFrame(X_transformed, columns=X.columns, index=X.index)
Standardised_data.head()

In [None]:
# Random Forest Model - Training and Evaluation
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix

# Build a 100-tree forest with a fixed seed and train it in one step
# (fit returns the estimator itself)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the held-out rows
rf_predictions = rf_model.predict(X_test)

# Compute the evaluation artefacts first, then report them
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_confusion = confusion_matrix(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)

print(f"Random Forest Accuracy: {rf_accuracy * 100:.2f}%")

print("Random Forest Confusion Matrix:")
print(rf_confusion)

print("Random Forest Classification Report:")
print(rf_report)



In [None]:
# Hyperparameter tuning of the Random Forest with an exhaustive grid search
from sklearn.model_selection import GridSearchCV

# Candidate values for each tuned hyperparameter
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validated search over the grid, using all available cores
grid_search_rf = GridSearchCV(rf_model, param_grid, cv=3, verbose=2, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Estimator refitted with the best parameter combination found
best_rf_model = grid_search_rf.best_estimator_

# Score the tuned model on the held-out test set
best_rf_predictions = best_rf_model.predict(X_test)
best_rf_accuracy = accuracy_score(y_test, best_rf_predictions)
print(f"Optimized Random Forest Accuracy: {best_rf_accuracy * 100:.2f}%")


In [None]:
# Support Vector Machine (SVM) - Training and Evaluation
from sklearn.svm import SVC

# Linear-kernel SVM with a fixed seed, trained in one step (fit returns the estimator itself)
svm_model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)

# Predict the held-out rows
svm_predictions = svm_model.predict(X_test)

# Compute the evaluation artefacts first, then report them
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_confusion = confusion_matrix(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)

print(f"SVM Accuracy: {svm_accuracy * 100:.2f}%")

print("SVM Confusion Matrix:")
print(svm_confusion)

print("SVM Classification Report:")
print(svm_report)


In [None]:
# Hyperparameter tuning with GridSearchCV for SVM.
# The grid is given as one dictionary per kernel: `gamma` only affects the rbf kernel,
# so crossing it with the linear kernel would refit identical linear models.
svm_param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto']},
]

# Initialize the GridSearchCV for SVM (3-fold cross-validation, all available cores)
grid_search_svm = GridSearchCV(estimator=svm_model, param_grid=svm_param_grid, cv=3, verbose=2, n_jobs=-1)

# Fit the grid search to the data
grid_search_svm.fit(X_train, y_train)

# Estimator refitted with the best parameter combination found
best_svm_model = grid_search_svm.best_estimator_

# Evaluate the best model from grid search on the held-out test set
best_svm_predictions = best_svm_model.predict(X_test)
best_svm_accuracy = accuracy_score(y_test, best_svm_predictions)
print(f"Optimized SVM Accuracy: {best_svm_accuracy * 100:.2f}%")


In [None]:
# Confusion-matrix heatmaps for both models
import seaborn as sns
import matplotlib.pyplot as plt


def _show_confusion_heatmap(matrix, title):
    """Draw one annotated confusion-matrix heatmap with the given title and show it."""
    class_labels = ['No Drought', 'Drought']
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
    plt.title(title)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


# Random Forest
rf_cm = confusion_matrix(y_test, rf_predictions)
_show_confusion_heatmap(rf_cm, "Random Forest Confusion Matrix")

# SVM
svm_cm = confusion_matrix(y_test, svm_predictions)
_show_confusion_heatmap(svm_cm, "SVM Confusion Matrix")


In [None]:
import geopandas as gpd
from shapely.geometry import Point

# Read the Kenya shapefile into a GeoDataFrame
kenya_boundary_path = 'KEN_adm0.shp'
kenya_shapefile = gpd.read_file(kenya_boundary_path)


In [None]:
# Display the loaded shapefile contents
kenya_shapefile

In [None]:
import matplotlib.pyplot as plt
# Eldama Ravine coordinates
eldama_ravine_coords = [(35.7286, 0.0483)]  # Longitude, Latitude

# Example input features for Eldama Ravine, in the same column order as X.
# The model was trained on standardized features, so the raw example values must be
# passed through the fitted scaler before prediction; feeding them in unscaled would
# give a meaningless result.
example_features = pd.DataFrame([[25, 10, 15, 12, 1.2, 0.8]], columns=X.columns)
predicted_drought = [rf_model.predict(scaler.transform(example_features))[0]]

# Convert coordinates to GeoDataFrame (latitude/longitude columns are derived from
# the coordinate list so the values are defined in one place only)
geometry = [Point(xy) for xy in eldama_ravine_coords]
geo_df = gpd.GeoDataFrame(
    {
        'Latitude': [lat for _, lat in eldama_ravine_coords],
        'Longitude': [lon for lon, _ in eldama_ravine_coords],
        'Predicted_Drought': predicted_drought,
    },
    geometry=geometry,
)

# Plot the map
fig, ax = plt.subplots(figsize=(10, 10))
kenya_shapefile.plot(ax=ax, color='lightgray')  # Plot the Kenya boundary
# Red marker = drought predicted, green marker = no drought predicted
geo_df.plot(ax=ax, color=geo_df['Predicted_Drought'].map({1: 'red', 0: 'green'}), markersize=100)

# Add label for Eldama Ravine
for idx, row in geo_df.iterrows():
    ax.text(row['Longitude'], row['Latitude'], 'Eldama Ravine', fontsize=12, ha='center', color='black')

plt.title('Drought Prediction for Eldama Ravine')
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.show()


In [None]:
#Evaluation and Accuracy of the prediction model

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of both models, computed first and then reported together
rf_accuracy = accuracy_score(y_test, rf_predictions)
svm_accuracy = accuracy_score(y_test, svm_predictions)

print(f"Random Forest Model Accuracy for Eldama Ravine: {rf_accuracy * 100:.2f}%")
print(f"SVM Model Accuracy for Eldama Ravine: {svm_accuracy * 100:.2f}%")


In [None]:
## We can see above that the Random Forest model is more accurate than the Support Vector Machine (SVM) model.
## I therefore applied the Random Forest model in the Flask app.