# Machine Learning Using Logistic Regression, SVM and Random Forest Models

## Import necessary libraries

In [100]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## Load Dataset

In [101]:
import os
# Load dataset
dataset_path = r'..\..\datasets\A_Flood_Dataset.csv'
rainfall_data = pd.read_csv(dataset_path)

In [102]:
#Display the first few rows of the dataset
print("Dataset Preview:")
print(rainfall_data.head())

Dataset Preview:
   Sl Station_Names  Year  Month  Max_Temp  Min_Temp  Rainfall  \
0   0       Barisal  1949      1      29.4      12.3       0.0   
1   1       Barisal  1949      2      33.9      15.2       9.0   
2   2       Barisal  1949      3      36.7      20.2       8.0   
3   3       Barisal  1949      4      33.9      23.9     140.0   
4   4       Barisal  1949      5      35.6      25.0     217.0   

   Relative_Humidity  Wind_Speed  Cloud_Coverage  Bright_Sunshine  \
0               68.0    0.453704             0.6         7.831915   
1               63.0    0.659259             0.9         8.314894   
2               59.0    1.085185             1.5         8.131915   
3               71.0    1.772222             3.9         8.219149   
4               76.0    1.703704             4.1         7.046809   

   Station_Number     X_COR     Y_COR  LATITUDE  LONGITUDE  ALT   Period  \
0           41950  536809.8  510151.9      22.7      90.36    4  1949.01   
1           41950  

In [103]:
# Basic information
print("\nDataset Info:")
print(rainfall_data.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20544 entries, 0 to 20543
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Sl                 20544 non-null  int64  
 1   Station_Names      20544 non-null  object 
 2   Year               20544 non-null  int64  
 3   Month              20544 non-null  int64  
 4   Max_Temp           20544 non-null  float64
 5   Min_Temp           20544 non-null  float64
 6   Rainfall           20544 non-null  float64
 7   Relative_Humidity  20544 non-null  float64
 8   Wind_Speed         20544 non-null  float64
 9   Cloud_Coverage     20544 non-null  float64
 10  Bright_Sunshine    20544 non-null  float64
 11  Station_Number     20544 non-null  int64  
 12  X_COR              20544 non-null  float64
 13  Y_COR              20544 non-null  float64
 14  LATITUDE           20544 non-null  float64
 15  LONGITUDE          20544 non-null  float64
 16  ALT    

In [104]:
# Summary statistics
print("\nStatistical Summary:")
print(rainfall_data.describe())

# Check for missing values
print("\nMissing Values:")
print(rainfall_data.isnull().sum())


Statistical Summary:
                 Sl          Year         Month      Max_Temp      Min_Temp  \
count  20544.000000  20544.000000  20544.000000  20544.000000  20544.000000   
mean   10271.500000   1985.332944      6.500000     33.450739     21.166872   
std     5930.686301     17.610799      3.452137      2.956401      4.949587   
min        0.000000   1948.000000      1.000000     21.600000      6.200000   
25%     5135.750000   1972.000000      3.750000     31.700000     16.900000   
50%    10271.500000   1987.000000      6.500000     33.900000     23.400000   
75%    15407.250000   2000.000000      9.250000     35.400000     25.400000   
max    20543.000000   2013.000000     12.000000     44.000000     28.100000   

           Rainfall  Relative_Humidity    Wind_Speed  Cloud_Coverage  \
count  20544.000000       20544.000000  20544.000000    20544.000000   
mean     198.776621          79.497375      1.415049        3.485827   
std      240.693197           7.667925      1.0424

In [105]:
# Value counts for target variable (if applicable)
if 'Flood' in rainfall_data.columns:
    print("\nFlood value counts:")
    print(rainfall_data['Flood'].value_counts())

## Data Cleaning and Encoding

In [106]:
# Normalize all column names to lowercase and strip whitespace/punctuation for consistency
rainfall_data.columns = [col.lower().strip().replace('?', '') for col in rainfall_data.columns]

# Accept both 'flood' and 'flood?' as target column
target_col = None
if 'flood' in rainfall_data.columns:
    target_col = 'flood'
elif 'flood?' in [col + '?' for col in rainfall_data.columns]:
    target_col = 'flood?'
else:
    raise ValueError("The dataset does not contain a 'Flood' or 'Flood?' column for the target variable.")

# Encode the target column
rainfall_data[target_col] = (
    rainfall_data[target_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({'yes': 1, 'no': 0, '': 0, 'nan': 0})
    .astype(float)  # Convert to float first to handle '1.0', '0.0'
    .astype(int)    # Then to int
)

# If you want to always use 'flood' as the column name for the rest of your code:
if target_col != 'flood':
    rainfall_data.rename(columns={target_col: 'flood'}, inplace=True)
    target_col = 'flood'

In [107]:
# Encode non-numeric features (excluding target)
non_numeric_columns = rainfall_data.select_dtypes(include=['object']).columns.drop('flood', errors='ignore')
if len(non_numeric_columns) > 0:
    print(f"\nEncoding non-numeric columns: {non_numeric_columns.tolist()}")
    for column in non_numeric_columns:
        rainfall_data[column] = LabelEncoder().fit_transform(rainfall_data[column].astype(str))
else:
    print("\nNo non-numeric columns to encode.")


Encoding non-numeric columns: ['station_names']


In [108]:
# Fill missing values with median (safer for numeric data)
rainfall_data.fillna(rainfall_data.median(numeric_only=True), inplace=True)

### Sort by Station_Names

In [109]:
lag = 1

# 1. Sort data by station and date/order
rainfall_data = rainfall_data.sort_values(by=['station_names', 'year', 'month', 'sl'])

# 2. Group by Station_Names
grouped = rainfall_data.groupby('station_names')

# 3. Shift the 'flood' column (or other features) within each group
# For example, to predict next day's flood based on today's data:
rainfall_data['flood_next_day'] = grouped['flood'].shift(-lag)

# 4. Drop rows with NaN in 'Flood_next_day' (last row of each station group)
rainfall_data = rainfall_data.dropna(subset=['flood_next_day'])

# Now, df contains data with the lagged target, correctly aligned within each station

In [110]:
pd.DataFrame(grouped).shape, rainfall_data.shape

((33, 2), (20511, 20))

## Data Splitting

In [111]:
import pandas as pd

X = rainfall_data.drop(columns=['sl', 'flood', 'flood_next_day', 'station_names', 'station_number', 'x_cor', 'y_cor', 'period'], axis=1)
y = rainfall_data['flood_next_day']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

## Define Models

In [112]:
# Define pipelines for Logistic Regression
logistic_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(C=0.5, penalty='l2', solver='liblinear', class_weight='balanced', random_state=42))
])

# Define pipeline for SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LinearSVC(C=1.0, max_iter=2000, dual=False, class_weight='balanced', random_state=42))
])

# Define pipeline for Random Forest
rf_pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])


## Model Evaluation

In [113]:
# Train and evaluate Logistic Regression model
try:
    logistic_pipeline.fit(X_train, y_train)
    y_pred_lr = logistic_pipeline.predict(X_val)
    print("\nLogistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_lr))
    print("\nLogistic Regression Validation Classification Report:")
    print(classification_report(y_val, y_pred_lr))
    print("Logistic Regression Predictions:", y_pred_lr[:10])

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_results_lr = cross_validate(logistic_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_lr_acc = cv_results_lr['test_accuracy']
    cv_scores_lr_auc = cv_results_lr['test_roc_auc']

    print("Logistic Regression CV Accuracy: %.3f ± %.3f" % (cv_scores_lr_acc.mean(), cv_scores_lr_acc.std()))
    print("Logistic Regression CV ROC-AUC: %.3f ± %.3f" % (cv_scores_lr_auc.mean(), cv_scores_lr_auc.std()))

except Exception as e:
    print(f"Error training the Logistic Regression model: {e}")



Logistic Regression Validation Accuracy: 0.7930782354374848

Logistic Regression Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.96      0.77      0.86      3281
         1.0       0.49      0.88      0.63       822

    accuracy                           0.79      4103
   macro avg       0.73      0.82      0.74      4103
weighted avg       0.87      0.79      0.81      4103

Logistic Regression Predictions: [1. 1. 1. 0. 0. 0. 0. 0. 0. 0.]
Logistic Regression CV Accuracy: 0.791 ± 0.005
Logistic Regression CV ROC-AUC: 0.892 ± 0.007


In [114]:
# Train and evaluate SVM model
try:
    svm_pipeline.fit(X_train, y_train)
    y_pred_svm = svm_pipeline.predict(X_val)
    print("\nSVM Validation Accuracy:", accuracy_score(y_val, y_pred_svm))
    print("\nSVM Validation Classification Report:")
    print(classification_report(y_val, y_pred_svm))
    print("SVM Predictions:", y_pred_svm[:10])

    # Cross-validation
    cv_results_svm = cross_validate(svm_pipeline, X, y, cv=skf, scoring=['accuracy'], n_jobs=-1)
    cv_scores_svm = cv_results_svm['test_accuracy']

    print("SVM Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_svm.mean(), cv_scores_svm.std()))

except Exception as e:
    print(f"Error training the SVM model: {e}")



SVM Validation Accuracy: 0.7655374116500122

SVM Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      0.73      0.83      3281
         1.0       0.46      0.91      0.61       822

    accuracy                           0.77      4103
   macro avg       0.71      0.82      0.72      4103
weighted avg       0.87      0.77      0.79      4103

SVM Predictions: [1. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
SVM Cross-Validation Accuracy: 0.769 ± 0.004


In [115]:
# Train and evaluate Random Forest model
try:
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_val)
    print("\nRandom Forest Validation Accuracy:", accuracy_score(y_val, y_pred_rf))
    print("\nRandom Forest Validation Classification Report:")
    print(classification_report(y_val, y_pred_rf))
    print("Random Forest Predictions:", y_pred_rf[:10])

    # Cross-validation
    cv_results_rf = cross_validate(rf_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_rf = cv_results_rf['test_accuracy']

    print("Random Forest Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_rf.mean(), cv_scores_rf.std()))

except Exception as e:
    print(f"Error training the Random Forest model: {e}")



Random Forest Validation Accuracy: 0.8720448452351938

Random Forest Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92      3281
         1.0       0.73      0.58      0.65       822

    accuracy                           0.87      4103
   macro avg       0.81      0.76      0.78      4103
weighted avg       0.87      0.87      0.87      4103

Random Forest Predictions: [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Random Forest Cross-Validation Accuracy: 0.886 ± 0.003


## Comparison Summary

In [116]:
# Store model results
results = {
    "Logistic Regression": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_lr),
        "Cross-Validation Accuracy Mean": cv_scores_lr_acc.mean(),
        "Cross-Validation Accuracy Std": cv_scores_lr_acc.std()
    },
    "SVM": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_svm),
        "Cross-Validation Accuracy Mean": cv_scores_svm.mean(),
        "Cross-Validation Accuracy Std": cv_scores_svm.std()
    },
    "Random Forest": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_rf),
        "Cross-Validation Accuracy Mean": cv_scores_rf.mean(),
        "Cross-Validation Accuracy Std": cv_scores_rf.std()
    }
}

# Display comparison results
print("\n=== Model Comparison Summary ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

# Determine best model based on average CV accuracy
best_model = max(results.items(), key=lambda item: item[1]["Cross-Validation Accuracy Mean"])
best_model_name = best_model[0]



=== Model Comparison Summary ===

Logistic Regression:
  Validation Accuracy: 0.7931
  Cross-Validation Accuracy Mean: 0.7909
  Cross-Validation Accuracy Std: 0.0049

SVM:
  Validation Accuracy: 0.7655
  Cross-Validation Accuracy Mean: 0.7686
  Cross-Validation Accuracy Std: 0.0037

Random Forest:
  Validation Accuracy: 0.8720
  Cross-Validation Accuracy Mean: 0.8862
  Cross-Validation Accuracy Std: 0.0030


## Select Best Model

In [117]:
best_model_name = max(results.items(), key=lambda m: m[1]["Cross-Validation Accuracy Mean"])[0]
print(f"\nBest Model Selected: {best_model_name}")


Best Model Selected: Random Forest


In [118]:
# Retrain the best model on the entire dataset
if best_model_name == "Logistic Regression":
    best_pipeline = logistic_pipeline
elif best_model_name == "SVM":
    best_pipeline = svm_pipeline
elif best_model_name == "Random Forest":
    best_pipeline = rf_pipeline

best_pipeline.fit(X, y)

In [119]:
import joblib

# Assuming your model is named 'model'
model = best_pipeline

# Save the model to a file
os.makedirs('trained_model', exist_ok=True)  # Create folder if it doesn't exist
joblib.dump(model, os.path.join('trained_model', 'ML_verA_trained.pkl'))

['trained_model\\ML_verA_trained.pkl']

## Final prediction using latest row (simulate next month)

In [123]:
X.describe()

Unnamed: 0,year,month,max_temp,min_temp,rainfall,relative_humidity,wind_speed,cloud_coverage,bright_sunshine,latitude,longitude,alt
count,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0,20511.0
mean,1985.288431,6.491151,33.456271,21.177647,199.095749,79.497103,1.414556,3.489909,6.420085,23.326847,90.493087,13.359514
std,17.58993,3.44785,2.955371,4.94599,240.755106,7.673207,1.042613,2.082902,1.748348,1.155257,1.108729,13.530111
min,1948.0,1.0,21.6,6.2,0.0,34.0,0.0,0.0,0.0,20.87,88.56,0.0
25%,1972.0,3.0,31.7,17.0,8.0,75.0,0.7,1.6,4.965517,22.64,89.55,4.0
50%,1987.0,6.0,33.9,23.4,112.0,81.0,1.2,3.3,6.8,23.17,90.41,7.0
75%,2000.0,9.0,35.4,25.4,312.0,85.0,1.9,5.5,7.8,24.29,91.46,19.0
max,2013.0,12.0,44.0,28.1,2072.0,97.0,11.2,7.9,11.0,25.72,92.26,63.0


In [125]:
# Assuming the last row of the original dataframe represents the latest available data
latest_data_point = X.iloc[-1].copy()

# You need to provide hypothetical feature values for the next month.
# These values should be in the same order as the columns in X.
# Replace these example values with the values you want to use for prediction.
# Example values - you should adjust these based on your understanding of the data and future forecasts
next_month_features = {
    'year': latest_data_point['year'], # Assuming same year for simplicity, or increment if at end of year
    'month': (latest_data_point['month'] % 12) + 1, # Increment month, wrapping around
    'max_temp': 35.0,  # Example: Slightly higher max temp
    'min_temp': 22.0,  # Example: Slightly higher min temp
    'rainfall': 400.0, # Example: Higher rainfall
    'relative_humidity': 85.0, # Example: Higher humidity
    'wind_speed': 1.5,  # Example: Slightly higher wind speed
    'cloud_coverage': 2.0, # Example: Slightly more cloud coverage
    'bright_sunshine': 4.0, # Example: Slightly less bright sunshine
    'latitude': latest_data_point['latitude'],
    'longitude': latest_data_point['longitude'],
    'alt': latest_data_point['alt'], # Assuming same altitude
}

# Convert the dictionary to a pandas DataFrame with a single row
next_month_data = pd.DataFrame([next_month_features])

# Ensure the columns are in the same order as the training data
next_month_data = next_month_data[X.columns]

# Predict the flood status for the next month using the best model
# 'best_pipeline' was determined and trained on the full data previously
next_month_flood_prediction = best_pipeline.predict(next_month_data)

# Print the prediction
print("\nPrediction for the next month's flood status (0: No Flood, 1: Flood):")
print(next_month_flood_prediction[0])

# You can optionally print the input data used for prediction
print("\nInput features used for next month prediction:")
next_month_data



Prediction for the next month's flood status (0: No Flood, 1: Flood):
0.0

Input features used for next month prediction:


Unnamed: 0,year,month,max_temp,min_temp,rainfall,relative_humidity,wind_speed,cloud_coverage,bright_sunshine,latitude,longitude,alt
0,2013.0,12.0,35.0,22.0,400.0,85.0,1.5,2.0,4.0,20.87,92.26,4.0
