# Machine Learning Using Logistic Regression, SVM and Random Forest Models

## Import necessary libraries

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## Load Dataset

In [3]:
import os
# Load dataset
dataset_path = r'..\..\datasets\B2_Monthly_Rainfall.csv'
rainfall_data = pd.read_csv(dataset_path)

In [4]:
#Display the first few rows of the dataset
print("Dataset Preview:")
print(rainfall_data.head())

Dataset Preview:
                SUBDIVISIONS  YEAR    JAN   FEB   MAR    APR    MAY    JUN  \
0  ANDAMAN & NICOBAR ISLANDS  2019  173.8   5.8  15.8   35.3  230.9  662.2   
1  ANDAMAN & NICOBAR ISLANDS  2021   42.7  48.8  38.3  150.2  414.1  315.9   
2  ANDAMAN & NICOBAR ISLANDS  2016   72.0  15.8   5.4    2.4  191.1  429.4   
3  ANDAMAN & NICOBAR ISLANDS  2017  228.7   5.6  33.0  108.3  275.8  349.1   
4  ANDAMAN & NICOBAR ISLANDS  2018  167.3  36.2  21.5   90.0  372.5  518.4   

     JUL    AUG    SEP    OCT    NOV    DEC  ANNUAL FLOOD  
0  212.0  860.4  596.8  136.6  131.9   24.5  3086.0   YES  
1  535.3  506.5  667.3  413.1  265.7   95.5  3493.4   YES  
2  301.2  227.7  604.3  287.2  181.7  533.7  2851.9    NO  
3  389.4  414.7  372.8  263.0  205.9  243.7  2890.0    NO  
4  239.1  415.7  395.9  298.9  239.6  318.4  3113.5   YES  


In [5]:
# Basic information
print("\nDataset Info:")
print(rainfall_data.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 230 entries, 0 to 229
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   SUBDIVISIONS  230 non-null    object 
 1   YEAR          230 non-null    int64  
 2   JAN           230 non-null    float64
 3   FEB           230 non-null    float64
 4   MAR           230 non-null    float64
 5   APR           230 non-null    float64
 6   MAY           230 non-null    float64
 7   JUN           230 non-null    float64
 8   JUL           230 non-null    float64
 9   AUG           230 non-null    float64
 10  SEP           230 non-null    float64
 11  OCT           230 non-null    float64
 12  NOV           230 non-null    float64
 13  DEC           230 non-null    float64
 14  ANNUAL        230 non-null    float64
 15  FLOOD         230 non-null    object 
dtypes: float64(13), int64(1), object(2)
memory usage: 28.9+ KB
None


In [6]:
# Summary statistics
print("\nStatistical Summary:")
print(rainfall_data.describe())

# Check for missing values
print("\nMissing Values:")
print(rainfall_data.isnull().sum())


Statistical Summary:
              YEAR         JAN         FEB         MAR         APR  \
count   230.000000  230.000000  230.000000  230.000000  230.000000   
mean   2018.604348   19.849783   16.223696   25.775435   46.352609   
std       1.709204   38.686777   28.797662   39.663047   71.412769   
min    2016.000000    0.000000    0.000000    0.000000    0.000000   
25%    2017.000000    1.125000    0.900000    2.925000    4.400000   
50%    2019.000000    5.550000    5.550000    8.225000   21.350000   
75%    2020.000000   18.437500   16.650000   34.175000   58.225000   
max    2021.000000  255.500000  216.900000  221.800000  441.600000   

              MAY          JUN          JUL          AUG          SEP  \
count  230.000000   230.000000   230.000000   230.000000   230.000000   
mean   102.496957   219.764783   347.205870   287.548913   235.443261   
std    114.154876   203.006312   259.893817   197.733588   157.162515   
min      0.000000    18.600000    18.500000    18.30000

In [7]:
# Value counts for target variable (if applicable)
if 'Flood' in rainfall_data.columns:
    print("\nFlood value counts:")
    print(rainfall_data['Flood'].value_counts())

## Data Cleaning and Encoding

In [8]:
# Normalize all column names to lowercase and strip whitespace/punctuation for consistency
rainfall_data.columns = [col.lower().strip().replace('?', '') for col in rainfall_data.columns]

# Accept both 'flood' and 'flood?' as target column
target_col = None
if 'flood' in rainfall_data.columns:
    target_col = 'flood'
elif 'flood?' in [col + '?' for col in rainfall_data.columns]:
    target_col = 'flood?'
else:
    raise ValueError("The dataset does not contain a 'Flood' or 'Flood?' column for the target variable.")

# Encode the target column
rainfall_data[target_col] = (
    rainfall_data[target_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({'yes': 1, 'no': 0, '': 0, 'nan': 0})
    .astype(float)  # Convert to float first to handle '1.0', '0.0'
    .astype(int)    # Then to int
)

# If you want to always use 'flood' as the column name for the rest of your code:
if target_col != 'flood':
    rainfall_data.rename(columns={target_col: 'flood'}, inplace=True)
    target_col = 'flood'

  .replace({'yes': 1, 'no': 0, '': 0, 'nan': 0})


In [9]:
# Encode non-numeric features (excluding target)
non_numeric_columns = rainfall_data.select_dtypes(include=['object']).columns.drop('flood', errors='ignore')
if len(non_numeric_columns) > 0:
    print(f"\nEncoding non-numeric columns: {non_numeric_columns.tolist()}")
    for column in non_numeric_columns:
        rainfall_data[column] = LabelEncoder().fit_transform(rainfall_data[column].astype(str))
else:
    print("\nNo non-numeric columns to encode.")


Encoding non-numeric columns: ['subdivisions']


In [10]:
# Fill missing values with median (safer for numeric data)
rainfall_data.fillna(rainfall_data.median(numeric_only=True), inplace=True)

### Sort by Station_Names

In [11]:
lag = 1

# 1. Sort data by station and date/order
rainfall_data = rainfall_data.sort_values(by=['subdivisions', 'year'])

# 2. Group by Station_Names
grouped = rainfall_data.groupby('subdivisions')

# 3. Shift the 'flood' column (or other features) within each group
# For example, to predict next day's flood based on today's data:
rainfall_data['flood_next_day'] = grouped['flood'].shift(-lag)

# 4. Drop rows with NaN in 'Flood_next_day' (last row of each station group)
rainfall_data = rainfall_data.dropna(subset=['flood_next_day'])

# Now, df contains data with the lagged target, correctly aligned within each station

In [12]:
pd.DataFrame(grouped).shape, rainfall_data.shape

((77, 2), (153, 17))

## Data Splitting

In [13]:
import pandas as pd

X = rainfall_data.drop(columns=['flood', 'flood_next_day'], axis=1)
y = rainfall_data['flood_next_day']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

## Define Models

In [14]:
# Define pipelines for Logistic Regression
logistic_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(C=0.5, penalty='l2', solver='liblinear', class_weight='balanced', random_state=42))
])

# Define pipeline for SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LinearSVC(C=1.0, max_iter=2000, dual=False, class_weight='balanced', random_state=42))
])

# Define pipeline for Random Forest
rf_pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])


## Model Evaluation

In [15]:
# Train and evaluate Logistic Regression model
try:
    logistic_pipeline.fit(X_train, y_train)
    y_pred_lr = logistic_pipeline.predict(X_val)
    print("\nLogistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_lr))
    print("\nLogistic Regression Validation Classification Report:")
    print(classification_report(y_val, y_pred_lr))
    print("Logistic Regression Predictions:", y_pred_lr[:10])

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_results_lr = cross_validate(logistic_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_lr_acc = cv_results_lr['test_accuracy']
    cv_scores_lr_auc = cv_results_lr['test_roc_auc']

    print("Logistic Regression CV Accuracy: %.3f ± %.3f" % (cv_scores_lr_acc.mean(), cv_scores_lr_acc.std()))
    print("Logistic Regression CV ROC-AUC: %.3f ± %.3f" % (cv_scores_lr_auc.mean(), cv_scores_lr_auc.std()))

except Exception as e:
    print(f"Error training the Logistic Regression model: {e}")



Logistic Regression Validation Accuracy: 0.967741935483871

Logistic Regression Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98        30
         1.0       0.00      0.00      0.00         1

    accuracy                           0.97        31
   macro avg       0.48      0.50      0.49        31
weighted avg       0.94      0.97      0.95        31

Logistic Regression Predictions: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression CV Accuracy: 0.954 ± 0.026
Logistic Regression CV ROC-AUC: 0.969 ± 0.045


In [16]:
# Train and evaluate SVM model
try:
    svm_pipeline.fit(X_train, y_train)
    y_pred_svm = svm_pipeline.predict(X_val)
    print("\nSVM Validation Accuracy:", accuracy_score(y_val, y_pred_svm))
    print("\nSVM Validation Classification Report:")
    print(classification_report(y_val, y_pred_svm))
    print("SVM Predictions:", y_pred_svm[:10])

    # Cross-validation
    cv_results_svm = cross_validate(svm_pipeline, X, y, cv=skf, scoring=['accuracy'], n_jobs=-1)
    cv_scores_svm = cv_results_svm['test_accuracy']

    print("SVM Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_svm.mean(), cv_scores_svm.std()))

except Exception as e:
    print(f"Error training the SVM model: {e}")



SVM Validation Accuracy: 0.967741935483871

SVM Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98        30
         1.0       0.00      0.00      0.00         1

    accuracy                           0.97        31
   macro avg       0.48      0.50      0.49        31
weighted avg       0.94      0.97      0.95        31

SVM Predictions: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Cross-Validation Accuracy: 0.955 ± 0.026


In [17]:
# Train and evaluate Random Forest model
try:
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_val)
    print("\nRandom Forest Validation Accuracy:", accuracy_score(y_val, y_pred_rf))
    print("\nRandom Forest Validation Classification Report:")
    print(classification_report(y_val, y_pred_rf))
    print("Random Forest Predictions:", y_pred_rf[:10])

    # Cross-validation
    cv_results_rf = cross_validate(rf_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_rf = cv_results_rf['test_accuracy']

    print("Random Forest Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_rf.mean(), cv_scores_rf.std()))

except Exception as e:
    print(f"Error training the Random Forest model: {e}")



Random Forest Validation Accuracy: 0.967741935483871

Random Forest Validation Classification Report:
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98        30
         1.0       0.00      0.00      0.00         1

    accuracy                           0.97        31
   macro avg       0.48      0.50      0.49        31
weighted avg       0.94      0.97      0.95        31

Random Forest Predictions: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Cross-Validation Accuracy: 0.955 ± 0.026


## Comparison Summary

In [18]:
# Store model results
results = {
    "Logistic Regression": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_lr),
        "Cross-Validation Accuracy Mean": cv_scores_lr_acc.mean(),
        "Cross-Validation Accuracy Std": cv_scores_lr_acc.std()
    },
    "SVM": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_svm),
        "Cross-Validation Accuracy Mean": cv_scores_svm.mean(),
        "Cross-Validation Accuracy Std": cv_scores_svm.std()
    },
    "Random Forest": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_rf),
        "Cross-Validation Accuracy Mean": cv_scores_rf.mean(),
        "Cross-Validation Accuracy Std": cv_scores_rf.std()
    }
}

# Display comparison results
print("\n=== Model Comparison Summary ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

# Determine best model based on average CV accuracy
best_model = max(results.items(), key=lambda item: item[1]["Cross-Validation Accuracy Mean"])
best_model_name = best_model[0]



=== Model Comparison Summary ===

Logistic Regression:
  Validation Accuracy: 0.9677
  Cross-Validation Accuracy Mean: 0.9544
  Cross-Validation Accuracy Std: 0.0261

SVM:
  Validation Accuracy: 0.9677
  Cross-Validation Accuracy Mean: 0.9546
  Cross-Validation Accuracy Std: 0.0257

Random Forest:
  Validation Accuracy: 0.9677
  Cross-Validation Accuracy Mean: 0.9546
  Cross-Validation Accuracy Std: 0.0257


## Select Best Model

In [19]:
best_model_name = max(results.items(), key=lambda m: m[1]["Cross-Validation Accuracy Mean"])[0]
print(f"\nBest Model Selected: {best_model_name}")


Best Model Selected: SVM


In [20]:
# Retrain the best model on the entire dataset
if best_model_name == "Logistic Regression":
    best_pipeline = logistic_pipeline
elif best_model_name == "SVM":
    best_pipeline = svm_pipeline
elif best_model_name == "Random Forest":
    best_pipeline = rf_pipeline

best_pipeline.fit(X, y)

In [21]:
import joblib

# Assuming your model is named 'model'
model = best_pipeline

# Save the model to a file
os.makedirs('trained_model', exist_ok=True)  # Create folder if it doesn't exist
joblib.dump(model, os.path.join('trained_model', 'ML_verB2_trained.pkl'))

['trained_model\\ML_verB2_trained.pkl']

## Final prediction using latest row (simulate next month)

In [22]:
# Assuming the last row of the original dataframe represents the latest available data
latest_data_point = rainfall_data.iloc[-1].copy()

# Create a dictionary for the next entry based on the required columns
next_entry_features = {
    'subdivisions': latest_data_point['subdivisions'],
    'year': latest_data_point['year'] + 1,
    'jan': latest_data_point['jan'],
    'feb': latest_data_point['feb'],
    'mar': latest_data_point['mar'],
    'apr': latest_data_point['apr'],
    'may': latest_data_point['may'],
    'jun': latest_data_point['jun'],
    'jul': latest_data_point['jul'],
    'aug': latest_data_point['aug'],
    'sep': latest_data_point['sep'],
    'oct': latest_data_point['oct'],
    'nov': latest_data_point['nov'],
    'dec': latest_data_point['dec'],
    'annual': latest_data_point['annual']
}

# Convert the dictionary to a pandas DataFrame with a single row
next_year_data = pd.DataFrame([next_entry_features])

# Ensure the columns are in the same order as the training data
next_year_data = next_year_data[X.columns]

# Predict the flood status for the next month using the best model
# 'best_pipeline' was determined and trained on the full data previously
next_year_flood_prediction = best_pipeline.predict(next_year_data)

# Print the prediction
print("\nPrediction for the next month's flood status (0: No Flood, 1: Flood):")
print(next_year_flood_prediction[0])

# You can optionally print the input data used for prediction
print("\nInput features used for next month prediction:")
next_year_data



Prediction for the next month's flood status (0: No Flood, 1: Flood):
0.0

Input features used for next month prediction:


Unnamed: 0,subdivisions,year,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec,annual
0,76.0,2021.0,14.8,15.0,3.1,4.4,35.05,48.0,222.9,160.8,117.0,50.65,5.05,17.3,694.05
