# Machine Learning Using Logistic Regression, SVM and Random Forest Models

## Import necessary libraries

In [39]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from sklearn.model_selection import train_test_split, cross_validate, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

## Load Dataset

In [40]:
import os
# Load dataset
dataset_path = r'..\..\datasets\B1_Monthly_Rainfall.csv'
rainfall_data = pd.read_csv(dataset_path)

In [41]:
#Display the first few rows of the dataset
print("Dataset Preview:")
print(rainfall_data.head())

Dataset Preview:
                 SUBDIVISION  YEAR   JAN    FEB   MAR    APR    MAY    JUN  \
0  ANDAMAN & NICOBAR ISLANDS  1901  49.2   87.1  29.2    2.3  528.8  517.5   
1  ANDAMAN & NICOBAR ISLANDS  1902   0.0  159.8  12.2    0.0  446.1  537.1   
2  ANDAMAN & NICOBAR ISLANDS  1903  12.7  144.0   0.0    1.0  235.1  479.9   
3  ANDAMAN & NICOBAR ISLANDS  1904   9.4   14.7   0.0  202.4  304.5  495.1   
4  ANDAMAN & NICOBAR ISLANDS  1905   1.3    0.0   3.3   26.9  279.5  628.7   

     JUL    AUG    SEP    OCT    NOV    DEC  ANNUAL Flood  
0  365.1  481.1  332.6  388.5  558.2   33.6  3373.2   Yes  
1  228.9  753.7  666.2  197.2  359.0  160.5  3520.7   Yes  
2  728.4  326.7  339.0  181.2  284.4  225.0  2957.4    No  
3  502.0  160.1  820.4  222.2  308.7   40.1  3079.6   Yes  
4  368.7  330.5  297.0  260.7   25.4  344.7  2566.7    No  


In [42]:
# Basic information
print("\nDataset Info:")
print(rainfall_data.info())


Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4090 entries, 0 to 4089
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   SUBDIVISION  4090 non-null   object 
 1   YEAR         4090 non-null   int64  
 2   JAN          4090 non-null   float64
 3   FEB          4090 non-null   float64
 4   MAR          4090 non-null   float64
 5   APR          4090 non-null   float64
 6   MAY          4090 non-null   float64
 7   JUN          4090 non-null   float64
 8   JUL          4090 non-null   float64
 9   AUG          4090 non-null   float64
 10  SEP          4090 non-null   float64
 11  OCT          4090 non-null   float64
 12  NOV          4090 non-null   float64
 13  DEC          4090 non-null   float64
 14  ANNUAL       4090 non-null   float64
 15  Flood        4090 non-null   object 
dtypes: float64(13), int64(1), object(2)
memory usage: 511.4+ KB
None


In [43]:
# Summary statistics
print("\nStatistical Summary:")
print(rainfall_data.describe())

# Check for missing values
print("\nMissing Values:")
print(rainfall_data.isnull().sum())


Statistical Summary:
              YEAR          JAN          FEB          MAR          APR  \
count  4090.000000  4090.000000  4090.000000  4090.000000  4090.000000   
mean   1958.321271    18.818484    21.644792    27.252494    42.714548   
std      33.148944    33.521719    35.762010    46.829179    67.264863   
min    1901.000000     0.000000     0.000000     0.000000     0.000000   
25%    1930.000000     0.600000     0.600000     1.000000     3.000000   
50%    1959.000000     5.900000     6.600000     7.800000    15.500000   
75%    1987.000000    21.950000    26.600000    31.100000    49.375000   
max    2015.000000   583.700000   403.500000   605.600000   595.100000   

               MAY          JUN          JUL          AUG          SEP  \
count  4090.000000  4090.000000  4090.000000  4090.000000  4090.000000   
mean     84.868044   228.928020   346.496968   289.897506   197.003056   
std     122.556801   233.535693   269.352685   187.702293   135.266708   
min       0.000

In [44]:
# Value counts for target variable (if applicable)
if 'Flood' in rainfall_data.columns:
    print("\nFlood value counts:")
    print(rainfall_data['Flood'].value_counts())


Flood value counts:
Flood
No     3756
Yes     334
Name: count, dtype: int64


## Data Cleaning and Encoding

In [45]:
# Normalize all column names to lowercase and strip whitespace/punctuation for consistency
rainfall_data.columns = [col.lower().strip().replace('?', '') for col in rainfall_data.columns]

# Accept both 'flood' and 'flood?' as target column
target_col = None
if 'flood' in rainfall_data.columns:
    target_col = 'flood'
elif 'flood?' in [col + '?' for col in rainfall_data.columns]:
    target_col = 'flood?'
else:
    raise ValueError("The dataset does not contain a 'Flood' or 'Flood?' column for the target variable.")

# Encode the target column
rainfall_data[target_col] = (
    rainfall_data[target_col]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({'yes': 1, 'no': 0, '': 0, 'nan': 0})
    .astype(float)  # Convert to float first to handle '1.0', '0.0'
    .astype(int)    # Then to int
)

# If you want to always use 'flood' as the column name for the rest of your code:
if target_col != 'flood':
    rainfall_data.rename(columns={target_col: 'flood'}, inplace=True)
    target_col = 'flood'

  .replace({'yes': 1, 'no': 0, '': 0, 'nan': 0})


In [46]:
# Encode non-numeric features (excluding target)
non_numeric_columns = rainfall_data.select_dtypes(include=['object']).columns.drop('flood', errors='ignore')
if len(non_numeric_columns) > 0:
    print(f"\nEncoding non-numeric columns: {non_numeric_columns.tolist()}")
    for column in non_numeric_columns:
        rainfall_data[column] = LabelEncoder().fit_transform(rainfall_data[column].astype(str))
else:
    print("\nNo non-numeric columns to encode.")


Encoding non-numeric columns: ['subdivision']


In [47]:
# Fill missing values with median (safer for numeric data)
rainfall_data.fillna(rainfall_data.median(numeric_only=True), inplace=True)

### Sort by Station_Names

In [48]:
lag = 1

# 1. Sort data by station and date/order
rainfall_data = rainfall_data.sort_values(by=['subdivision', 'year'])

# 2. Group by Station_Names
grouped = rainfall_data.groupby('subdivision')

# 3. Shift the 'flood' column (or other features) within each group
# For example, to predict next day's flood based on today's data:
rainfall_data['flood_next_day'] = grouped['flood'].shift(-lag)

# 4. Drop rows with NaN in 'Flood_next_day' (last row of each station group)
rainfall_data = rainfall_data.dropna(subset=['flood_next_day'])

# Now, df contains data with the lagged target, correctly aligned within each station

In [49]:
pd.DataFrame(grouped).shape, rainfall_data.shape

((36, 2), (4054, 17))

## Data Splitting

In [50]:
import pandas as pd

X = rainfall_data.drop(columns=['flood', 'flood_next_day'], axis=1)
y = rainfall_data['flood_next_day']

from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)

## Define Models

In [51]:
# Define pipelines for Logistic Regression
logistic_pipeline = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(C=0.5, penalty='l2', solver='liblinear', class_weight='balanced', random_state=42))
])

# Define pipeline for SVM
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LinearSVC(C=1.0, max_iter=2000, dual=False, class_weight='balanced', random_state=42))
])

# Define pipeline for Random Forest
rf_pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42))
])


## Model Evaluation

In [52]:
# Train and evaluate Logistic Regression model
try:
    logistic_pipeline.fit(X_train, y_train)
    y_pred_lr = logistic_pipeline.predict(X_val)
    print("\nLogistic Regression Validation Accuracy:", accuracy_score(y_val, y_pred_lr))
    print("\nLogistic Regression Validation Classification Report:")
    print(classification_report(y_val, y_pred_lr))
    print("Logistic Regression Predictions:", y_pred_lr[:10])

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    cv_results_lr = cross_validate(logistic_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_lr_acc = cv_results_lr['test_accuracy']
    cv_scores_lr_auc = cv_results_lr['test_roc_auc']

    print("Logistic Regression CV Accuracy: %.3f ± %.3f" % (cv_scores_lr_acc.mean(), cv_scores_lr_acc.std()))
    print("Logistic Regression CV ROC-AUC: %.3f ± %.3f" % (cv_scores_lr_auc.mean(), cv_scores_lr_auc.std()))

except Exception as e:
    print(f"Error training the Logistic Regression model: {e}")



Logistic Regression Validation Accuracy: 0.9901356350184957

Logistic Regression Validation Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00       811
         1.0       0.00      0.00      0.00         0

    accuracy                           0.99       811
   macro avg       0.50      0.50      0.50       811
weighted avg       1.00      0.99      1.00       811

Logistic Regression Predictions: [0. 1. 1. 1. 1. 0. 1. 1. 0. 1.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression CV Accuracy: 0.893 ± 0.006
Logistic Regression CV ROC-AUC: 0.966 ± 0.004


In [53]:
# Train and evaluate SVM model
try:
    svm_pipeline.fit(X_train, y_train)
    y_pred_svm = svm_pipeline.predict(X_val)
    print("\nSVM Validation Accuracy:", accuracy_score(y_val, y_pred_svm))
    print("\nSVM Validation Classification Report:")
    print(classification_report(y_val, y_pred_svm))
    print("SVM Predictions:", y_pred_svm[:10])

    # Cross-validation
    cv_results_svm = cross_validate(svm_pipeline, X, y, cv=skf, scoring=['accuracy'], n_jobs=-1)
    cv_scores_svm = cv_results_svm['test_accuracy']

    print("SVM Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_svm.mean(), cv_scores_svm.std()))

except Exception as e:
    print(f"Error training the SVM model: {e}")



SVM Validation Accuracy: 0.9889025893958077

SVM Validation Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99       811
         1.0       0.00      0.00      0.00         0

    accuracy                           0.99       811
   macro avg       0.50      0.49      0.50       811
weighted avg       1.00      0.99      0.99       811

SVM Predictions: [1. 1. 1. 1. 1. 0. 1. 1. 0. 1.]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


SVM Cross-Validation Accuracy: 0.889 ± 0.006


In [54]:
# Train and evaluate Random Forest model
try:
    rf_pipeline.fit(X_train, y_train)
    y_pred_rf = rf_pipeline.predict(X_val)
    print("\nRandom Forest Validation Accuracy:", accuracy_score(y_val, y_pred_rf))
    print("\nRandom Forest Validation Classification Report:")
    print(classification_report(y_val, y_pred_rf))
    print("Random Forest Predictions:", y_pred_rf[:10])

    # Cross-validation
    cv_results_rf = cross_validate(rf_pipeline, X, y, cv=skf, scoring=['accuracy', 'roc_auc'], n_jobs=-1)
    cv_scores_rf = cv_results_rf['test_accuracy']

    print("Random Forest Cross-Validation Accuracy: %.3f ± %.3f" % (cv_scores_rf.mean(), cv_scores_rf.std()))

except Exception as e:
    print(f"Error training the Random Forest model: {e}")



Random Forest Validation Accuracy: 1.0

Random Forest Validation Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       811

    accuracy                           1.00       811
   macro avg       1.00      1.00      1.00       811
weighted avg       1.00      1.00      1.00       811

Random Forest Predictions: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Random Forest Cross-Validation Accuracy: 0.942 ± 0.009


## Comparison Summary

In [55]:
# Store model results
results = {
    "Logistic Regression": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_lr),
        "Cross-Validation Accuracy Mean": cv_scores_lr_acc.mean(),
        "Cross-Validation Accuracy Std": cv_scores_lr_acc.std()
    },
    "SVM": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_svm),
        "Cross-Validation Accuracy Mean": cv_scores_svm.mean(),
        "Cross-Validation Accuracy Std": cv_scores_svm.std()
    },
    "Random Forest": {
        "Validation Accuracy": accuracy_score(y_val, y_pred_rf),
        "Cross-Validation Accuracy Mean": cv_scores_rf.mean(),
        "Cross-Validation Accuracy Std": cv_scores_rf.std()
    }
}

# Display comparison results
print("\n=== Model Comparison Summary ===")
for model_name, metrics in results.items():
    print(f"\n{model_name}:")
    for metric_name, value in metrics.items():
        print(f"  {metric_name}: {value:.4f}")

# Determine best model based on average CV accuracy
best_model = max(results.items(), key=lambda item: item[1]["Cross-Validation Accuracy Mean"])
best_model_name = best_model[0]



=== Model Comparison Summary ===

Logistic Regression:
  Validation Accuracy: 0.9901
  Cross-Validation Accuracy Mean: 0.8934
  Cross-Validation Accuracy Std: 0.0058

SVM:
  Validation Accuracy: 0.9889
  Cross-Validation Accuracy Mean: 0.8890
  Cross-Validation Accuracy Std: 0.0063

Random Forest:
  Validation Accuracy: 1.0000
  Cross-Validation Accuracy Mean: 0.9418
  Cross-Validation Accuracy Std: 0.0089


## Select Best Model

In [56]:
best_model_name = max(results.items(), key=lambda m: m[1]["Cross-Validation Accuracy Mean"])[0]
print(f"\nBest Model Selected: {best_model_name}")


Best Model Selected: Random Forest


In [57]:
# Retrain the best model on the entire dataset
if best_model_name == "Logistic Regression":
    best_pipeline = logistic_pipeline
elif best_model_name == "SVM":
    best_pipeline = svm_pipeline
elif best_model_name == "Random Forest":
    best_pipeline = rf_pipeline

best_pipeline.fit(X, y)

In [58]:
import joblib

# Assuming your model is named 'model'
model = best_pipeline

# Save the model to a file
os.makedirs('trained_model', exist_ok=True)  # Create folder if it doesn't exist
joblib.dump(model, os.path.join('trained_model', 'ML_verB1_trained.pkl'))

['trained_model\\ML_verB1_trained.pkl']

## Final prediction using latest row (simulate next month)

In [59]:
# Assuming the last row of the original dataframe represents the latest available data
latest_data_point = rainfall_data.iloc[-1].copy()

# Create a dictionary for the next entry based on the required columns
next_entry_features = {
    'subdivision': latest_data_point['subdivision'],
    'year': latest_data_point['year'] + 1,
    'jan': latest_data_point['jan'],
    'feb': latest_data_point['feb'],
    'mar': latest_data_point['mar'],
    'apr': latest_data_point['apr'],
    'may': latest_data_point['may'],
    'jun': latest_data_point['jun'],
    'jul': latest_data_point['jul'],
    'aug': latest_data_point['aug'],
    'sep': latest_data_point['sep'],
    'oct': latest_data_point['oct'],
    'nov': latest_data_point['nov'],
    'dec': latest_data_point['dec'],
    'annual': latest_data_point['annual']
}

# Convert the dictionary to a pandas DataFrame with a single row
next_year_data = pd.DataFrame([next_entry_features])

# Ensure the columns are in the same order as the training data
next_year_data = next_year_data[X.columns]

# Predict the flood status for the next month using the best model
# 'best_pipeline' was determined and trained on the full data previously
next_year_flood_prediction = best_pipeline.predict(next_year_data)

# Print the prediction
print("\nPrediction for the next month's flood status (0: No Flood, 1: Flood):")
print(next_year_flood_prediction[0])

# You can optionally print the input data used for prediction
print("\nInput features used for next month prediction:")
next_year_data



Prediction for the next month's flood status (0: No Flood, 1: Flood):
0.0

Input features used for next month prediction:


Unnamed: 0,subdivision,year,jan,feb,mar,apr,may,jun,jul,aug,sep,oct,nov,dec,annual
0,35.0,2015.0,48.3,29.4,22.6,5.3,11.0,22.0,151.6,81.0,84.7,14.6,0.0,16.3,486.9
