In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.pipeline import Pipeline as ImbPipeline  
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, recall_score
from imblearn.over_sampling import SMOTE
import time

In [3]:
# Load the weather observations (path is relative to the notebook's working
# directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='weather_forecast_data.csv')
df.head(n=5)

Unnamed: 0,Temperature,Humidity,Wind_Speed,Cloud_Cover,Pressure,Rain
0,23.720338,89.592641,7.335604,50.501694,1032.378759,rain
1,27.879734,46.489704,5.952484,4.990053,992.61419,no rain
2,25.069084,83.072843,1.371992,14.855784,1007.23162,no rain
3,23.62208,74.367758,7.050551,67.255282,982.632013,rain
4,20.59137,96.858822,4.643921,47.676444,980.825142,no rain


In [4]:
# Inspect column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Temperature  2500 non-null   float64
 1   Humidity     2500 non-null   float64
 2   Wind_Speed   2500 non-null   float64
 3   Cloud_Cover  2500 non-null   float64
 4   Pressure     2500 non-null   float64
 5   Rain         2500 non-null   object 
dtypes: float64(5), object(1)
memory usage: 117.3+ KB


In [6]:
# Class distribution of the target; the recorded output below shows a strong
# imbalance (2186 "no rain" vs 314 "rain").
df["Rain"].value_counts()

Rain
no rain    2186
rain        314
Name: count, dtype: int64

In [12]:
# Split the frame into the target vector and the feature matrix
# (every column except the target).
y = df['Rain']
X = df.drop('Rain', axis='columns')

In [13]:
# Encode the string target as integers. The encoder is fitted on the original
# column rather than on `y` itself so this cell is idempotent: re-running
# `y = le.fit_transform(y)` would refit the encoder on already-encoded
# integers and lose the "no rain"/"rain" class names that later cells look up
# via `le.transform(["rain"])` and `le.classes_`.
le = LabelEncoder()
y = le.fit_transform(df['Rain'])

In [37]:
# Integer code the encoder assigned to the "rain" class; used later as the
# positive label when computing recall.
(rain_label,) = le.transform(["rain"])
rain_label

1

In [15]:
# Sanity check: class counts after encoding. `y` is the encoder's array output
# at this point, so it is wrapped in a Series to get value_counts().
pd.Series(y).value_counts()

0    2186
1     314
Name: count, dtype: int64

In [16]:
# Hold out 20% of the rows for testing. Stratifying on the target keeps the
# rain / no-rain ratio the same in both partitions; the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=df['Rain'],
)

In [48]:
# Candidate classifiers, keyed by the display name used in the reports.
# random_state is pinned on the estimators that involve randomness (SVC's
# probability calibration, the forest's bootstrapping/feature sampling;
# XGBoost is seeded as well for consistency) so that Restart & Run All
# reproduces the same metrics.
models = {
    "Logistic Regression": LogisticRegression(class_weight="balanced", max_iter=1000),
    "SVM": SVC(kernel="rbf", class_weight="balanced", probability=True, random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42),
    "XGBoost": XGBClassifier(n_estimators=300, eval_metric="logloss", random_state=42)
}

# Per-model metric rows (one dict per model), filled in by the evaluation loop.
results = []

In [None]:
# Evaluate every candidate with the same Scale → SMOTE → Model pipeline.
# `results` is reset here so that re-running this cell replaces the summary
# rows instead of appending duplicates to a list created in an earlier cell.
results = []

for name, model in models.items():
    # Build pipeline: Scale → SMOTE → Model.
    # The imblearn pipeline applies the SMOTE sampler only while fitting, so
    # the test set is scaled but never oversampled.
    pipe = ImbPipeline(steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("model", model)
    ])

    # Fit with timing. perf_counter() is a monotonic, high-resolution clock;
    # time.time() follows the wall clock and is unreliable for fits that take
    # only a few milliseconds.
    start = time.perf_counter()
    pipe.fit(X_train, y_train)
    end = time.perf_counter()

    # Predictions & headline metrics on the held-out split
    y_pred = pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, pos_label=rain_label)
    print(f"{name} Classification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))
    print(f"{name} Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

    # Save one summary row per model
    results.append({
        "Model": name,
        "Fit Time (s)": round(end - start, 3),
        "Accuracy": round(acc, 3),
        "Recall (Rain)": round(rec, 3)
    })

Logistic Regression Classification Report:
               precision    recall  f1-score   support

     no rain       0.99      0.89      0.94       437
        rain       0.55      0.92      0.69        63

    accuracy                           0.89       500
   macro avg       0.77      0.91      0.81       500
weighted avg       0.93      0.89      0.90       500

Logistic Regression Confusion Matrix:
 [[389  48]
 [  5  58]]
SVM Classification Report:
               precision    recall  f1-score   support

     no rain       1.00      0.95      0.97       437
        rain       0.73      0.98      0.84        63

    accuracy                           0.95       500
   macro avg       0.86      0.97      0.90       500
weighted avg       0.96      0.95      0.95       500

SVM Confusion Matrix:
 [[414  23]
 [  1  62]]
Random Forest Classification Report:
               precision    recall  f1-score   support

     no rain       1.00      1.00      1.00       437
        rain       

In [50]:
# Collect the per-model metric rows into a single comparison table and show it.
results_df = pd.DataFrame(data=results)
results_df

Unnamed: 0,Model,Fit Time (s),Accuracy,Recall (Rain)
0,Logistic Regression,0.009,0.894,0.921
1,SVM,0.288,0.952,0.984
2,Random Forest,0.882,0.998,1.0
3,XGBoost,0.071,0.998,1.0
