In [16]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
import joblib
import numpy as np

# Load the raw weather observations and parse the timestamp column.
df = pd.read_csv("Anurag University rainfall dataset.csv")
df['Date & Time'] = pd.to_datetime(df['Date & Time'])

# Rainfall recorded in the *following* row. This is NaN for the last row and
# for any row whose next reading is missing.
# NOTE(review): shift() works on row order, so this assumes the CSV is sorted
# chronologically with evenly spaced readings -- confirm against the data.
next_rain = df['Rain - mm'].shift(-1)

# Target: True when the next observation records any rain.
# `NaN > 0` evaluates to False (not NaN), so a dropna() on this boolean column
# can never remove rows with an unknown target; they would silently be
# labelled "no rain". Those rows are filtered out below using `next_rain`.
df['Rain Next 2 Hours'] = next_rain > 0

# Add lag features: rainfall of the previous one and two rows.
for lag in range(1, 3):
    df[f'Rain_Lag_{lag}'] = df['Rain - mm'].shift(lag)

# Keep only rows whose target is actually known, then discard rows with any
# remaining missing value (this also removes the first rows, which have no lags).
df = df[next_rain.notna()].dropna()

# Prepare features and target. The timestamp, the current rainfall reading and
# the target itself are excluded from the model inputs.
features = df.drop(['Date & Time', 'Rain - mm', 'Rain Next 2 Hours'], axis=1)
target = df['Rain Next 2 Hours']
categorical_cols = ['Prevailing Wind Direction', 'High Wind Direction']

# Column-wise preprocessing shared by every model:
#   * int64/float64 feature columns are standardised,
#   * the wind-direction columns are one-hot encoded; handle_unknown='ignore'
#     keeps transform() from raising on a category not seen during fit.
numeric_cols = features.select_dtypes(include=['int64', 'float64']).columns
column_steps = [
    ('num', StandardScaler(), numeric_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Candidate classifiers, keyed by the name used in log lines and output file
# names. All use library defaults except:
#   * LogisticRegression: max_iter raised to 1000,
#   * SVC: probability=True so predict_proba() is available for the ROC curves.
models = dict(
    RandomForest=RandomForestClassifier(),
    GradientBoosting=GradientBoostingClassifier(),
    LogisticRegression=LogisticRegression(max_iter=1000),
    KNeighbors=KNeighborsClassifier(),
    SVC=SVC(probability=True),
    DecisionTree=DecisionTreeClassifier(),
)

# Evaluation results for every model, keyed by model name.
model_results = {}

# Stratified 5-fold cross-validation: each fold keeps the class ratio of `target`.
# NOTE(review): the features include lagged rainfall, so if rows are
# time-ordered a forward-chaining split (e.g. TimeSeriesSplit) may give a more
# honest estimate -- confirm before relying on these scores.
skf = StratifiedKFold(n_splits=5)

# Fixed label order so every fold's confusion matrix has the same shape, even
# if a fold happens to miss one class; otherwise the matrices could not be summed.
class_labels = np.unique(target)

for model_name, model in models.items():
    print(f"Training {model_name}...")

    # Preprocessing + classifier in one estimator, so scaling/encoding
    # statistics are learned from the training split only.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    accuracies = []
    confusion_matrices = []
    roc_data = {'fpr': [], 'tpr': [], 'roc_auc': []}
    # Out-of-fold predictions: every row is predicted exactly once, by the
    # model that did NOT see it during training.
    oof_pred = np.empty(len(target), dtype=target.dtype)

    for train_index, test_index in skf.split(features, target):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]

        # Fit on the training split only.
        pipeline.fit(X_train, y_train)

        # Hard predictions for the metrics, plus the probability of the
        # second class in classifier.classes_ order for the ROC curve.
        y_pred = pipeline.predict(X_test)
        y_pred_prob = pipeline.predict_proba(X_test)[:, 1]
        oof_pred[test_index] = y_pred

        # Per-fold metrics.
        accuracies.append(accuracy_score(y_test, y_pred))
        confusion_matrices.append(
            confusion_matrix(y_test, y_pred, labels=class_labels)
        )

        # Per-fold ROC curve and its area.
        fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
        roc_data['fpr'].append(fpr)
        roc_data['tpr'].append(tpr)
        roc_data['roc_auc'].append(auc(fpr, tpr))

    # Classification report built from out-of-fold predictions only.
    # Predicting the full dataset with the last fold's model would score that
    # model mostly on rows it was trained on, inflating every metric.
    final_classification_report = classification_report(
        target, oof_pred, output_dict=True
    )

    # Save results for this model: mean fold accuracy, confusion matrix summed
    # over folds, per-fold ROC data and the out-of-fold classification report.
    model_results[model_name] = {
        'accuracy': sum(accuracies) / len(accuracies),
        'confusion_matrix': sum(confusion_matrices).tolist(),
        'roc_data': roc_data,
        'classification_report': final_classification_report
    }

    # Refit on all rows before persisting; otherwise the saved pipeline would
    # be the one left over from the last fold, trained on only 4/5 of the data.
    pipeline.fit(features, target)
    joblib.dump(pipeline, f'rain_prediction_pipeline_{model_name}.pkl')

# Save the model results
joblib.dump(model_results, 'rain_prediction_model_results.pkl')


Training RandomForest...
Training GradientBoosting...
Training LogisticRegression...
Training KNeighbors...
Training SVC...
Training DecisionTree...


['rain_prediction_model_results.pkl']