In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pickle

In [2]:
# Step 1: Load the dataset
data = pd.read_csv('rainfall.csv')
print("Dataset preview:")
print(data.head())
print("Columns:", data.columns.tolist())
print("Missing values per column:")
print(data.isnull().sum())

Dataset preview:
   day  pressure   maxtemp  temparature  mintemp  dewpoint  humidity   cloud   \
0    1     1025.9     19.9         18.3     16.8      13.1         72      49   
1    2     1022.0     21.7         18.9     17.2      15.6         81      83   
2    3     1019.7     20.3         19.3     18.0      18.4         95      91   
3    4     1018.9     22.3         20.6     19.1      18.8         90      88   
4    5     1015.9     21.3         20.7     20.2      19.9         95      81   

  rainfall  sunshine           winddirection  windspeed  
0      yes       9.3                    80.0       26.3  
1      yes       0.6                    50.0       15.3  
2      yes       0.0                    40.0       14.2  
3      yes       1.0                    50.0       16.9  
4      yes       0.0                    40.0       13.7  
Columns: ['day', 'pressure ', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity ', 'cloud ', 'rainfall', 'sunshine', '         winddirectio

In [3]:
# Step 2: Preprocessing
# Assuming the target column is 'rainfall' (binary: 0 or 1 for no rain/rain)
# Replace 'rainfall' with your actual target column name
target_column = 'rainfall'

In [4]:
# Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]

In [5]:
# Check class distribution
print("Class distribution before SMOTE:")
print(y.value_counts())

Class distribution before SMOTE:
rainfall
yes    249
no     117
Name: count, dtype: int64


In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
# Step 3: Impute missing values
imputer = SimpleImputer(strategy='mean')  # Replace NaNs with column mean
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

In [8]:
# Step 4: Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


In [9]:
# Step 5: Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print("Class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

Class distribution after SMOTE:
rainfall
no     198
yes    198
Name: count, dtype: int64


In [10]:
# Step 6: Dimensionality Reduction with PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_smote)
X_test_pca = pca.transform(X_test_scaled)
print(f"Number of PCA components: {pca.n_components_}")
print(f"Explained variance ratio: {sum(pca.explained_variance_ratio_):.4f}")

Number of PCA components: 6
Explained variance ratio: 0.9730


In [11]:
# Step 7: Model Selection with Hyperparameter Tuning
model = LogisticRegression(random_state=42, max_iter=1000)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}

In [12]:
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_pca, y_train_smote)


In [13]:
# Best model
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation accuracy:", grid_search.best_score_)

Best parameters: {'C': 0.1, 'solver': 'lbfgs'}
Best cross-validation accuracy: 0.8309810126582278


In [15]:
# Step 8: Evaluate the model on test set
y_pred = best_model.predict(X_test_pca)
print("Test set accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Test set accuracy: 0.7702702702702703
Classification Report:
               precision    recall  f1-score   support

          no       0.64      0.61      0.62        23
         yes       0.83      0.84      0.83        51

    accuracy                           0.77        74
   macro avg       0.73      0.73      0.73        74
weighted avg       0.77      0.77      0.77        74



In [16]:
# Step 9: Save the pipeline components
with open('imputer.pkl', 'wb') as f:
    pickle.dump(imputer, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('pca.pkl', 'wb') as f:
    pickle.dump(pca, f)
with open('rainfall_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("Model and preprocessing components saved successfully!")

Model and preprocessing components saved successfully!


In [17]:
# Step 10: Prediction function for new data
def predict_rainfall(new_data):
    # new_data should be a DataFrame with the same feature columns as X
    new_data_imputed = imputer.transform(new_data)
    new_data_scaled = scaler.transform(new_data_imputed)
    new_data_pca = pca.transform(new_data_scaled)
    prediction = best_model.predict(new_data_pca)
    return prediction