In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

**Data Collection and Processing**

In [56]:
# Load the raw rainfall dataset; the relative path is resolved against the
# notebook's working directory.
data = pd.read_csv("Rainfall.csv")

In [None]:
print(type(data))

In [None]:
data.shape

In [None]:
data.head()

In [None]:
data.tail()

In [None]:
data["day"].unique()

In [None]:
# Call the method: the bare attribute `data.info` only echoes the bound-method
# repr instead of printing the column / dtype / non-null summary.
data.info()

In [None]:
print("Data Info:\n")
data.info()

In [None]:
data.columns

In [None]:
# Trim leading/trailing whitespace from every column label so later
# lookups by name match exactly.
data = data.rename(columns=str.strip)

In [None]:
data.columns

In [None]:
print("Data Info:\n")
data.info()

In [68]:
# Remove the "day" column from the working frame.
# errors="ignore" keeps the cell idempotent: re-running it after the column
# is already gone is a no-op instead of a KeyError.
data = data.drop(columns=["day"], errors="ignore")

In [None]:
data.head()

In [None]:
print(data.isnull().sum())

In [None]:
data["winddirection"].unique()

In [72]:
# Fill the gaps in the two wind columns in one pass:
# most frequent value for the direction, median for the speed.
data = data.fillna(
    {
        "winddirection": data["winddirection"].mode()[0],
        "windspeed": data["windspeed"].median(),
    }
)

In [None]:
print(data.isnull().sum())

In [None]:
data["rainfall"].unique()

In [75]:
# Encode the target: "yes" -> 1, "no" -> 0.
# - The dtype guard makes the cell safe to re-run; without it a second run
#   maps the already-encoded integers to NaN.
# - Labels are trimmed and lower-cased first so stray whitespace or
#   capitalisation in the CSV does not silently turn into NaN either.
if not pd.api.types.is_numeric_dtype(data["rainfall"]):
    data["rainfall"] = (
        data["rainfall"].str.strip().str.lower().map({"yes": 1, "no": 0})
    )

In [None]:
data.head()

**Exploratory Data Analysis (EDA)**

In [None]:
data.shape

In [78]:
# Apply the white-grid theme to all following plots.
# set_theme() is the preferred entry point; sns.set() is only kept by seaborn
# as a legacy alias for it.
sns.set_theme(style="whitegrid")

In [None]:
data.describe()

In [None]:
data.columns

In [None]:
# One histogram with a KDE overlay per feature, laid out on a 3x3 grid.
fig, axes = plt.subplots(3, 3, figsize=(15, 10))

hist_columns = ['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity','cloud', 'sunshine', 'windspeed']
for ax, column in zip(axes.ravel(), hist_columns):
    sns.histplot(data[column], kde=True, ax=ax)
    ax.set_title(f"Distribution of {column}")

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of how many rows fall into each rainfall class.
fig, ax = plt.subplots(figsize=(5, 4))
sns.countplot(data=data, x="rainfall", ax=ax)
ax.set_title("Distribution of Rainfall")
plt.show()

In [None]:
# Pairwise correlation between all columns, annotated to two decimals.
correlations = data.corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation heatmap")
plt.show()

In [None]:
# One boxplot per feature on a 3x3 grid, to eyeball spread and outliers.
plt.figure(figsize=(15, 10))

for i, column in enumerate(['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity','cloud', 'sunshine', 'windspeed'], 1):
    plt.subplot(3, 3, i)
    # Pass the series by keyword: the meaning of the first positional argument
    # of sns.boxplot changed between seaborn releases (x in 0.11, data from
    # 0.12 on), so a positional call renders differently depending on the
    # installed version. `data=` matches what current releases do with it.
    sns.boxplot(data=data[column])
    plt.title(f"Boxplot of {column}")

plt.tight_layout()
plt.show()

**Data Preprocessing**

In [85]:
# Drop the three temperature columns (flagged as highly correlated).
# errors="ignore" keeps the cell idempotent: a second run is a no-op instead
# of raising KeyError for columns that are already gone.
data = data.drop(columns=['maxtemp', 'temparature', 'mintemp'], errors="ignore")

In [None]:
data.head()

In [None]:
print(data["rainfall"].value_counts())

In [None]:
# Split the rows by target class for down-sampling.
# The majority/minority labels are read from the actual class counts instead
# of assuming that 1 is the larger class: with the roles swapped, the later
# resample(replace=False, n_samples=len(df_minority)) would ask for more rows
# than df_majority holds and raise.
# value_counts() sorts by frequency (descending), so the first label is the
# most common one and the last label the least common one; taking both ends
# of the index also keeps the two labels distinct when the counts are tied.
class_counts = data["rainfall"].value_counts()
df_majority = data[data["rainfall"] == class_counts.index[0]]
df_minority = data[data["rainfall"] == class_counts.index[-1]]

In [None]:
print(df_majority.shape)
print(df_minority.shape)

In [None]:
# Randomly keep as many majority rows as there are minority rows
# (sampling without replacement, fixed seed for reproducibility).
df_majority_downsampled = resample(
    df_majority,
    n_samples=df_minority.shape[0],
    replace=False,
    random_state=42,
)

In [None]:
df_majority_downsampled.shape

In [92]:
# Stack the down-sampled majority rows on top of the minority rows.
df_downsampled = pd.concat(
    [df_majority_downsampled, df_minority],
    axis=0,
)

In [None]:
df_downsampled.shape

In [None]:
df_downsampled.head()

In [None]:
# Shuffle all rows (frac=1) with a fixed seed, then renumber the index from 0
# so the two class blocks are no longer stored back to back.
df_downsampled = (
    df_downsampled
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
df_downsampled.head()

In [None]:
df_downsampled["rainfall"].value_counts()

In [None]:
# Target vector and feature matrix (every column except the target).
y = df_downsampled["rainfall"]
X = df_downsampled.drop("rainfall", axis=1)

In [None]:
print(X)

In [None]:
print(y)

In [None]:
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# stratify=y keeps the class proportions the same in both partitions; with an
# unstratified split of a small dataset the test fold can drift away from
# the class balance that was just established by down-sampling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

**Model Training**

In [102]:
# Base estimator; the seed makes the fitted forest reproducible.
rf_model = RandomForestClassifier(random_state=42)

# Hyper-parameter values to be tried exhaustively by the grid search.
param_grid_rf = dict(
    n_estimators=[50, 100, 200],
    max_features=["sqrt", "log2"],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Exhaustive search over param_grid_rf with 5-fold cross-validation,
# using every available core (n_jobs=-1) and verbose progress logging.
grid_search_rf = GridSearchCV(
    rf_model,
    param_grid_rf,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

grid_search_rf.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameters and keep the corresponding fitted model.
print("best parameters for Random Forest:", grid_search_rf.best_params_)

best_rf_model = grid_search_rf.best_estimator_

**Model Evaluation**

In [None]:
# 5-fold cross-validation of the selected model on the training partition.
# NOTE(review): the hyper-parameters were tuned on this same X_train, so these
# scores are likely optimistic; the held-out test set is the cleaner estimate.
cv_scores = cross_val_score(estimator=best_rf_model, X=X_train, y=y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())

In [None]:
# Score the tuned model on the held-out test partition.
y_pred = best_rf_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Test set Accuracy:", test_accuracy)
print("Test set Confusion Matrix:\n", test_confusion)
print("Classification Report:\n", test_report)

**Save the trained model with pickle**

In [None]:
# Bundle the tuned model with the feature column order it was trained on,
# then serialise the bundle to rainfall_prediction_model.pkl.
model_data = dict(model=best_rf_model, feature_names=list(X.columns))
with open("rainfall_prediction_model.pkl", mode="wb") as model_file:
    pickle.dump(model_data, model_file)