<a href="https://colab.research.google.com/github/DeepikaBantu/Project-01/blob/main/RFP_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from google.colab import files

uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# Load CSV with semicolon separator
df = pd.read_csv(file_name, sep=';')

# Remove leading/trailing spaces from column names
df.columns = df.columns.str.strip()

# Display first few rows
df.head()


Saving Indian Rainfall Dataset District-wise Daily Measurements.csv to Indian Rainfall Dataset District-wise Daily Measurements (1).csv


Unnamed: 0,state,district,month,1st,2nd,3rd,4th,5th,6th,7th,...,22nd,23rd,24th,25th,26th,27th,28th,29th,30th,31st
0,Andaman & Nicobar,Nicobars,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Andaman & Nicobar,North And Middle Andaman,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Andaman & Nicobar,South Andamans,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Andhra Pradesh,Anantapur,1,0.379965,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.205834,0.671002,0.028994,0.0,0.0,0.0
4,Andhra Pradesh,Chittoor,1,14.1449,0.857263,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.615237,0.925098,0.507623,0.424843,0.737394,0.0,0.0,0.0


In [4]:
# List of day columns
day_cols = [str(i)+'st' if i==1 else str(i)+'nd' if i==2 else str(i)+'rd' if i==3 else str(i)+'th' for i in range(1,32)]

# Keep only day columns that exist in df.columns
day_cols = [col for col in day_cols if col in df.columns]

# Melt the dataframe
df_long = df.melt(id_vars=['state', 'district', 'month'],
                  value_vars=day_cols,
                  var_name='Day',
                  value_name='RainfallToday')

# Drop rows with NaN rainfall
df_long = df_long.dropna(subset=['RainfallToday']).reset_index(drop=True)

df_long.head()


Unnamed: 0,state,district,month,Day,RainfallToday
0,Andaman & Nicobar,Nicobars,1,1st,0.0
1,Andaman & Nicobar,North And Middle Andaman,1,1st,0.0
2,Andaman & Nicobar,South Andamans,1,1st,0.0
3,Andhra Pradesh,Anantapur,1,1st,0.379965
4,Andhra Pradesh,Chittoor,1,1st,14.1449


In [5]:
# Step 3: Create Target Column and Prepare Features

import numpy as np

# Sort the data by district and month to ensure correct shift
df_long = df_long.sort_values(by=['state', 'district', 'month', 'Day']).reset_index(drop=True)

# Create 'RainTomorrow' by shifting 'RainfallToday' within each district
df_long['RainTomorrow'] = df_long.groupby(['state', 'district'])['RainfallToday'].shift(-1)

# Convert to binary: 1 if rain > 0, else 0
df_long['RainTomorrow'] = (df_long['RainTomorrow'] > 0).astype(int)

# Drop rows where 'RainTomorrow' is NaN (last day of each district)
df_long = df_long.dropna(subset=['RainTomorrow']).reset_index(drop=True)

# Select features - you can add other columns if available like Temperature, Humidity, etc.
# For now, we only have RainfallToday as numeric feature
X = df_long[['RainfallToday']]
y = df_long['RainTomorrow']

# Display first few rows
print("✅ Features (X) and Target (y) ready for modeling:")
print(X.head())
print("\nTarget values (y):")
print(y.head())


✅ Features (X) and Target (y) ready for modeling:
   RainfallToday
0            0.0
1            0.0
2            0.0
3            0.0
4            0.0

Target values (y):
0    0
1    0
2    0
3    0
4    0
Name: RainTomorrow, dtype: int64


In [6]:
# Step 4: Train ML Models (Random Forest + XGBoost)

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Train XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Evaluate models
for name, model in [("Random Forest", rf_model), ("XGBoost", xgb_model)]:
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {acc:.2f}")
    print(classification_report(y_test, y_pred))

# Save models
joblib.dump(rf_model, "rf_model_imd.pkl")
joblib.dump(xgb_model, "xgb_model_imd.pkl")

print("\n✅ Models trained and saved successfully!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Random Forest Accuracy: 0.77
              precision    recall  f1-score   support

           0       0.75      0.87      0.81     25975
           1       0.81      0.65      0.72     21491

    accuracy                           0.77     47466
   macro avg       0.78      0.76      0.76     47466
weighted avg       0.78      0.77      0.77     47466


XGBoost Accuracy: 0.81
              precision    recall  f1-score   support

           0       0.83      0.83      0.83     25975
           1       0.79      0.79      0.79     21491

    accuracy                           0.81     47466
   macro avg       0.81      0.81      0.81     47466
weighted avg       0.81      0.81      0.81     47466


✅ Models trained and saved successfully!


In [7]:
# Step 5: Feature Engineering from Rainfall

import numpy as np
import pandas as pd

# Ensure dataset is sorted by state, district, month, day
df_long = df_long.sort_values(by=['state', 'district', 'month', 'Day']).reset_index(drop=True)

# 1. Previous day rainfall (lag-1)
df_long['Rainfall_lag1'] = df_long.groupby(['state', 'district'])['RainfallToday'].shift(1)

# 2. Previous 2-day average rainfall
df_long['Rainfall_lag2avg'] = df_long.groupby(['state', 'district'])['RainfallToday'].shift(1).rolling(2).mean().reset_index(0, drop=True)

# 3. Previous 3-day average rainfall
df_long['Rainfall_lag3avg'] = df_long.groupby(['state', 'district'])['RainfallToday'].shift(1).rolling(3).mean().reset_index(0, drop=True)

# 4. Encode month as categorical variable
df_long['Month'] = df_long['month'].astype(str)
df_long = pd.get_dummies(df_long, columns=['Month'], drop_first=True)

# Drop rows with NaN created by lag/rolling features
df_long = df_long.dropna().reset_index(drop=True)

# Features: RainfallToday + lag features + month dummies
feature_cols = ['RainfallToday', 'Rainfall_lag1', 'Rainfall_lag2avg', 'Rainfall_lag3avg'] + [col for col in df_long.columns if 'Month_' in col]
X = df_long[feature_cols]
y = df_long['RainTomorrow']

# Check first few rows
print("✅ Engineered features ready for modeling:")
print(X.head())
print("\nTarget values (y):")
print(y.head())


✅ Engineered features ready for modeling:
   RainfallToday  Rainfall_lag1  Rainfall_lag2avg  Rainfall_lag3avg  Month_10  \
0            0.0            0.0               0.0               0.0      True   
1            0.0            0.0               0.0               0.0      True   
2            0.0            0.0               0.0               0.0      True   
3            0.0            0.0               0.0               0.0      True   
4            0.0            0.0               0.0               0.0      True   

   Month_11  Month_12  Month_2  Month_3  Month_4  Month_5  Month_6  Month_7  \
0     False     False    False    False    False    False    False    False   
1     False     False    False    False    False    False    False    False   
2     False     False    False    False    False    False    False    False   
3     False     False    False    False    False    False    False    False   
4     False     False    False    False    False    False    False    False 

In [8]:
# Step 6: Train ML Models with Engineered Features

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Train XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train, y_train)

# Evaluate models
for name, model in [("Random Forest", rf_model), ("XGBoost", xgb_model)]:
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n{name} Accuracy: {acc:.2f}")
    print(classification_report(y_test, y_pred))

# Save models
joblib.dump(rf_model, "rf_model_imd_features.pkl")
joblib.dump(xgb_model, "xgb_model_imd_features.pkl")

print("\n✅ Models trained with engineered features and saved successfully!")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Random Forest Accuracy: 0.83
              precision    recall  f1-score   support

           0       0.84      0.85      0.85     25502
           1       0.82      0.81      0.81     21447

    accuracy                           0.83     46949
   macro avg       0.83      0.83      0.83     46949
weighted avg       0.83      0.83      0.83     46949


XGBoost Accuracy: 0.84
              precision    recall  f1-score   support

           0       0.85      0.87      0.86     25502
           1       0.84      0.81      0.82     21447

    accuracy                           0.84     46949
   macro avg       0.84      0.84      0.84     46949
weighted avg       0.84      0.84      0.84     46949


✅ Models trained with engineered features and saved successfully!


In [10]:
!pip install streamlit pyngrok


Collecting streamlit
  Downloading streamlit-1.50.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.50.0-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyngrok, pydeck, streamlit
Successfully installed pydeck-0.9.1 pyngrok-7.4.0 streamlit-1.50.0


In [11]:
# app.py

import streamlit as st
import pandas as pd
import numpy as np
import joblib

st.set_page_config(page_title="Rainfall Prediction System", page_icon="🌧️")
st.title("🌦️ Rainfall Prediction using ML (Random Forest & XGBoost)")

st.write("Enter today's rainfall and select the month to predict if it will rain tomorrow.")

# Load models
rf_model = joblib.load("rf_model_imd_features.pkl")
xgb_model = joblib.load("xgb_model_imd_features.pkl")

# Input fields
rain_today = st.number_input("🌧️ Rainfall Today (mm)", 0.0, 100.0, 10.0)
month = st.selectbox("📅 Month", list(range(1,13)))

# Simulate lag features (ask user for simplicity or assume previous rainfall)
rain_lag1 = st.number_input("🌦️ Rainfall Yesterday (lag-1, mm)", 0.0, 100.0, 5.0)
rain_lag2avg = st.number_input("🌦️ Average Rainfall Last 2 Days (mm)", 0.0, 100.0, 5.0)
rain_lag3avg = st.number_input("🌦️ Average Rainfall Last 3 Days (mm)", 0.0, 100.0, 5.0)

# Create month dummies
month_dummies = [0]*11  # drop_first=True used in training
if month != 1:
    month_dummies[month-2] = 1

# Combine features
X_input = [rain_today, rain_lag1, rain_lag2avg, rain_lag3avg] + month_dummies
X_input = np.array(X_input).reshape(1, -1)

# Model selection
model_choice = st.radio("Choose Model", ["Random Forest", "XGBoost"])

if st.button("🔍 Predict"):
    if model_choice == "Random Forest":
        pred = rf_model.predict(X_input)[0]
    else:
        pred = xgb_model.predict(X_input)[0]

    # Simple rainfall estimation
    rainfall_amount = np.random.uniform(0, 100) if pred == 1 else np.random.uniform(0, 10)

    st.subheader(f"🌤️ Prediction Result: {'Rain Tomorrow ☔' if pred==1 else 'No Rain 🌞'}")
    st.write(f"💦 Estimated Rainfall: **{rainfall_amount:.2f} mm**")

    if rainfall_amount > 50:
        st.error("⚠️ Heavy Rain Alert! Please take necessary precautions.")
    elif pred == 1:
        st.warning("🌧️ Light to Moderate Rain Expected.")
    else:
        st.success("🌞 Clear weather likely tomorrow.")


2025-10-05 15:31:21.745 
  command:

    streamlit run /usr/local/lib/python3.12/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-10-05 15:31:28.056 Session state does not function when running a script without `streamlit run`


In [13]:
%%writefile app.py
# Paste the code above here
# app.py

import streamlit as st
import numpy as np
import joblib

# Set page config
st.set_page_config(page_title="Rainfall Prediction System", page_icon="🌧️")
st.title("🌦️ Rainfall Prediction using ML (Random Forest & XGBoost)")

st.write("Enter today's rainfall and lag features to predict if it will rain tomorrow.")

# Load trained models
rf_model = joblib.load("rf_model_imd_features.pkl")
xgb_model = joblib.load("xgb_model_imd_features.pkl")

# Input fields
rain_today = st.number_input("🌧️ Rainfall Today (mm)", 0.0, 100.0, 10.0)
month = st.selectbox("📅 Month", list(range(1, 13)))

rain_lag1 = st.number_input("🌦️ Rainfall Yesterday (lag-1, mm)", 0.0, 100.0, 5.0)
rain_lag2avg = st.number_input("🌦️ Average Rainfall Last 2 Days (mm)", 0.0, 100.0, 5.0)
rain_lag3avg = st.number_input("🌦️ Average Rainfall Last 3 Days (mm)", 0.0, 100.0, 5.0)

# Create month dummy variables (drop_first=True style)
month_dummies = [0] * 11
if month != 1:
    month_dummies[month - 2] = 1

# Combine all features into one input array
X_input = [rain_today, rain_lag1, rain_lag2avg, rain_lag3avg] + month_dummies
X_input = np.array(X_input).reshape(1, -1)

# Model selection
model_choice = st.radio("Choose Model", ["Random Forest", "XGBoost"])

# Prediction button
if st.button("🔍 Predict"):
    if model_choice == "Random Forest":
        pred = rf_model.predict(X_input)[0]
    else:
        pred = xgb_model.predict(X_input)[0]

    # Simple rainfall estimation
    rainfall_amount = np.random.uniform(0, 100) if pred == 1 else np.random.uniform(0, 10)

    st.subheader(f"🌤️ Prediction Result: {'Rain Tomorrow ☔' if pred == 1 else 'No Rain 🌞'}")
    st.write(f"💦 Estimated Rainfall: **{rainfall_amount:.2f} mm**")

    if rainfall_amount > 50:
        st.error("⚠️ Heavy Rain Alert! Please take necessary precautions.")
    elif pred == 1:
        st.warning("🌧️ Light to Moderate Rain Expected.")
    else:
        st.success("🌞 Clear weather likely tomorrow.")



Writing app.py


In [14]:
!pip install streamlit pyngrok





In [16]:
!ngrok authtoken 33eYIyLi1LLeWxthzpGWCTAU2Q1_2HRYj2fJ8tmBB2ru4uSEx

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [18]:
from pyngrok import ngrok
ngrok.kill()


In [19]:
# Open a new HTTP tunnel for port 8501 (Streamlit)
public_url = ngrok.connect(addr=8501, proto="http")
print("🌐 Your Streamlit app is live at:", public_url)


🌐 Your Streamlit app is live at: NgrokTunnel: "https://flexural-overpessimistic-isaac.ngrok-free.dev" -> "http://localhost:8501"


In [None]:
get_ipython().system_raw("streamlit run app.py &")
