<a href="https://colab.research.google.com/github/2303a51295madhuri/python-for-data-science/blob/main/project18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# Optional: XGBoost
try:
    from xgboost import XGBClassifier
    xgb_available = True
except:
    xgb_available = False

# Load CSV files
district_path = "/content/district wise rainfall normal.csv"
hist_path = "/content/rainfall in india 1901-2015.csv"

df_district = pd.read_csv(district_path, encoding='latin1')
df_hist = pd.read_csv(hist_path, encoding='latin1')

# Normalize column names
df_district.columns = [c.strip() for c in df_district.columns]
df_hist.columns = [c.strip() for c in df_hist.columns]

# Extract monthly columns
months = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
existing_months = [m for m in months if m in df_district.columns]

feat = df_district[['STATE_UT_NAME','DISTRICT']].copy()
for m in existing_months:
    feat[f"{m}_norm"] = df_district[m].astype(float)

# Seasonal aggregates if available
for s in ['Jan-Feb','Mar-May','Jun-Sep','Oct-Dec','ANNUAL']:
    if s in df_district.columns:
        feat[s.replace('-','_').lower()] = pd.to_numeric(df_district[s], errors='coerce')

# Statistical features
feat['month_mean_norm'] = feat[[f"{m}_norm" for m in existing_months]].mean(axis=1)
feat['month_std_norm']  = feat[[f"{m}_norm" for m in existing_months]].std(axis=1)
feat['month_max_norm']  = feat[[f"{m}_norm" for m in existing_months]].max(axis=1)
feat['month_min_norm']  = feat[[f"{m}_norm" for m in existing_months]].min(axis=1)
if 'ANNUAL' in df_district.columns:
    feat['annual_norm'] = pd.to_numeric(df_district['ANNUAL'], errors='coerce')
else:
    feat['annual_norm'] = feat[[f"{m}_norm" for m in existing_months]].sum(axis=1)

# ---- Historical trend computation ----
if 'YEAR' in df_hist.columns:
    candidate_cols = [c for c in df_hist.columns if c != 'YEAR']
    district_names = df_district['DISTRICT'].astype(str).str.upper().str.strip().unique()
    matches = [c for c in candidate_cols if str(c).upper().strip() in district_names]
    if len(matches) > 0:
        years = pd.to_numeric(df_hist['YEAR'], errors='coerce').values
        slopes = {}
        for c in matches:
            series = pd.to_numeric(df_hist[c], errors='coerce').values
            mask = ~np.isnan(series) & ~np.isnan(years)
            slopes[str(c).strip()] = np.polyfit(years[mask], series[mask], 1)[0] if mask.sum() >= 10 else np.nan
        hist_df = pd.DataFrame.from_dict(slopes, orient='index', columns=['slope']).reset_index().rename(columns={'index':'DISTRICT'})
    else:
        hist_df = pd.DataFrame({'DISTRICT': df_district['DISTRICT'], 'slope': np.nan})
else:
    hist_df = pd.DataFrame({'DISTRICT': df_district['DISTRICT'], 'slope': np.nan})

hist_df['DISTRICT_upper'] = hist_df['DISTRICT'].astype(str).str.upper().str.strip()
feat['DISTRICT_upper'] = feat['DISTRICT'].astype(str).str.upper().str.strip()

# Merge
merged = feat.merge(hist_df[['DISTRICT_upper','slope']], on='DISTRICT_upper', how='left')

# Create target label
if merged['slope'].notna().sum() >= max(10, 0.25 * len(merged)):
    slope_q1 = merged['slope'].quantile(0.25)
    merged['target'] = np.where(merged['slope'] <= slope_q1, 1, 0)
else:
    std_q3 = merged['month_std_norm'].quantile(0.75)
    merged['target'] = np.where(merged['month_std_norm'] >= std_q3, 1, 0)

# Prepare X, y
drop_cols = ['DISTRICT','STATE_UT_NAME','DISTRICT_upper','target','slope']
X = merged.drop(columns=[c for c in drop_cols if c in merged.columns])
X = X.select_dtypes(include=[np.number])
y = merged['target'].astype(int)

mask = (~X.isna().any(axis=1)) & (~y.isna())
X = X[mask].reset_index(drop=True)
y = y[mask].reset_index(drop=True)

# Train-test split + scaling
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

print("Data ready. Train size:", X_train.shape, "Test size:", X_test.shape)


Data ready. Train size: (519, 22) Test size: (130, 22)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd

log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_s, y_train)
y_pred = log_reg.predict(X_test_s)

print("\n=== Logistic Regression Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# Show districts
log_reg_results = pd.DataFrame({
    'DISTRICT': merged.loc[y_test.index, 'DISTRICT'].values,
    'STATE': merged.loc[y_test.index, 'STATE_UT_NAME'].values,
    'Actual': y_test.values,
    'Predicted': y_pred
})
print("\nSample Predictions (Logistic Regression):")
print(log_reg_results.head(10))



=== Logistic Regression Results ===
Accuracy: 0.9846153846153847
              precision    recall  f1-score   support

           0       0.98      1.00      0.99        97
           1       1.00      0.94      0.97        33

    accuracy                           0.98       130
   macro avg       0.99      0.97      0.98       130
weighted avg       0.98      0.98      0.98       130


Sample Predictions (Logistic Regression):
        DISTRICT                        STATE  Actual  Predicted
0     BONGAIGAON                        ASSAM       1          1
1  KARBI ANGLONG                        ASSAM       0          0
2          YANAM                  PONDICHERRY       0          0
3         KARGIL            JAMMU AND KASHMIR       0          0
4          MAMIT                      MIZORAM       1          1
5       JHALAWAR                    RAJASTHAN       0          0
6         SHIMLA                     HIMACHAL       0          0
7  SOUTH ANDAMAN  ANDAMAN And NICOBAR ISLAND

In [8]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)  # No scaling needed
y_pred = rf.predict(X_test)

print("\n=== Random Forest Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

rf_results = pd.DataFrame({
    'DISTRICT': merged.loc[y_test.index, 'DISTRICT'].values,
    'STATE': merged.loc[y_test.index, 'STATE_UT_NAME'].values,
    'Actual': y_test.values,
    'Predicted': y_pred
})
print("\nSample Predictions (Random Forest):")
print(rf_results.head(10))

# Feature importance
importances = rf.feature_importances_
fi = pd.DataFrame({'feature':X.columns, 'importance':importances}).sort_values('importance', ascending=False)
print("\nTop Features:")
print(fi.head(10))



=== Random Forest Results ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        97
           1       1.00      1.00      1.00        33

    accuracy                           1.00       130
   macro avg       1.00      1.00      1.00       130
weighted avg       1.00      1.00      1.00       130


Sample Predictions (Random Forest):
        DISTRICT                        STATE  Actual  Predicted
0     BONGAIGAON                        ASSAM       1          1
1  KARBI ANGLONG                        ASSAM       0          0
2          YANAM                  PONDICHERRY       0          0
3         KARGIL            JAMMU AND KASHMIR       0          0
4          MAMIT                      MIZORAM       1          1
5       JHALAWAR                    RAJASTHAN       0          0
6         SHIMLA                     HIMACHAL       0          0
7  SOUTH ANDAMAN  ANDAMAN And NICOBAR ISLANDS       1          1
8     

In [9]:
from sklearn.svm import SVC

svm_model = SVC(probability=True, kernel='rbf', random_state=42)
svm_model.fit(X_train_s, y_train)
y_pred = svm_model.predict(X_test_s)

print("\n=== SVM Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

svm_results = pd.DataFrame({
    'DISTRICT': merged.loc[y_test.index, 'DISTRICT'].values,
    'STATE': merged.loc[y_test.index, 'STATE_UT_NAME'].values,
    'Actual': y_test.values,
    'Predicted': y_pred
})
print("\nSample Predictions (SVM):")
print(svm_results.head(10))



=== SVM Results ===
Accuracy: 0.9615384615384616
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        97
           1       1.00      0.85      0.92        33

    accuracy                           0.96       130
   macro avg       0.98      0.92      0.95       130
weighted avg       0.96      0.96      0.96       130


Sample Predictions (SVM):
        DISTRICT                        STATE  Actual  Predicted
0     BONGAIGAON                        ASSAM       1          1
1  KARBI ANGLONG                        ASSAM       0          0
2          YANAM                  PONDICHERRY       0          0
3         KARGIL            JAMMU AND KASHMIR       0          0
4          MAMIT                      MIZORAM       1          1
5       JHALAWAR                    RAJASTHAN       0          0
6         SHIMLA                     HIMACHAL       0          0
7  SOUTH ANDAMAN  ANDAMAN And NICOBAR ISLANDS       1          1
8      LAKH

In [10]:
try:
    from xgboost import XGBClassifier
    xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb_model.fit(X_train, y_train)
    y_pred = xgb_model.predict(X_test)

    print("\n=== XGBoost Results ===")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

    xgb_results = pd.DataFrame({
        'DISTRICT': merged.loc[y_test.index, 'DISTRICT'].values,
        'STATE': merged.loc[y_test.index, 'STATE_UT_NAME'].values,
        'Actual': y_test.values,
        'Predicted': y_pred
    })
    print("\nSample Predictions (XGBoost):")
    print(xgb_results.head(10))
except ImportError:
    print("XGBoost is not installed in this environment.")



=== XGBoost Results ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        97
           1       1.00      1.00      1.00        33

    accuracy                           1.00       130
   macro avg       1.00      1.00      1.00       130
weighted avg       1.00      1.00      1.00       130


Sample Predictions (XGBoost):
        DISTRICT                        STATE  Actual  Predicted
0     BONGAIGAON                        ASSAM       1          1
1  KARBI ANGLONG                        ASSAM       0          0
2          YANAM                  PONDICHERRY       0          0
3         KARGIL            JAMMU AND KASHMIR       0          0
4          MAMIT                      MIZORAM       1          1
5       JHALAWAR                    RAJASTHAN       0          0
6         SHIMLA                     HIMACHAL       0          0
7  SOUTH ANDAMAN  ANDAMAN And NICOBAR ISLANDS       1          1
8      LAKHIMPUR  