In [59]:
import pandas as pd

Loading and Exploring the Dataset

In [60]:
rainfall_data = pd.read_csv('rainfall_area-wt_India_1901-2015.csv')

In [61]:
rainfall_data.head()

Unnamed: 0,REGION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec
0,INDIA,1901,34.7,37.7,18.0,39.3,50.8,113.4,242.2,272.9,124.4,52.7,38.0,8.3,1032.3,72.4,108.1,752.8,99.0
1,INDIA,1902,7.4,4.3,19.0,43.5,48.3,108.8,284.0,199.7,201.5,61.5,27.9,24.4,1030.2,11.7,110.8,794.0,113.8
2,INDIA,1903,17.0,8.3,31.3,17.1,59.5,118.3,297.0,270.4,199.1,117.9,36.9,17.7,1190.5,25.3,107.9,884.8,172.5
3,INDIA,1904,14.4,9.6,31.8,33.1,72.4,164.8,261.0,206.4,129.6,69.0,11.2,16.3,1019.8,24.0,137.4,761.8,96.6
4,INDIA,1905,25.3,20.9,42.7,33.7,55.7,93.3,252.8,200.8,178.4,51.4,9.7,10.5,975.3,46.2,132.2,725.4,71.6


Analyzing Annual Rainfall Trends Over Time

In [62]:
import plotly.graph_objects as go
import plotly.express as px

# analyze trends in annual rainfall over time
annual_rainfall = rainfall_data[['YEAR', 'ANNUAL']]

fig_annual = go.Figure()
fig_annual.add_trace(go.Scatter(
    x=annual_rainfall['YEAR'],
    y=annual_rainfall['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line=dict(color='blue', width=2),
    opacity=0.7
))

fig_annual.add_trace(go.Scatter(
    x=annual_rainfall['YEAR'],
    y=[annual_rainfall['ANNUAL'].mean()] * len(annual_rainfall),
    mode='lines',
    name='Mean Rainfall',
    line=dict(color='red', dash='dash')
))

fig_annual.update_layout(
    title='Trend in Annual Rainfall in India (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    legend=dict(title='Legend'),
    height=500
)
fig_annual

In [63]:
# identify months with the highest and lowest rainfall on average
monthly_columns = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP',
                    'OCT', 'NOV', 'DEC']
monthly_avg = rainfall_data[monthly_columns].mean()

highest_rainfall_month = monthly_avg.idxmax()
lowest_raingall_month  = monthly_avg.idxmin()
monthly_avg

JAN     19.759130
FEB     23.434783
MAR     28.254783
APR     38.241739
MAY     62.193913
JUN    168.360000
JUL    291.022609
AUG    258.400870
SEP    172.473043
OCT     75.701739
NOV     29.205217
DEC     14.980000
dtype: float64

In [64]:
fig_monthly = px.bar(
    x=monthly_avg.index,
    y=monthly_avg.values,
    labels={'x': 'Month', 'y': 'Rainfall (mm)'},
    title='Average Monthly Rainfall in India (1901-2015)',
    text=monthly_avg.values
)
fig_monthly.add_hline(
    y=monthly_avg.mean(),
    line_dash='dash',
    line_color='red',
    annotation_text='Mean Rainfall',
    annotation_position='top right'
)
fig_monthly.update_traces(marker_color='skyblue', marker_line_color='black', marker_line_width=2)
fig_monthly.update_layout(template='plotly_white', height=500)
fig_monthly

In [65]:
# Seasonal rainfall distribution
seasonal_columns = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
seasonal_avg = rainfall_data[seasonal_columns].mean()
seasonal_avg

Jan-Feb     43.189565
Mar-May    128.694783
Jun-Sep    890.260870
Oct-Dec    119.882609
dtype: float64

In [66]:
fig_seasonal = px.bar(
    x=seasonal_avg.index,
    y=seasonal_avg.values,
    labels={'x': 'Season', 'y': 'Rainfall (mm)'},
    title='Seasonal Rainfall Distribution in India (1901-2015)',
    text=seasonal_avg.values,
    color=seasonal_avg.values,
    color_continuous_scale=['gold', 'skyblue', 'green', 'orange']
)
fig_seasonal.add_hline(
    y=seasonal_avg.mean(),
    line_dash='dash',
    line_color='red',
    annotation_text = 'Mean Seasonal Rainfall',
    annotation_position = 'top left'
)

fig_seasonal.update_traces(marker_line_color='black', marker_line_width=1)
fig_seasonal.update_layout(
    template='plotly_white',
    height=500,
    coloraxis_colorbar=dict(title='mm')
)
fig_seasonal

Assessing the Impact of Climate Change in the Rainfall Trends in India

In [67]:
# Calculating rolling averages to assess climate change impact
rainfall_data['10-Year Rolling Avg'] = rainfall_data['ANNUAL'].rolling(window=10).mean()
rainfall_data['10-Year Rolling Avg']

0          NaN
1          NaN
2          NaN
3          NaN
4          NaN
        ...   
110    1121.61
111    1136.88
112    1141.05
113    1137.29
114    1123.36
Name: 10-Year Rolling Avg, Length: 115, dtype: float64

In [68]:
fig_climate_change = go.Figure()

fig_climate_change.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line=dict(color='blue', width=3),
    opacity=0.6
))

fig_climate_change.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['10-Year Rolling Avg'],
    mode='lines',
    name='10-Year Rolling Avg',
    line=dict(color='red', width=3)
))

fig_climate_change.update_layout(
    title='Impact of Climate Change on Rainfall Patterns (1901-2015)',
    xaxis_title = 'Year',
    yaxis_title = 'Rainfall (mm)',
    template='plotly_white',
    legend=dict(title='Legend'),
    height=500
)

fig_climate_change

Using statistical thresholds, to identify years with extreme or deficient rainfall(1.5 standard deviations below or above the mean).

In [69]:
from scipy.stats import pearsonr

# Identifyig drought and extreme rainfall years
mean_rainfall = rainfall_data['ANNUAL'].mean()
std_dev_rainfall = rainfall_data['ANNUAL'].std()

drought_years = rainfall_data[rainfall_data['ANNUAL'] < (mean_rainfall - 1.5 * std_dev_rainfall)]
extreme_rainfall_years = rainfall_data[rainfall_data['ANNUAL'] > (mean_rainfall + 1.5 * std_dev_rainfall)]

# correlating seasonal rainfall with annual rainfall totals
seasonal_columns = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
seasonal_correlations = {
    season: pearsonr(rainfall_data[season], rainfall_data['ANNUAL'])[0] for season in seasonal_columns
}

# displaying results for drought/extreme years and correlations
drought_years_summary = drought_years[['YEAR', 'ANNUAL']].reset_index(drop=True)
extreme_rainfall_years_summary = extreme_rainfall_years[['YEAR', 'ANNUAL']].reset_index(drop=True)
seasonal_correlations_summary = pd.DataFrame.from_dict(seasonal_correlations, orient='index', columns=['Correlation'])

drought_years_summary

Unnamed: 0,YEAR,ANNUAL
0,1905,975.3
1,1965,938.4
2,1972,948.5
3,2002,920.8
4,2009,959.3


In [70]:
extreme_rainfall_years_summary

Unnamed: 0,YEAR,ANNUAL
0,1917,1480.3
1,1933,1393.5
2,1956,1386.2
3,1959,1382.1
4,1961,1403.0
5,1988,1351.0
6,1990,1400.6


In [71]:
seasonal_correlations_summary

Unnamed: 0,Correlation
Jan-Feb,0.228913
Mar-May,0.313057
Jun-Sep,0.930027
Oct-Dec,0.531648


Detecting Anomalies in the Rainfall Trends in India using Isolation Forest

In [72]:
from sklearn.ensemble import IsolationForest

In [73]:
# Detect anomalous rainfall years based on annual data
isolation_forest = IsolationForest(contamination=0.05, random_state=42)
rainfall_data['Annual_Anomaly'] = isolation_forest.fit_predict(rainfall_data[['ANNUAL']])
rainfall_data['Annual_Anomaly']

0      1
1      1
2      1
3      1
4     -1
      ..
110    1
111    1
112    1
113    1
114    1
Name: Annual_Anomaly, Length: 115, dtype: int32

In [74]:
# Identify anomalies in annual rainfall
annual_anomalies = rainfall_data[rainfall_data['Annual_Anomaly'] == -1]
annual_anomalies

Unnamed: 0,REGION,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,...,OCT,NOV,DEC,ANNUAL,Jan-Feb,Mar-May,Jun-Sep,Oct-Dec,10-Year Rolling Avg,Annual_Anomaly
4,INDIA,1905,25.3,20.9,42.7,33.7,55.7,93.3,252.8,200.8,...,51.4,9.7,10.5,975.3,46.2,132.2,725.4,71.6,,-1
16,INDIA,1917,8.7,38.7,22.8,43.2,75.0,231.8,285.2,296.5,...,158.8,28.2,10.3,1480.3,47.3,141.1,1094.5,197.3,1181.84,-1
64,INDIA,1965,10.9,26.0,26.4,43.6,51.2,115.8,269.2,192.3,...,33.5,17.4,21.1,938.4,36.9,121.2,708.4,72.0,1240.96,-1
71,INDIA,1972,9.7,27.1,21.0,36.9,55.6,123.0,205.5,221.8,...,66.0,30.3,22.3,948.5,36.8,113.6,679.5,118.6,1121.14,-1
101,INDIA,2002,16.8,21.0,22.9,38.9,57.7,170.1,138.9,246.2,...,54.4,14.7,5.2,920.8,37.8,119.5,689.2,74.3,1152.43,-1
108,INDIA,2009,11.8,13.2,15.2,26.0,56.6,86.5,283.7,191.9,...,70.1,53.2,10.8,959.3,25.0,97.8,702.4,134.1,1103.92,-1


In [75]:
# Detect anomalous months based on monthly data
monthly_data = rainfall_data[['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
                              'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']]
month_anomalies = isolation_forest.fit_predict(monthly_data)
month_anomalies

array([ 1,  1,  1,  1,  1,  1, -1,  1,  1,  1, -1,  1,  1,  1,  1,  1, -1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [76]:
# Add anomaly detection results for months
rainfall_data['Monthly_Anomaly'] = month_anomalies
month_anomalies_df = rainfall_data[rainfall_data['Monthly_Anomaly'] == -1][['YEAR'] + monthly_columns]
month_anomalies_df

Unnamed: 0,YEAR,JAN,FEB,MAR,APR,MAY,JUN,JUL,AUG,SEP,OCT,NOV,DEC
6,1907,16.2,46.0,37.8,62.8,32.6,154.4,225.4,310.4,96.9,22.7,22.5,12.1
10,1911,45.7,5.6,49.9,22.8,47.6,191.9,162.7,213.5,182.3,70.6,42.8,12.0
16,1917,8.7,38.7,22.8,43.2,75.0,231.8,285.2,296.5,281.0,158.8,28.2,10.3
17,1918,12.2,4.4,41.6,38.8,102.8,212.6,183.8,242.7,109.7,20.0,41.1,16.4
66,1967,11.2,13.4,63.3,29.1,42.4,144.9,304.6,262.9,170.4,40.3,11.4,54.4
89,1990,14.9,44.3,53.3,42.0,114.5,194.0,286.7,293.2,196.6,103.2,29.5,28.4


In [77]:
fig_annual_anomalies = go.Figure()

fig_annual_anomalies.add_trace(go.Scatter(
    x=rainfall_data['YEAR'],
    y=rainfall_data['ANNUAL'],
    mode='lines',
    name='Annual Rainfall',
    line=dict(color='blue', width=2),
    opacity=0.6
))

fig_annual_anomalies.add_trace(go.Scatter(
    x=annual_anomalies['YEAR'],
    y=annual_anomalies['ANNUAL'],
    mode='markers',
    name='Anomalous Years',
    marker=dict(color='red', size=8, symbol='circle')
))

fig_annual_anomalies.add_hline(
    y=rainfall_data['ANNUAL'].mean(),
    line_dash='dash',
    line_color='green',
    annotation_text='Mean Rainfall',
    annotation_position='top left'
)

fig_annual_anomalies.update_layout(
    title='Annual Rainfall Anomalies in India (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Raindall (mm)',
    template='plotly_white',
    legend=dict(title='Legend'),
    height=500
)

fig_annual_anomalies

Identifying anomalies in monthly rainfall

In [78]:
# Preparing data for monthly anomalies
month_anomalies = []
for column in monthly_columns:
    for _, row in month_anomalies_df.iterrows():
        month_anomalies.append({'Year': row['YEAR'], 'Month': column, 'Rainfall': row[column]})

month_anomalies_df_long = pd.DataFrame(month_anomalies)
month_anomalies_df_long

Unnamed: 0,Year,Month,Rainfall
0,1907.0,JAN,16.2
1,1911.0,JAN,45.7
2,1917.0,JAN,8.7
3,1918.0,JAN,12.2
4,1967.0,JAN,11.2
...,...,...,...
67,1911.0,DEC,12.0
68,1917.0,DEC,10.3
69,1918.0,DEC,16.4
70,1967.0,DEC,54.4


In [79]:
fig_monthly_anomalies = px.line(
    rainfall_data,
    x='YEAR',
    y=monthly_columns,
    labels={'YEAR': 'Year', 'value': 'Rainfall (mm)', 'variable': 'Month'},
    title='Monthly Rainfall Anomalies in India (1901-2015)',
    color_discrete_sequence=px.colors.qualitative.Set3
)

fig_monthly_anomalies.add_trace(go.Scatter(
    x=month_anomalies_df_long['Year'],
    y=month_anomalies_df_long['Rainfall'],
    mode='markers',
    name='Anomalous Months',
    marker=dict(color='red', size=5, symbol='circle')
))

fig_monthly_anomalies.update_layout(
    template='plotly_white',
    legend=dict(title='Legent'),
    height=500
)

fig_monthly_anomalies

Correlating Seasonal Rainfall with Annual Totals

In [80]:
# Correlation analysis between monsoon (Jun-Sep) rainfall and other seasons
monsoon_column = 'Jun-Sep'
relationships = {}

for season in seasonal_columns:
    if season != monsoon_column:
        corr, _ = pearsonr(rainfall_data[monsoon_column], rainfall_data[season])
        relationships[season] = corr

correlation_data = pd.DataFrame({
    'Season': list(relationships.keys()),
    'Correlation Coefficient': list(relationships.values())
})
correlation_data

Unnamed: 0,Season,Correlation Coefficient
0,Jan-Feb,0.142731
1,Mar-May,0.10434
2,Oct-Dec,0.28652


In [81]:
fig = px.bar(
    correlation_data,
    x='Season',
    y='Correlation Coefficient',
    title='Correlation Betwee Monsoon (Jun-Sep) Rainfall and Other Seasons',
    labels={'Season': 'Season', 'Correlation Coefficient': 'Correlation Coefficient'},
    text='Correlation Coefficient',
    color='Correlation Coefficient',
    color_continuous_scale='Blues'
)

fig.add_hline(
    y=0,
    line_dash='dash',
    line_color='red',
    annotation_text ='No Correlation',
    annotation_position='bottom left'
)

fig.update_traces(marker_line_color='black', marker_line_width=1, texttemplate='%{text:.2f}')
fig.update_layout(
    template='plotly_white',
    height=500
)
fig

Grouping Years Based on Rainfall Patterns: Now , by apllying k-means clustering, we will group years into three categories: Dry, Normal and Wet, based on rainfall patterns

In [82]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [83]:
# Preparing data for clustering
rainfall_features = rainfall_data[['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec', 'ANNUAL']]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rainfall_features)
scaled_features

array([[ 2.02663664e+00, -9.03463299e-01, -1.54814886e+00,
        -6.46007264e-01, -1.35870651e+00],
       [-2.18476401e+00, -7.85018207e-01, -1.08413511e+00,
        -1.88166596e-01, -1.37776209e+00],
       [-1.24118825e+00, -9.12237009e-01, -6.15028772e-02,
         1.62772848e+00,  7.68141228e-02],
       [-1.33138299e+00,  3.81885285e-01, -1.44678663e+00,
        -7.20251697e-01, -1.47213260e+00],
       [ 2.08865684e-01,  1.53768813e-01, -1.85674052e+00,
        -1.49363120e+00, -1.87592949e+00],
       [ 1.99888441e+00, -1.93876113e+00,  6.80155226e-02,
        -8.84208152e-01, -3.44223534e-01],
       [ 1.31201676e+00,  1.93250510e-01, -1.16184615e+00,
        -1.93600428e+00, -1.29155826e+00],
       [-2.49046084e-01, -1.81154233e+00,  3.00022395e-01,
        -2.07830611e+00, -7.75242688e-01],
       [-1.31099113e-01, -2.80529856e-01,  2.92138667e-01,
        -1.28636550e+00, -2.17186310e-01],
       [-1.14405545e+00, -1.93876113e+00,  5.63564183e-01,
         8.94564713e-01

In [84]:
# Perform k-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)
rainfall_data['Rainfall_Cluster'] = kmeans.fit_predict(scaled_features)
rainfall_data['Rainfall_Cluster']

0      1
1      1
2      2
3      1
4      1
      ..
110    1
111    1
112    2
113    1
114    1
Name: Rainfall_Cluster, Length: 115, dtype: int32

In [91]:
# Map cluster labels to categories(Dry, Wet, Normal)
cluster_labels = {0: 'Dry', 1: 'Normal', 2: 'Wet'}
rainfall_data['Rainfall_Category'] = rainfall_data['Rainfall_Cluster'].map(cluster_labels)
rainfall_data['Rainfall_Category']


0      Normal
1      Normal
2         Wet
3      Normal
4      Normal
        ...  
110    Normal
111    Normal
112       Wet
113    Normal
114    Normal
Name: Rainfall_Category, Length: 115, dtype: object

In [92]:
fig = px.scatter(
    rainfall_data,
    x='YEAR',
    y='ANNUAL',
    color='Rainfall_Category',
    title='Clustering of Years Based on Rainfall Patterns',
    labels={'YEAR':'Year','ANNUAL': 'Annual Rainfall (mm)', 'Rainfall_Category': 'Rainfall Category'},
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_data={'Rainfall_Cluster': True, 'Rainfall_Category': True}
)

fig.update_layout(
    template='plotly_white',
    legend_title='Rainfall Category',
    height=500
)
fig

Forecasting Future Rainfall

In [93]:
rainfall_data['DATE'] = pd.to_datetime(rainfall_data['YEAR'], format='%Y')
annual_rainfall_ts = rainfall_data.set_index('DATE')['ANNUAL']

In [94]:
from prophet import Prophet

In [96]:
# Prepare data for Prophet
prophet_data = annual_rainfall_ts.reset_index()
prophet_data.columns = ['ds', 'y']
prophet_data

Unnamed: 0,ds,y
0,1901-01-01,1032.3
1,1902-01-01,1030.2
2,1903-01-01,1190.5
3,1904-01-01,1019.8
4,1905-01-01,975.3
...,...,...
110,2011-01-01,1110.1
111,2012-01-01,1073.5
112,2013-01-01,1216.2
113,2014-01-01,1033.7


In [97]:
from prophet.plot import plot_plotly, plot_components_plotly

prophet_model = Prophet()
prophet_model.fit(prophet_data)

15:47:23 - cmdstanpy - INFO - Chain [1] start processing
15:47:23 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x290c4fe1e50>

In [100]:
# Create a future dataframe for the next 20 years
future = prophet_model.make_future_dataframe(periods=20, freq='YE')
forecast = prophet_model.predict(future)
future, forecast

(            ds
 0   1901-01-01
 1   1902-01-01
 2   1903-01-01
 3   1904-01-01
 4   1905-01-01
 ..         ...
 130 2030-12-31
 131 2031-12-31
 132 2032-12-31
 133 2033-12-31
 134 2034-12-31
 
 [135 rows x 1 columns],
             ds        trend   yhat_lower   yhat_upper  trend_lower  \
 0   1901-01-01  1040.639058  1016.564084  1278.672110  1040.639058   
 1   1902-01-01  1042.630915  1012.660933  1287.024153  1042.630915   
 2   1903-01-01  1044.622773  1016.439273  1286.235587  1044.622773   
 3   1904-01-01  1046.614630  1013.483853  1270.333157  1046.614630   
 4   1905-01-01  1048.611945  1024.677892  1293.398462  1048.611945   
 ..         ...          ...          ...          ...          ...   
 130 2030-12-31  1007.262574   964.011802  1227.505317  1003.840655   
 131 2031-12-31  1005.928207   955.121219  1229.687148  1002.126711   
 132 2032-12-31  1004.590185   969.007124  1229.712634  1000.398471   
 133 2033-12-31  1003.255819   961.646932  1235.703181   998.635704   


In [102]:
fig_forecast = plot_plotly(prophet_model, forecast)

fig_forecast.update_layout(
    title='Annual Rainfall Forecast Using Prophet',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    height=500
)

fig_forecast

The blue line represents the model’s forecast trend, while the shaded area indicates the confidence interval. The trend reveals a slight decline in annual rainfall over time, with notable year-to-year variability (black dots representing actual data points). The model captures the variability well but highlights that future rainfall may continue to slightly decrease, which emphasizes the need for adaptive strategies to manage potential water resource challenges.

Conclusion:
The analysis of India’s rainfall trends and patterns from 1901 to 2015 reveals significant variability in annual and seasonal rainfall, with the monsoon season (June-September) being the dominant contributor. Anomalous years of extreme drought and wetness highlight the unpredictability of rainfall, while clustering shows a shift towards more dry years in recent decades. Correlations indicate the limited dependency of non-monsoon seasons on monsoon rainfall. A time-series forecast using Prophet suggests a slight declining trend in annual rainfall, which emphasises the need for long-term water resource planning and adaptation to changing climate patterns.