### Import Required Libraries

In [217]:
# Libraries to process file and visualization for EDA
import numpy as np
import pandas as pd
import matplotlib

In [None]:
# Library version
print("Numpy Version: ", np.__version__)
print("Pandas Version: ", pd.__version__)
print("Matplotlib Version: ", matplotlib.__version__)

Numpy Version:  2.3.3
Pandas Version:  2.3.2
Matplotlib Version:  3.10.6


### Import Dataset

In [219]:
# Import dataset file and get panda dataframe. Also copy original dataframe to future reference.
dataset = pd.read_csv("Gujarat_Rainfall.csv")
dataset_original = dataset.copy()

### Summary of Dataset
This dataset is a comprehensive collection of historical weather data for `Gujarat` state in India. It includes information on the amount of rainfall recorded each day from the `8th DEC 2015` to the `10th FEB 2025`, allowing for a detailed analysis of precipitation patterns across different districts in Gujarat. This dataset contains 15 columns such as date of record, month, season, station name (exact location), state, district, average temperature, minimum temperature, maximum temperature, wind speed, air pressure, elevation, latitude, longitude, rainfall in mm.

In [220]:
# check total number of rows and columns
dataset.shape

(61988, 15)

In [221]:
# Check first five rows 
dataset.head()

Unnamed: 0,date_of_record,month,season,station_name,state,district,avg_temp,min_temp,max_temp,wind_speed,air_pressure,elevation,latitude,longitude,rainfall
0,2015-12-08,December,Winter,Deesa,GJ,Banaskantha,26.4,14.6,32.7,,,135,24.2,72.2,
1,2015-12-09,December,Winter,Deesa,GJ,Banaskantha,25.6,15.6,30.0,,,135,24.2,72.2,
2,2015-12-10,December,Winter,Deesa,GJ,Banaskantha,25.2,16.6,30.0,,,135,24.2,72.2,
3,2015-12-11,December,Winter,Deesa,GJ,Banaskantha,21.5,13.4,25.6,,,135,24.2,72.2,
4,2015-12-12,December,Winter,Deesa,GJ,Banaskantha,20.8,10.2,26.2,,,135,24.2,72.2,


### Columns Description
- `date_of_record` - The specific date on which the weather observations were recorded (in `YYYY-MM-DD` format).

- `month` - The month extracted from the date, useful for monthly trend analysis.

- `season` - The meteorological season corresponding to the date — typically `Winter (Dec–Feb)`, `Pre-Monsoon (Mar–May)`, `Monsoon (Jun–Sep)`, or `Post-Monsoon (Oct–Nov)` for India.

- `station_name` - The name of the weather station where the data was collected. Each station corresponds to a specific geographical location.

- `state` - The Indian state or union territory where the weather station is located.

- `district` - The district (administrative division) within the state where the weather station operates.

- `avg_temp` - The average daily temperature recorded at the station, measured in degrees Celsius (°C).

- `min_temp` - The minimum temperature observed during the day (°C).

- `max_temp` - The maximum temperature observed during the day (°C).  

- `wind_speed` - The wind speed recorded at the station, measured in kilometers per hour (km/h).

- `air_pressure` - The atmospheric pressure measured at the station, measured in hectopascals (hPa) or millibars (mb).

- `elevation` - The height above sea level of the weather station, measured in meters (m) — affects temperature and pressure readings.

- `latitude` - The geographical latitude (in decimal degrees) of the station, indicating its north–south position on the Earth.

- `longitude` - The geographical longitude (in decimal degrees) of the station, indicating its east–west position on the Earth.

- `rainfall` - The total rainfall measured during the day, measured in millimeters (mm).

In [222]:
# check all available columns
dataset.columns

Index(['date_of_record', 'month', 'season', 'station_name', 'state',
       'district', 'avg_temp', 'min_temp', 'max_temp', 'wind_speed',
       'air_pressure', 'elevation', 'latitude', 'longitude', 'rainfall'],
      dtype='object')

In [223]:
# Check last five rows
dataset.tail()

Unnamed: 0,date_of_record,month,season,station_name,state,district,avg_temp,min_temp,max_temp,wind_speed,air_pressure,elevation,latitude,longitude,rainfall
61983,2025-02-06,February,Winter,Surat / Magdala,GJ,Surat,24.3,17.0,31.0,20.2,1012.3,5,21.1141,72.7418,0.0
61984,2025-02-07,February,Winter,Surat / Magdala,GJ,Surat,25.5,19.0,33.0,14.4,1012.6,5,21.1141,72.7418,0.0
61985,2025-02-08,February,Winter,Surat / Magdala,GJ,Surat,25.3,17.0,33.0,9.8,1012.3,5,21.1141,72.7418,0.0
61986,2025-02-09,February,Winter,Surat / Magdala,GJ,Surat,25.3,17.0,34.0,8.3,1011.8,5,21.1141,72.7418,0.0
61987,2025-02-10,February,Winter,Surat / Magdala,GJ,Surat,24.6,17.0,30.7,14.0,1013.0,5,21.1141,72.7418,0.0


In [224]:
# Check random five rows
dataset.sample(5)

Unnamed: 0,date_of_record,month,season,station_name,state,district,avg_temp,min_temp,max_temp,wind_speed,air_pressure,elevation,latitude,longitude,rainfall
6647,2016-06-09,June,Summer,Bhuj-Rudramata,GJ,Kachchh,31.9,28.6,,,,78,23.25,69.6667,
35206,2022-05-28,May,Summer,Baroda / Vadodara,GJ,Vadodara,32.1,28.1,37.0,19.4,1003.8,34,22.3,73.25,0.0
33668,2022-04-08,April,Summer,Vallabh Vidyanagar,GJ,Anand,34.7,23.9,44.0,9.4,1006.7,41,22.5667,72.9333,0.0
8508,2021-07-15,July,Monsoon,Bhuj-Rudramata,GJ,Kachchh,30.5,26.6,35.0,9.7,999.4,78,23.25,69.6667,4.1
35041,2021-12-13,December,Winter,Baroda / Vadodara,GJ,Vadodara,23.3,17.0,29.4,5.5,1014.3,34,22.3,73.25,0.0


In [225]:
# check all columns datatypes also observe non-null values.
dataset.info()

# -------------------------------------------------------------------------------------------
# Observation :
# 1) There are total 61988 records available, range from 0 to 61987.
# 2) All columns except "date_of_record" are in correct datatype. convert "date_of_record" in datetime datatype. 
# 3) The "state" column we can remove. because, this dataset only for Gujarat state.
# 4) There are missing values in min_temp, max_temp, wind_speed, air_pressure, rainfall.
# -------------------------------------------------------------------------------------------

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61988 entries, 0 to 61987
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   date_of_record  61988 non-null  object 
 1   month           61988 non-null  object 
 2   season          61988 non-null  object 
 3   station_name    61988 non-null  object 
 4   state           61988 non-null  object 
 5   district        61988 non-null  object 
 6   avg_temp        61988 non-null  float64
 7   min_temp        60131 non-null  float64
 8   max_temp        50259 non-null  float64
 9   wind_speed      39895 non-null  float64
 10  air_pressure    39474 non-null  float64
 11  elevation       61988 non-null  int64  
 12  latitude        61988 non-null  float64
 13  longitude       61988 non-null  float64
 14  rainfall        40122 non-null  float64
dtypes: float64(8), int64(1), object(6)
memory usage: 7.1+ MB


### Checking Missing Values

In [226]:
# Get total missing values and it's percentages

total_missing = dataset.isnull().sum()
percentage_missing = np.round((total_missing * 100) / len(dataset), 2)

missing_value_df = pd.DataFrame([total_missing, percentage_missing], index=["Total_Missing", "%_Missing"]).T
missing_value_df

# -------------------------------------------------------------------------------------------
# Observation :
# 1) Here, we have missing values in dataset. columns 'min_temp' and 'max_temp' have 3% and 18.92% missing values resp.
# 2) columns 'wind_speed', 'air_pressure' and 'rainfall' have 35.64% , 36.32% , 35.27% missing values resp.
# 3) Columns with missing values 25% or greater need to drop respective column.
# 4) In imputation if variable is char then we will use mode function and if it is numeric then we will first check for outrliers if there are outrliers then we will impute by median else we will impute using mean.
# -------------------------------------------------------------------------------------------

Unnamed: 0,Total_Missing,%_Missing
date_of_record,0.0,0.0
month,0.0,0.0
season,0.0,0.0
station_name,0.0,0.0
state,0.0,0.0
district,0.0,0.0
avg_temp,0.0,0.0
min_temp,1857.0,3.0
max_temp,11729.0,18.92
wind_speed,22093.0,35.64


In [227]:
dataset[dataset["wind_speed"].isnull()]

Unnamed: 0,date_of_record,month,season,station_name,state,district,avg_temp,min_temp,max_temp,wind_speed,air_pressure,elevation,latitude,longitude,rainfall
0,2015-12-08,December,Winter,Deesa,GJ,Banaskantha,26.4,14.6,32.7,,,135,24.2000,72.2000,
1,2015-12-09,December,Winter,Deesa,GJ,Banaskantha,25.6,15.6,30.0,,,135,24.2000,72.2000,
2,2015-12-10,December,Winter,Deesa,GJ,Banaskantha,25.2,16.6,30.0,,,135,24.2000,72.2000,
3,2015-12-11,December,Winter,Deesa,GJ,Banaskantha,21.5,13.4,25.6,,,135,24.2000,72.2000,
4,2015-12-12,December,Winter,Deesa,GJ,Banaskantha,20.8,10.2,26.2,,,135,24.2000,72.2000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61826,2024-08-30,August,Monsoon,Surat / Magdala,GJ,Surat,28.9,26.6,,,,5,21.1141,72.7418,5.1
61827,2024-08-31,August,Monsoon,Surat / Magdala,GJ,Surat,29.1,26.5,,,,5,21.1141,72.7418,0.3
61828,2024-09-01,September,Monsoon,Surat / Magdala,GJ,Surat,29.5,26.4,,,,5,21.1141,72.7418,0.0
61829,2024-09-03,September,Monsoon,Surat / Magdala,GJ,Surat,26.8,24.8,,,,5,21.1141,72.7418,39.1


### Check Statistical Measure for Numerical Columns

In [None]:
# Mean and Median(50%) difference is more than 10% then there are possible outliers present in data.
dataset.describe().T

# -------------------------------------------------------------------------------------------
# Observation :
# 1) In 'elevation' column, mean and median difference is more than 10% then there are possible outliers present in column.
# -------------------------------------------------------------------------------------------

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
avg_temp,61988.0,27.370825,4.000929,12.2,24.9,28.0,29.9,40.9
min_temp,60131.0,22.313973,5.049992,1.4,18.6,24.0,26.2,31.8
max_temp,50259.0,32.84671,4.183345,18.6,30.0,32.5,35.2,50.3
wind_speed,39895.0,13.069522,6.229127,0.0,8.7,11.9,16.1,65.4
air_pressure,39474.0,1008.846887,5.191823,988.8,1004.9,1009.1,1013.2,1022.6
elevation,61988.0,32.506227,41.338043,0.0,5.0,10.0,50.0,135.0
latitude,61988.0,22.247595,0.937981,20.6167,21.3167,22.3667,23.0667,24.2
longitude,61988.0,71.169081,1.480146,68.85,69.6667,70.9167,72.7167,73.2667
rainfall,40122.0,3.747032,14.931792,0.0,0.0,0.0,0.7,485.9


In [229]:
temp_df = dataset.describe().T

temp_df["mean_median_diff"] = (np.abs(temp_df["mean"] - temp_df["50%"]) * 100) / (temp_df["50%"])

temp_df[(temp_df["mean_median_diff"] != float("inf")) & (temp_df["mean_median_diff"] > 10)]

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,mean_median_diff
elevation,61988.0,32.506227,41.338043,0.0,5.0,10.0,50.0,135.0,225.06227


### Check Duplicate Records

In [230]:
dataset.duplicated().sum()

# -------------------------------------------------------------------------------------------
# Observation :
# 1) There is no duplicated entries in dataset.
# -------------------------------------------------------------------------------------------

np.int64(0)

### Univariate Analysis