# Time Series

## Exploring and Understanding

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

import plotly
import plotly.express as px
import plotly.graph_objects as go

from statsmodels.tsa.seasonal import seasonal_decompose
import plotly.subplots as sp
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler



In [73]:
# Load dataset

df = pd.read_csv("DailyDelhiClimateTrain.csv")
df.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,10.0,84.5,0.0,1015.666667
1,2013-01-02,7.4,92.0,2.98,1017.8
2,2013-01-03,7.166667,87.0,4.633333,1018.666667
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667
4,2013-01-05,6.0,86.833333,3.7,1016.5


In [74]:
# Initial descriptive statistics
df.describe()

Unnamed: 0,meantemp,humidity,wind_speed,meanpressure
count,1462.0,1462.0,1462.0,1462.0
mean,25.495521,60.771702,6.802209,1011.104548
std,7.348103,16.769652,4.561602,180.231668
min,6.0,13.428571,0.0,-3.041667
25%,18.857143,50.375,3.475,1001.580357
50%,27.714286,62.625,6.221667,1008.563492
75%,31.305804,72.21875,9.238235,1014.944901
max,38.714286,100.0,42.22,7679.333333


In [75]:
# Check for missing values
missing_values = df.isnull().sum()
print("Missing values:", missing_values)

Missing values: date            0
meantemp        0
humidity        0
wind_speed      0
meanpressure    0
dtype: int64


In [76]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   date          1462 non-null   object 
 1   meantemp      1462 non-null   float64
 2   humidity      1462 non-null   float64
 3   wind_speed    1462 non-null   float64
 4   meanpressure  1462 non-null   float64
dtypes: float64(4), object(1)
memory usage: 57.2+ KB


In [77]:
# Ensure 'time' column is parsed as datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Display the first few rows to confirm changes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1462 entries, 0 to 1461
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   date          1462 non-null   datetime64[ns]
 1   meantemp      1462 non-null   float64       
 2   humidity      1462 non-null   float64       
 3   wind_speed    1462 non-null   float64       
 4   meanpressure  1462 non-null   float64       
dtypes: datetime64[ns](1), float64(4)
memory usage: 57.2 KB


In [78]:
# Perform seasonal decomposition
result = seasonal_decompose(df['meantemp'], model='additive', period=365)
# Plot the decomposed components
fig = sp.make_subplots(rows=4, cols=1, shared_xaxes=True, 
                       subplot_titles=['Observed', 'Trend', 'Seasonal', 'Residual'])
fig.add_trace(go.Scatter(x=df["date"], y=result.observed, mode='lines', name='Observed'), row=1, col=1)
fig.add_trace(go.Scatter(x=df["date"], y=result.trend, mode='lines', name='Trend'), row=2, col=1)
fig.add_trace(go.Scatter(x=df["date"], y=result.seasonal, mode='lines', name='Seasonal'), row=3, col=1)
fig.add_trace(go.Scatter(x=df["date"], y=result.resid, mode='lines', name='Residual'), row=4, col=1)
fig.show()

In [79]:
# Standardize the dataset
scaler = StandardScaler()
df_scaled = df.copy()
df_scaled[['meantemp']] = scaler.fit_transform(df[['meantemp']])

# Display the first few rows of the scaled dataset
df_scaled.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure
0,2013-01-01,-2.1095,84.5,0.0,1015.666667
1,2013-01-02,-2.463454,92.0,2.98,1017.8
2,2013-01-03,-2.495219,87.0,4.633333,1018.666667
3,2013-01-04,-2.291015,71.333333,1.233333,1017.166667
4,2013-01-05,-2.654044,86.833333,3.7,1016.5


# Feature Engineering

### 1. Date-Time Features
We extract the Year, Month, and Day from the 'date' column.

In [80]:
# Extract Year, Month, Day
df['Year'] = df['date'].dt.year
df['Month'] = df['date'].dt.month
df['Day'] = df['date'].dt.day

In [81]:
df.head()

Unnamed: 0,date,meantemp,humidity,wind_speed,meanpressure,Year,Month,Day
0,2013-01-01,10.0,84.5,0.0,1015.666667,2013,1,1
1,2013-01-02,7.4,92.0,2.98,1017.8,2013,1,2
2,2013-01-03,7.166667,87.0,4.633333,1018.666667,2013,1,3
3,2013-01-04,8.666667,71.333333,1.233333,1017.166667,2013,1,4
4,2013-01-05,6.0,86.833333,3.7,1016.5,2013,1,5


### 2. Lag Features
We create lag features for the same day last week, last month, and last year.

In [88]:
# Lag features
df['Lag_Week'] = df['meantemp'].shift(7)
df['Lag_Month'] = df['meantemp'].shift(30)
df['Lag_Year'] = df['meantemp'].shift(365)

df[['date', 'meantemp', 'Lag_Week', 'Lag_Month', 'Lag_Year']]
#df

Unnamed: 0,date,meantemp,Lag_Week,Lag_Month,Lag_Year
0,2013-01-01,10.000000,,,
1,2013-01-02,7.400000,,,
2,2013-01-03,7.166667,,,
3,2013-01-04,8.666667,,,
4,2013-01-05,6.000000,,,
...,...,...,...,...,...
1457,2016-12-28,17.217391,18.050000,22.454545,16.375000
1458,2016-12-29,15.238095,17.285714,21.611111,15.500000
1459,2016-12-30,14.095238,15.550000,19.869565,15.000000
1460,2016-12-31,15.052632,17.318182,19.750000,14.714286


### 3. Window Features
We create a 2-month rolling average feature.

In [90]:
# Rolling window feature
df['Rolling_Mean_2M'] = df['meantemp'].rolling(window=60).mean()

# Display the results
df[['date', 'meantemp', 'Rolling_Mean_2M']]

Unnamed: 0,date,meantemp,Rolling_Mean_2M
0,2013-01-01,10.000000,
1,2013-01-02,7.400000,
2,2013-01-03,7.166667,
3,2013-01-04,8.666667,
4,2013-01-05,6.000000,
...,...,...,...
1457,2016-12-28,17.217391,20.692491
1458,2016-12-29,15.238095,20.532691
1459,2016-12-30,14.095238,20.358637
1460,2016-12-31,15.052632,20.203104


### 4. Expanding Feature
We create an expanding feature that shows the maximum value till date.

In [103]:
df.index = pd.to_datetime(df['date'])
# Expanding feature
df['Expanding_Max'] = df['meantemp'].expanding().max()

# Display the results
df[['date', 'meantemp', 'Expanding_Max']]

Unnamed: 0_level_0,date,meantemp,Expanding_Max
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,2013-01-01,10.000000,10.000000
2013-01-02,2013-01-02,7.400000,10.000000
2013-01-03,2013-01-03,7.166667,10.000000
2013-01-04,2013-01-04,8.666667,10.000000
2013-01-05,2013-01-05,6.000000,10.000000
...,...,...,...
2016-12-28,2016-12-28,17.217391,38.714286
2016-12-29,2016-12-29,15.238095,38.714286
2016-12-30,2016-12-30,14.095238,38.714286
2016-12-31,2016-12-31,15.052632,38.714286


## Quarterly and Yearly Data Extraction
We create an additional column "Q" to show the quarterly data and another for yearly data.

In [104]:
# Quarterly data
df['Q'] = df['date'].resample('Q').mean()

df[['date', 'meantemp', 'Q']]

Unnamed: 0_level_0,date,meantemp,Q
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,2013-01-01,10.000000,NaT
2013-01-02,2013-01-02,7.400000,NaT
2013-01-03,2013-01-03,7.166667,NaT
2013-01-04,2013-01-04,8.666667,NaT
2013-01-05,2013-01-05,6.000000,NaT
...,...,...,...
2016-12-28,2016-12-28,17.217391,NaT
2016-12-29,2016-12-29,15.238095,NaT
2016-12-30,2016-12-30,14.095238,NaT
2016-12-31,2016-12-31,15.052632,2016-11-15 12:00:00


In [105]:
# Yearly data
df['Y'] = df['meantemp'].resample('Y').mean()

df[['date', 'meantemp', 'Y']]

Unnamed: 0_level_0,date,meantemp,Y
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-01-01,2013-01-01,10.000000,
2013-01-02,2013-01-02,7.400000,
2013-01-03,2013-01-03,7.166667,
2013-01-04,2013-01-04,8.666667,
2013-01-05,2013-01-05,6.000000,
...,...,...,...
2016-12-28,2016-12-28,17.217391,
2016-12-29,2016-12-29,15.238095,
2016-12-30,2016-12-30,14.095238,
2016-12-31,2016-12-31,15.052632,27.103373
