# This notebook containts detailed data analysis that can be later used for feature engineering and Machine Learning purposes.



In [1]:
import pandas as pd
import numpy as np

# Downloading csv file from resources and putting it in working directory.
df = pd.read_csv('daily-total-female-births-CA.csv', header=0, parse_dates=[0])

In [2]:
df.head()

Unnamed: 0,date,births
0,1959-01-01,35
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44


In [4]:
df['date'].dtype

dtype('<M8[ns]')

### Loading Data as a series

In [5]:
sr = pd.read_csv('daily-total-female-births-CA.csv', header=0, parse_dates=[0], index_col=0, squeeze=True)

### Querying by time

In [7]:
print(sr['1959-01'])

date
1959-01-01    35
1959-01-02    32
1959-01-03    30
1959-01-04    31
1959-01-05    44
1959-01-06    29
1959-01-07    45
1959-01-08    43
1959-01-09    38
1959-01-10    27
1959-01-11    38
1959-01-12    33
1959-01-13    55
1959-01-14    47
1959-01-15    45
1959-01-16    37
1959-01-17    50
1959-01-18    43
1959-01-19    41
1959-01-20    52
1959-01-21    34
1959-01-22    53
1959-01-23    39
1959-01-24    32
1959-01-25    37
1959-01-26    43
1959-01-27    39
1959-01-28    35
1959-01-29    44
1959-01-30    38
1959-01-31    24
Name: births, dtype: int64


In [8]:
df[(df['date'] > '1959-01-01') & (df['date'] <= '1959-01-21')]  #Alternative to iloc

Unnamed: 0,date,births
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44
5,1959-01-06,29
6,1959-01-07,45
7,1959-01-08,43
8,1959-01-09,38
9,1959-01-10,27
10,1959-01-11,38


### Descriptive Statistics

In [10]:
sr.describe()

count    365.000000
mean      41.980822
std        7.348257
min       23.000000
25%       37.000000
50%       42.000000
75%       46.000000
max       73.000000
Name: births, dtype: float64

In [11]:
df.describe()

Unnamed: 0,births
count,365.0
mean,41.980822
std,7.348257
min,23.0
25%,37.0
50%,42.0
75%,46.0
max,73.0


# Feature Engineering

### Date time features

In [13]:
features = df.copy()
features['year'] = df['date'].dt.year
features['month'] = df['date'].dt.month
features['day'] = df['date'].dt.day

df.head()

Unnamed: 0,date,births
0,1959-01-01,35
1,1959-01-02,32
2,1959-01-03,30
3,1959-01-04,31
4,1959-01-05,44


### Lag features

In [15]:
features['lag1'] =  df['births'].shift(1)

features['lag2'] =  df['births'].shift(365)

### Window features

In [17]:
features['Roll_mean'] = df['births'].rolling(window = 2).mean()

In [18]:
features['Roll_max'] = df['births'].rolling(window = 3).max()

### Expanding features

In [20]:
#Expanding is same as rolling window only there is no window size because expaning is by default to the last date

features['Expand_max'] = df['births'].expanding().max()