# 02 - Feature Engineering

This notebook creates features for time series modeling.

**Input**: `data/processed/cleaned_revenue_data.csv`  
**Output**: `data/processed/features_revenue_data.csv`

## Features Created:
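- **Temporal**: `month_sin`, `month_cos` (cyclical encoding), `quarter`
- **Lag features**: `revenue_lag_1`, `revenue_lag_2`, `revenue_lag_3`, `revenue_lag_12`, `patients_lag_1`
- **Rolling stats**: `revenue_ma_3`, `revenue_ma_6`, `revenue_ma_12`, `revenue_std_3`, `patients_ma_3`
- **Business metrics**: `revenue_per_patient`, `insurance_ratio`, `total_insurance`, `total_cash`, plus per-department revenue and share columns (`gp_*`, `im_*`, `dental_*`)
- **Growth rates**: `revenue_yoy_change`, `patients_yoy_change`, `revenue_mom_change`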

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Notebook display settings: never truncate columns, and allow wide rows
pd.options.display.max_columns = None
pd.options.display.width = 200

## 1. Load Cleaned Data

In [2]:
# Read the cleaned revenue table; `date` is parsed to datetime on load
df = pd.read_csv(
    '../data/processed/cleaned_revenue_data.csv',
    parse_dates=['date'],
)

# Quick sanity check: row count and the period covered
first_date, last_date = df['date'].min(), df['date'].max()
print(f"Loaded {df.shape[0]} records")
print(f"Date range: {first_date} to {last_date}")
df.head()

Loaded 34 records
Date range: 2021-01-01 00:00:00 to 2023-10-01 00:00:00


Unnamed: 0,date,year,month,gp_patients,gp_insurance,gp_cash,im_patients,im_insurance,im_cash,dental_patients,dental_insurance,dental_cash,total_patients,total_revenue,pcr_patients,pcr_cash,final_revenue
0,2021-01-01,2021,1,507,54643.98,6919.0,0,0.0,0,74,11032.0,6700,581,79294.98,21,647.0,79941.98
1,2021-02-01,2021,2,554,51363.14,8107.0,0,0.0,0,57,33527.36,13343,611,106340.5,87,2031.0,108371.5
2,2021-03-01,2021,3,470,38923.15,8079.0,0,0.0,0,40,14244.44,11590,510,72836.59,253,5295.0,78131.59
3,2021-04-01,2021,4,357,33027.72,6228.0,38,5411.85,470,98,1302.0,7805,493,54244.57,779,17358.0,71602.57
4,2021-05-01,2021,5,436,48903.4,8314.0,96,14285.45,423,122,5421.92,9150,654,86497.77,423,11640.0,98137.77


## 2. Temporal Features

Cyclical encoding for months using sine/cosine to capture seasonality patterns.

In [3]:
# Place the month number on a circle with period 12 so the encoding is
# continuous across the December -> January boundary
month_angle = 2 * np.pi * df['month'] / 12
df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# Calendar quarter, derived from the parsed date column
df['quarter'] = df['date'].dt.quarter

print("Temporal features added:")
temporal_cols = ['date', 'month', 'month_sin', 'month_cos', 'quarter']
df[temporal_cols].head(12)

Temporal features added:


Unnamed: 0,date,month,month_sin,month_cos,quarter
0,2021-01-01,1,0.5,0.8660254,1
1,2021-02-01,2,0.8660254,0.5,1
2,2021-03-01,3,1.0,6.123234000000001e-17,1
3,2021-04-01,4,0.8660254,-0.5,2
4,2021-05-01,5,0.5,-0.8660254,2
5,2021-06-01,6,1.224647e-16,-1.0,2
6,2021-07-01,7,-0.5,-0.8660254,3
7,2021-08-01,8,-0.8660254,-0.5,3
8,2021-09-01,9,-1.0,-1.83697e-16,3
9,2021-10-01,10,-0.8660254,0.5,4


## 3. Lag Features

Create lag features to capture autocorrelation patterns.

In [4]:
# Sort by date so positional shifts follow calendar order
df = df.sort_values('date').reset_index(drop=True)

# shift(k) moves values by k ROWS, not k months. It only means "k months ago"
# when there is exactly one row per consecutive month, so fail loudly on a gap
# or duplicate month instead of silently misaligning every lag below.
month_number = df['date'].dt.year * 12 + df['date'].dt.month
assert (month_number.diff().iloc[1:] == 1).all(), (
    "Lag features need one row per consecutive month; "
    "found a gap, duplicate, or missing date"
)

# Lag features for final_revenue: previous 1-3 months and same month last year.
# The first `lag` rows of each column are NaN because no earlier row exists.
for lag in (1, 2, 3, 12):
    df[f'revenue_lag_{lag}'] = df['final_revenue'].shift(lag)

# Patient lag (previous month)
df['patients_lag_1'] = df['total_patients'].shift(1)

print("Lag features:")
df[['date', 'final_revenue', 'revenue_lag_1', 'revenue_lag_12']].head(15)

Lag features:


Unnamed: 0,date,final_revenue,revenue_lag_1,revenue_lag_12
0,2021-01-01,79941.98,,
1,2021-02-01,108371.5,79941.98,
2,2021-03-01,78131.59,108371.5,
3,2021-04-01,71602.57,78131.59,
4,2021-05-01,98137.77,71602.57,
5,2021-06-01,122755.28,98137.77,
6,2021-07-01,105508.9,122755.28,
7,2021-08-01,126050.71,105508.9,
8,2021-09-01,129898.42,126050.71,
9,2021-10-01,145097.48,129898.42,


## 4. Rolling Statistics

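Moving averages to capture trends, plus a rolling standard deviation to capture short-term volatility. Each window ends on (and includes) the current month, and `min_periods=1` means the earliest rows are computed from fewer observations than the nominal window size.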

In [5]:
# Trailing moving averages of revenue. Each window ends on (and includes) the
# current row; min_periods=1 lets early rows average whatever history exists.
for window in (3, 6, 12):
    df[f'revenue_ma_{window}'] = (
        df['final_revenue'].rolling(window=window, min_periods=1).mean()
    )

# Volatility: standard deviation over the same kind of 3-row trailing window
# (NaN on the first row, where only one observation is available)
df['revenue_std_3'] = df['final_revenue'].rolling(window=3, min_periods=1).std()

# Patient volume smoothed over a 3-row trailing window
df['patients_ma_3'] = df['total_patients'].rolling(window=3, min_periods=1).mean()

# NOTE(review): these windows include the current row's final_revenue. If
# final_revenue is the forecast target, confirm downstream code lags these
# columns before using them as predictors.
print("Rolling statistics:")
rolling_cols = ['date', 'final_revenue', 'revenue_ma_3', 'revenue_ma_6', 'revenue_ma_12']
df[rolling_cols].tail(10)

Rolling statistics:


Unnamed: 0,date,final_revenue,revenue_ma_3,revenue_ma_6,revenue_ma_12
24,2023-01-01,140303.23,139742.25,133425.711667,116412.328333
25,2023-02-01,140263.68,145207.573333,136513.821667,119982.655
26,2023-03-01,132969.73,137845.546667,137907.138333,121409.215
27,2023-04-01,62729.86,111987.756667,125865.003333,120953.48
28,2023-05-01,124189.46,106629.683333,125918.628333,123805.745
29,2023-06-01,114684.28,100534.533333,119190.04,123857.155833
30,2023-07-01,151180.13,130017.956667,121002.856667,127214.284167
31,2023-08-01,134117.75,133327.386667,119978.535,128246.178333
32,2023-09-01,148917.33,144738.403333,122636.468333,130271.803333
33,2023-10-01,137676.67,140237.25,135127.603333,130496.303333


## 5. Business Metrics

In [6]:
def _safe_ratio(numerator, denominator):
    """Divide element-wise, giving NaN instead of +/-inf where the denominator is 0."""
    # Zero denominators are masked to NaN first, so the division propagates NaN
    return numerator / denominator.where(denominator != 0)


# Revenue per patient
# NOTE(review): in the sample rows final_revenue equals total_revenue + pcr_cash,
# while total_patients does not include pcr_patients - confirm that mixing the
# two is the intended definition of this metric.
df['revenue_per_patient'] = _safe_ratio(df['final_revenue'], df['total_patients'])

# Total insurance revenue across the three departments
df['total_insurance'] = df['gp_insurance'] + df['im_insurance'] + df['dental_insurance']

# Total cash revenue across the three departments (pcr_cash is not included)
df['total_cash'] = df['gp_cash'] + df['im_cash'] + df['dental_cash']

# Insurance ratio (fraction of total_revenue that comes from insurance)
df['insurance_ratio'] = _safe_ratio(df['total_insurance'], df['total_revenue'])

# Department revenue = insurance + cash for each department
departments = ('gp', 'im', 'dental')
for dept in departments:
    df[f'{dept}_revenue'] = df[f'{dept}_insurance'] + df[f'{dept}_cash']

# Department share of total_revenue; a separate loop keeps the original
# column order (all *_revenue columns first, then all *_share columns)
for dept in departments:
    df[f'{dept}_share'] = _safe_ratio(df[f'{dept}_revenue'], df['total_revenue'])

print("Business metrics:")
df[['date', 'revenue_per_patient', 'insurance_ratio', 'gp_share', 'im_share', 'dental_share']].head(10)

Business metrics:


Unnamed: 0,date,revenue_per_patient,insurance_ratio,gp_share,im_share,dental_share
0,2021-01-01,137.593769,0.828249,0.776379,0.0,0.223621
1,2021-02-01,177.36743,0.798289,0.559243,0.0,0.440757
2,2021-03-01,153.199196,0.729957,0.64531,0.0,0.35469
3,2021-04-01,145.238479,0.732637,0.72368,0.108432,0.167888
4,2021-05-01,150.057752,0.793209,0.66149,0.170044,0.168466
5,2021-06-01,131.711674,0.833235,0.600803,0.183432,0.215765
6,2021-07-01,116.071397,0.809384,0.623535,0.22052,0.155945
7,2021-08-01,122.26063,0.85756,0.514142,0.268923,0.216935
8,2021-09-01,123.477586,0.845033,0.508241,0.241357,0.250403
9,2021-10-01,132.026824,0.863324,0.475679,0.204906,0.319415


## 6. Year-over-Year Changes

In [7]:
# Relative change versus the row 12 positions earlier (same month last year).
# The first 12 rows are NaN because they have no 12-row-old value.
df['revenue_yoy_change'] = (
    df['final_revenue'].sub(df['revenue_lag_12']).div(df['revenue_lag_12'])
)
patients_year_ago = df['total_patients'].shift(12)
df['patients_yoy_change'] = (
    df['total_patients'].sub(patients_year_ago).div(patients_year_ago)
)

# Relative change versus the previous row (month-over-month)
df['revenue_mom_change'] = (
    df['final_revenue'].sub(df['revenue_lag_1']).div(df['revenue_lag_1'])
)

print("Year-over-year changes (starting from month 13):")
yoy_cols = ['date', 'final_revenue', 'revenue_lag_12', 'revenue_yoy_change']
df[yoy_cols].iloc[12:18]

Year-over-year changes (starting from month 13):


Unnamed: 0,date,final_revenue,revenue_lag_12,revenue_yoy_change
12,2022-01-01,99115.9,79941.98,0.239848
13,2022-02-01,97419.76,108371.5,-0.101057
14,2022-03-01,115851.01,78131.59,0.482768
15,2022-04-01,68198.68,71602.57,-0.047539
16,2022-05-01,89962.28,98137.77,-0.083306
17,2022-06-01,114067.35,122755.28,-0.070774


## 7. Feature Summary

In [8]:
# Overview of the engineered frame: column count, column names, NaN counts.
# The count covers every column, including `date` and the raw input columns.
print("=== Feature Summary ===")
print(f"Total features: {df.shape[1]}")
print(f"\nAll columns:")
for position, col in enumerate(df.columns, start=1):
    print(f"  {position}. {col}")

# Only columns that actually contain NaN are listed
print(f"\nMissing values (features with NaN):")
missing = df.isna().sum()
print(missing.loc[missing.gt(0)])

=== Feature Summary ===
Total features: 43

All columns:
  1. date
  2. year
  3. month
  4. gp_patients
  5. gp_insurance
  6. gp_cash
  7. im_patients
  8. im_insurance
  9. im_cash
  10. dental_patients
  11. dental_insurance
  12. dental_cash
  13. total_patients
  14. total_revenue
  15. pcr_patients
  16. pcr_cash
  17. final_revenue
  18. month_sin
  19. month_cos
  20. quarter
  21. revenue_lag_1
  22. revenue_lag_2
  23. revenue_lag_3
  24. revenue_lag_12
  25. patients_lag_1
  26. revenue_ma_3
  27. revenue_ma_6
  28. revenue_ma_12
  29. revenue_std_3
  30. patients_ma_3
  31. revenue_per_patient
  32. total_insurance
  33. total_cash
  34. insurance_ratio
  35. gp_revenue
  36. im_revenue
  37. dental_revenue
  38. gp_share
  39. im_share
  40. dental_share
  41. revenue_yoy_change
  42. patients_yoy_change
  43. revenue_mom_change

Missing values (features with NaN):
revenue_lag_1           1
revenue_lag_2           2
revenue_lag_3           3
revenue_lag_12         12
pati

In [9]:
# Inspect the feature-enriched dataframe: per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 43 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   date                 34 non-null     datetime64[ns]
 1   year                 34 non-null     int64         
 2   month                34 non-null     int64         
 3   gp_patients          34 non-null     int64         
 4   gp_insurance         34 non-null     float64       
 5   gp_cash              34 non-null     float64       
 6   im_patients          34 non-null     int64         
 7   im_insurance         34 non-null     float64       
 8   im_cash              34 non-null     int64         
 9   dental_patients      34 non-null     int64         
 10  dental_insurance     34 non-null     float64       
 11  dental_cash          34 non-null     int64         
 12  total_patients       34 non-null     int64         
 13  total_revenue        34 non-null     

In [10]:
# Show a hand-picked subset of columns for the last 15 rows
key_features = [
    'date', 'year', 'month',
    'month_sin', 'month_cos',
    'final_revenue',
    'revenue_lag_1', 'revenue_lag_12',
    'revenue_ma_3', 'revenue_ma_6',
    'revenue_per_patient', 'insurance_ratio',
]
df.loc[:, key_features].tail(15)

Unnamed: 0,date,year,month,month_sin,month_cos,final_revenue,revenue_lag_1,revenue_lag_12,revenue_ma_3,revenue_ma_6,revenue_per_patient,insurance_ratio
19,2022-08-01,2022,8,-0.8660254,-0.5,121735.02,110894.59,126050.71,115565.653333,103451.488333,92.083979,0.890571
20,2022-09-01,2022,9,-1.0,-1.83697e-16,124609.83,121735.02,129898.42,119079.813333,104911.291667,94.544636,0.873972
21,2022-10-01,2022,10,-0.8660254,0.5,134982.67,124609.83,145097.48,127109.173333,116041.956667,94.724681,0.900315
22,2022-11-01,2022,11,-0.5,0.8660254,123867.71,134982.67,157051.49,127820.07,121692.861667,92.43859,0.89826
23,2022-12-01,2022,12,-2.449294e-16,1.0,155055.81,123867.71,133400.72,137968.73,128524.271667,105.336827,0.909762
24,2023-01-01,2023,1,0.5,0.8660254,140303.23,155055.81,99115.9,139742.25,133425.711667,102.560841,0.906964
25,2023-02-01,2023,2,0.8660254,0.5,140263.68,140303.23,97419.76,145207.573333,136513.821667,109.154615,0.896944
26,2023-03-01,2023,3,1.0,6.123234000000001e-17,132969.73,140263.68,115851.01,137845.546667,137907.138333,106.546258,0.898413
27,2023-04-01,2023,4,0.8660254,-0.5,62729.86,132969.73,68198.68,111987.756667,125865.003333,68.557224,0.91854
28,2023-05-01,2023,5,0.5,-0.8660254,124189.46,62729.86,89962.28,106629.683333,125918.628333,96.270899,0.847974


## 8. Export Feature Data

In [11]:
# Write the feature-enriched frame to CSV without the positional index
output_path = Path('../data/processed/features_revenue_data.csv')
df.to_csv(path_or_buf=output_path, index=False)

# Confirm destination and dimensions of what was written
print(
    f"Saved feature data to: {output_path}",
    f"Shape: {df.shape}",
    sep="\n",
)

Saved feature data to: ../data/processed/features_revenue_data.csv
Shape: (34, 43)


In [12]:
# Verification: re-read the exported CSV and check it matches the in-memory
# frame, rather than only printing its size
df_verify = pd.read_csv(output_path)

assert df_verify.shape == df.shape, (
    f"Exported file has shape {df_verify.shape}, expected {df.shape}"
)
assert list(df_verify.columns) == list(df.columns), (
    "Exported columns differ from the in-memory dataframe"
)

print(f"Verified: {len(df_verify)} rows, {len(df_verify.columns)} columns")

Verified: 34 rows, 43 columns
