In [27]:
# 📦 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ✅ Step 2: Load Cleaned Dataset
clean_data_df = pd.read_csv("clean_data_after_eda.csv")

# ✅ Step 3: Initial Check – Shape & Info
print("Data Shape:", clean_data_df.shape)
print("\n--- Data Info ---")
print(clean_data_df.info())

# ✅ Step 4: Remove Unnecessary Columns
# Columns with only 1 unique value or irrelevant to prediction
cols_to_drop = ['id', 'date_end']  # Change this list based on actual dataset
clean_data_df.drop(columns=cols_to_drop, inplace=True, errors='ignore')  # avoid error if column not found

# ✅ Step 5: Handle Dates – Convert and Extract Features
# Convert date to datetime format
clean_data_df['date_modif_prod'] = pd.to_datetime(clean_data_df['date_modif_prod'], errors='coerce')

# Extract useful date parts (Feature Expansion)
clean_data_df['modif_month'] = clean_data_df['date_modif_prod'].dt.month
clean_data_df['modif_year'] = clean_data_df['date_modif_prod'].dt.year

# ✅ Step 6: Feature – Price Sensitivity (Off-Peak December vs January)
# Check actual column names first!
print("\n--- Column Names ---")
print(clean_data_df.columns)

# Use correct column names from your dataset
# If your dataset has: 'price_off_peak_var.Dec' and 'price_off_peak_var.Jan'
clean_data_df['price_diff_offpeak'] = clean_data_df['price_off_peak_var.Dec'] - clean_data_df['price_off_peak_var.Jan']

# ✅ Step 7: Feature – Average Peak Price (Combining Dec & Jan)
clean_data_df['avg_price_peak'] = (clean_data_df['price_peak_var.Dec'] + clean_data_df['price_peak_var.Jan']) / 2

# ✅ Step 8: Optional – Combine Usage & Margin to Create Profit Feature
clean_data_df['est_profit'] = clean_data_df['cons_12m'] * clean_data_df['net_margin']

# ✅ Step 9: Final Check – Head & Summary
print("\n--- Final Data Preview ---")
print(clean_data_df.head())

print("\n--- Final Columns ---")
print(clean_data_df.columns)

# ✅ Step 10: Save Updated Dataset
clean_data_df.to_csv("feature_engineered_data.csv", index=False)
print("\n✅ Feature engineered data saved as 'feature_engineered_data.csv'")


Data Shape: (14606, 44)

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14606 entries, 0 to 14605
Data columns (total 44 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              14606 non-null  object 
 1   channel_sales                   14606 non-null  object 
 2   cons_12m                        14606 non-null  int64  
 3   cons_gas_12m                    14606 non-null  int64  
 4   cons_last_month                 14606 non-null  int64  
 5   date_activ                      14606 non-null  object 
 6   date_end                        14606 non-null  object 
 7   date_modif_prod                 14606 non-null  object 
 8   date_renewal                    14606 non-null  object 
 9   forecast_cons_12m               14606 non-null  float64
 10  forecast_cons_year              14606 non-null  int64  
 11  forecast_discount_energy        14606 non-null  fl

KeyError: 'price_off_peak_var.Dec'