In [4]:
import pandas as pd
import numpy as np

# Load dataset
data = pd.read_csv("Estimated Mortality of Cancer Cases in India.csv")

# Convert date column to datetime format if applicable (modify column name as needed)
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])

# Sort data if applicable (modify column names as needed)
if 'region' in data.columns and 'date' in data.columns:
    data = data.sort_values(by=['region', 'date'])

# Handle missing values
data.fillna(method='ffill', inplace=True)

def feature_engineering(df):
    """ Perform feature engineering on Cancer Mortality dataset """
    
    # Rolling Averages (7-day moving average, modify column name as needed)
    if 'mortality' in df.columns:
        df['mortality_7day_avg'] = df.groupby('region')['mortality'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
    
    # Lag Features (Previous day's mortality, modify column name as needed)
    if 'mortality' in df.columns:
        df['mortality_lag_1'] = df.groupby('region')['mortality'].shift(1)
    
    # Growth Rate of Mortality
    if 'mortality' in df.columns:
        df['growth_rate_mortality'] = df.groupby('region')['mortality'].pct_change().fillna(0)
    
    # Cumulative Mortality per Region
    if 'mortality' in df.columns:
        df['cumulative_mortality'] = df.groupby('region')['mortality'].cumsum()
    
    return df

# Apply feature engineering
data = feature_engineering(data)

# Save processed data
data.to_csv("Estimated_Mortality_Featured.csv", index=False)

# Display sample output
print(data.head())


           State/UT     2014     2015     2016     2017     2018     2019   
0   Jammu & Kashmir   6130.0   6306.0   6464.0   6645.0   6824.0   6844.0  \
1            Ladakh   6130.0   6306.0   6464.0   6645.0   6824.0    159.0   
2  Himachal Pradesh   4239.0   4347.0   4434.0   4534.0   4642.0   4744.0   
3            Punjab  19393.0  19858.0  20332.0  20784.0  21278.0  21763.0   
4        Chandigarh    470.0    487.0    501.0    517.0    532.0    548.0   

      2020     2021     2022     2023  Unnamed: 11  Unnamed: 12  
0   7027.0   7211.0   7396.0   7575.0          NaN          NaN  
1    162.0    166.0    171.0    175.0          NaN          NaN  
2   4856.0   4953.0   5058.0   5180.0          NaN          NaN  
3  22276.0  22786.0  23301.0  23865.0          NaN          NaN  
4    564.0    582.0    598.0    612.0          NaN          NaN  


In [5]:
import pandas as pd
import numpy as np

# Load dataset
data = pd.read_csv("Estimated Mortality of Cancer Cases in India.csv")

# Convert date column to datetime format if applicable (modify column name as needed)
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])

# Sort data if applicable (modify column names as needed)
if 'region' in data.columns and 'date' in data.columns:
    data = data.sort_values(by=['region', 'date'])

# Handle missing values
data.fillna(method='ffill', inplace=True)

def feature_engineering(df):
    """ Perform feature engineering on Cancer Mortality dataset """
    
    # Rolling Averages (7-day moving average, modify column name as needed)
    if 'mortality' in df.columns:
        df['mortality_7day_avg'] = df.groupby('region')['mortality'].transform(lambda x: x.rolling(window=7, min_periods=1).mean())
    
    # Lag Features (Previous day's mortality, modify column name as needed)
    if 'mortality' in df.columns:
        df['mortality_lag_1'] = df.groupby('region')['mortality'].shift(1)
        df['mortality_lag_7'] = df.groupby('region')['mortality'].shift(7)
    
    # Growth Rate of Mortality
    if 'mortality' in df.columns:
        df['growth_rate_mortality'] = df.groupby('region')['mortality'].pct_change().fillna(0)
    
    # Cumulative Mortality per Region
    if 'mortality' in df.columns:
        df['cumulative_mortality'] = df.groupby('region')['mortality'].cumsum()
    
    # Remove highly correlated features (if correlation > 0.9)
    corr_matrix = df.select_dtypes(include=[np.number]).corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.9)]
    df.drop(columns=to_drop, inplace=True)
    
    return df

# Apply feature engineering
data = feature_engineering(data)

# Save processed data
data.to_csv("Estimated_Mortality_Featured.csv", index=False)

# Display sample output
print(data.head())


           State/UT     2014  Unnamed: 11  Unnamed: 12
0   Jammu & Kashmir   6130.0          NaN          NaN
1            Ladakh   6130.0          NaN          NaN
2  Himachal Pradesh   4239.0          NaN          NaN
3            Punjab  19393.0          NaN          NaN
4        Chandigarh    470.0          NaN          NaN
