In [26]:

import pandas as pd
import numpy as np
import os
    

In [27]:

# Define the correct file path
file_path = r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv"
# Check if the file exists
if not os.path.exists(file_path):
    raise FileNotFoundError(f"❌ File not found: {file_path}. Please check the file path!")

In [28]:

# Load dataset with error handling
try:
    data = pd.read_csv("C:\Cancer\Estimated Mortality of Cancer Cases in India.csv")
    print("📂 Dataset Loaded Successfully!")
except Exception as e:
    raise RuntimeError(f"❌ Error loading file: {e}")
# ✅ Columns Check Karo
print("📌 Dataset Columns:", data.columns)
    

📂 Dataset Loaded Successfully!
📌 Dataset Columns: Index(['State/UT', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')


In [41]:
# Drop unnamed empty columns if they exist
data = data.drop(columns=[col for col in data.columns if "Unnamed" in col], errors='ignore')

# Convert wide format (year columns) to long format
data_long = data.melt(id_vars=['State/UT'], var_name='Year', value_name='Deaths')

# Rename columns to match required format
data_long.rename(columns={'State/UT': 'region'}, inplace=True)

# Convert Year and Deaths to proper data types
data_long['Year'] = data_long['Year'].astype(str).str.strip()  # Ensure Year is string
data_long['Deaths'] = pd.to_numeric(data_long['Deaths'], errors='coerce')  # Convert Deaths to numbers

# Drop rows where Deaths is NaN (if any)
data_long.dropna(subset=['Deaths'], inplace=True)

# Display transformed data
data_long.head()


Unnamed: 0,region,Year,Deaths
0,Jammu & Kashmir,2014,6130.0
2,Himachal Pradesh,2014,4239.0
3,Punjab,2014,19393.0
4,Chandigarh,2014,470.0
5,Uttarakhand,2014,5460.0


In [47]:
# ✅ Ensure necessary columns exist
required_columns = {'region', 'Year', 'Deaths'}
missing_columns = required_columns - set(data_long.columns)
if missing_columns:
    raise ValueError(f"❌ Missing required columns: {missing_columns}")


In [48]:
# ✅ Feature Engineering
# ✅ Sort data for time-based operations
data_long = data_long.sort_values(by=['region', 'Year'])


In [50]:
# ✅ Rolling Average (7-year moving average)
data_long['Rolling_Avg_Deaths'] = data_long.groupby('region')['Deaths'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())


In [51]:
# ✅ Lag Features (Previous year's deaths)
data_long['Lag_Deaths'] = data_long.groupby('region')['Deaths'].shift(1)



In [52]:
# ✅ Growth Rate Calculation
data_long['Growth_Rate'] = ((data_long['Deaths'] - data_long['Lag_Deaths']) / data_long['Lag_Deaths']) * 100

# ✅ Cumulative Deaths per Region
data_long['Cumulative_Deaths'] = data_long.groupby('region')['Deaths'].cumsum()

# ✅ Save transformed dataset
output_path = "Transformed_Cancer_Mortality.csv"
data_long.to_csv(output_path, index=False)
print(f"✅ Transformed dataset saved as: {output_path}")

# ✅ Display sample output
print(data_long.head())


✅ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              904.0  


In [37]:

# Save Processed Data
output_path = r"C:\Cancer\Estimated_Mortality_Featured.csv"
data.to_csv(output_path, index=False)

print(f"✅ Feature Engineering Complete! Processed file saved at: {output_path}")
print("📊 Sample Processed Data:")
print(data.head())
    

✅ Feature Engineering Complete! Processed file saved at: C:\Cancer\Estimated_Mortality_Featured.csv
📊 Sample Processed Data:
           State/UT     2014     2015     2016     2017     2018     2019   
0   Jammu & Kashmir   6130.0   6306.0   6464.0   6645.0   6824.0   6844.0  \
1            Ladakh      NaN      NaN      NaN      NaN      NaN    159.0   
2  Himachal Pradesh   4239.0   4347.0   4434.0   4534.0   4642.0   4744.0   
3            Punjab  19393.0  19858.0  20332.0  20784.0  21278.0  21763.0   
4        Chandigarh    470.0    487.0    501.0    517.0    532.0    548.0   

      2020     2021     2022     2023  
0   7027.0   7211.0   7396.0   7575.0  
1    162.0    166.0    171.0    175.0  
2   4856.0   4953.0   5058.0   5180.0  
3  22276.0  22786.0  23301.0  23865.0  
4    564.0    582.0    598.0    612.0  


In [39]:
data = data.drop(columns=[col for col in data.columns if "Unnamed" in col], errors='ignore')

# ✅ Convert wide format (year columns) to long format
data_long = data.melt(id_vars=['State/UT'], var_name='Year', value_name='Deaths')

# ✅ Rename columns to match required format
data_long.rename(columns={'State/UT': 'region'}, inplace=True)

# ✅ Convert Year and Deaths to proper data types
data_long['Year'] = data_long['Year'].astype(str).str.strip()  # Ensure Year is string
data_long['Deaths'] = pd.to_numeric(data_long['Deaths'], errors='coerce')  # Convert Deaths to numbers

# ✅ Drop rows where Deaths is NaN (if any)
data_long.dropna(subset=['Deaths'], inplace=True)

# ✅ Feature Engineering
# ✅ Sort data for time-based operations
data_long = data_long.sort_values(by=['region', 'Year'])

# ✅ Rolling Average (7-year moving average)
data_long['Rolling_Avg_Deaths'] = data_long.groupby('region')['Deaths'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# ✅ Lag Features (Previous year's deaths)
data_long['Lag_Deaths'] = data_long.groupby('region')['Deaths'].shift(1)

# ✅ Growth Rate Calculation
data_long['Growth_Rate'] = ((data_long['Deaths'] - data_long['Lag_Deaths']) / data_long['Lag_Deaths']) * 100

# ✅ Cumulative Deaths per Region
data_long['Cumulative_Deaths'] = data_long.groupby('region')['Deaths'].cumsum()

# ✅ Save transformed dataset
output_path = "Transformed_Cancer_Mortality.csv"
data_long.to_csv(output_path, index=False)
print(f"✅ Transformed dataset saved as: {output_path}")

# ✅ Display sample output
print(data_long.head())


✅ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              904.0  
