In [1]:
import pandas as pd
import os

# Location of the source CSV (change this if the file lives elsewhere).
file_path = "Estimated Incidence of Cancer Cases in India.csv"

# Fail early with a path-specific message when the file is absent.
if not os.path.exists(file_path):
    raise FileNotFoundError(
        f"❌ File not found: {file_path}. Please check the file path!"
    )

# Read the CSV into `data`. Only the read itself is guarded, so the
# "Error loading file" message is raised for read/parse failures alone.
try:
    data = pd.read_csv(file_path)
except Exception as e:
    # Chain explicitly so the underlying pandas/OS error stays attached as the
    # cause of the RuntimeError in the traceback.
    raise RuntimeError(f"❌ Error loading file: {e}") from e
else:
    # Reported outside the try body: a failure while printing (for example a
    # console that cannot encode the emoji) must not be mislabelled as a
    # file-loading error.
    print("✅ Dataset Loaded Successfully!")

# Remove every column whose label contains "Unnamed" (presumably the
# placeholder names given to header-less CSV columns — confirm against the file).
data = data.drop(
    columns=[label for label in data.columns if "Unnamed" in label],
    errors='ignore',
)

# Reshape from wide (one column per year) to long (one row per region and year),
# then clean up in a single pipeline:
#   1. melt      - every column other than 'State/UT' becomes a (Year, Incidence) pair
#   2. rename    - 'State/UT' is exposed as 'region'
#   3. assign    - Year is cast to int; Incidence is parsed as a number, with
#                  unparseable values coerced to NaN
#   4. dropna    - rows left without an Incidence value are discarded
#   5. sort      - rows are ordered by region, then Year
data_long = (
    data.melt(id_vars=['State/UT'], var_name='Year', value_name='Incidence')
    .rename(columns={'State/UT': 'region'})
    .assign(
        Year=lambda frame: frame['Year'].astype(int),
        Incidence=lambda frame: pd.to_numeric(frame['Incidence'], errors='coerce'),
    )
    .dropna(subset=['Incidence'])
    .sort_values(by=['region', 'Year'])
)

# Feature engineering: every feature is computed separately within each region.
# NOTE: all features below are row-based. "Previous" means the previous row of
# the same region in the frame's current row order, so rows are expected to be
# ordered by Year within each region, and a missing year is not detected here.
incidence_by_region = data_long.groupby('region')['Incidence']

# Rolling average over the current row and up to two preceding rows of the
# region; min_periods=1 lets the first rows average over fewer values instead
# of producing NaN.
data_long['Rolling_Avg_Incidence'] = incidence_by_region.transform(
    lambda values: values.rolling(window=3, min_periods=1).mean()
)

# Lag feature: the Incidence of the preceding row in the same region
# (NaN for the first row of each region).
data_long['Lag_Incidence'] = incidence_by_region.shift(1)

# Growth rate: percentage change relative to the lagged value. A lag of exactly
# zero is masked to NaN first, so the result is NaN (undefined) rather than
# +/-inf from a division by zero.
nonzero_lag = data_long['Lag_Incidence'].where(data_long['Lag_Incidence'] != 0)
data_long['Growth_Rate'] = (data_long['Incidence'] - nonzero_lag) / nonzero_lag * 100

# Cumulative incidence: running total of Incidence within each region.
data_long['Cumulative_Incidence'] = incidence_by_region.cumsum()

# Persist the long-format table, including the engineered columns, as CSV.
# The row index is left out of the file.
output_path = "Transformed_Cancer_Incidence.csv"
data_long.to_csv(path_or_buf=output_path, index=False)
print(f"✅ Transformed dataset saved as: {output_path}")

# Show the first five rows as a quick sanity check.
print(data_long.head(n=5))


✅ Dataset Loaded Successfully!
✅ Transformed dataset saved as: Transformed_Cancer_Incidence.csv
                        region  Year  Incidence  Rolling_Avg_Incidence   
36   Andaman & Nicobar Islands  2014      310.0             310.000000  \
75   Andaman & Nicobar Islands  2015      319.0             314.500000   
114  Andaman & Nicobar Islands  2016      331.0             320.000000   
153  Andaman & Nicobar Islands  2017      340.0             330.000000   
192  Andaman & Nicobar Islands  2018      351.0             340.666667   

     Lag_Incidence  Growth_Rate  Cumulative_Incidence  
36             NaN          NaN                 310.0  
75           310.0     2.903226                 629.0  
114          319.0     3.761755                 960.0  
153          331.0     2.719033                1300.0  
192          340.0     3.235294                1651.0  
