In [13]:
import pandas as pd
import numpy as np
import os

# Location of the raw CSV. The raw-string prefix already keeps backslashes
# literal, so every separator is written exactly once (a doubled backslash
# inside r"..." would put two backslashes into the path).
file_path = r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv"

# Report whether the file is present before any later cell tries to read it.
if os.path.exists(file_path):
    print("✅ File found!")
else:
    print("❌ File NOT found! Check the file path.")

✅ File found!


In [26]:
# Load the CSV from the path defined in the path-check cell instead of
# repeating it as a non-raw literal (whose "\C" and "\E" are invalid escape
# sequences in a normal Python string).
try:
    data = pd.read_csv(file_path)
    print("📂 Dataset Loaded Successfully!")
except Exception as e:
    # Chain the original exception so the real cause stays in the traceback.
    raise RuntimeError(f"❌ Error loading file: {e}") from e

    
   

📂 Dataset Loaded Successfully!


In [24]:
# ✅ Check the columns: print the column labels of the loaded DataFrame
print("📌 Dataset Columns:", data.columns)


📌 Dataset Columns: Index(['State/UT', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')


In [25]:
# ✅ Remove every column whose label contains "Unnamed"
data = data.drop(
    columns=[label for label in data.columns if "Unnamed" in label],
    errors='ignore',
)


In [27]:
# ✅ Reshape wide -> long: each non-'State/UT' column label becomes a 'Year'
#    value and its cell value becomes 'Deaths'
data_long = data.melt(
    id_vars=['State/UT'],
    var_name='Year',
    value_name='Deaths',
)


In [28]:
# ✅ Relabel the 'State/UT' column as 'region' (modifies data_long in place)
data_long.rename({'State/UT': 'region'}, axis='columns', inplace=True)


In [31]:
# ✅ Ensure 'Year' contains only valid years
# The pattern is a raw string (so "\d" is not an invalid escape) and is
# anchored to a whole four-digit label: a leftover column such as
# "Unnamed: 11" becomes NaN here instead of being read as year 11.
data_long['Year'] = data_long['Year'].astype(str).str.extract(r'^\s*(\d{4})\s*$', expand=False)
data_long.dropna(subset=['Year'], inplace=True)  # Drop rows whose label was not a year
data_long['Year'] = data_long['Year'].astype(int)  # Convert Year to integer


In [32]:
# ✅ Parse 'Deaths' as numbers; values that cannot be parsed become NaN
data_long['Deaths'] = pd.to_numeric(
    data_long['Deaths'],
    errors='coerce',
)


In [None]:
# ✅ Remove, in place, every row whose 'Deaths' value is missing
data_long.dropna(axis='index', subset=['Deaths'], inplace=True)


In [None]:
# ✅ Order rows by region first, then by Year (ascending, in place)
data_long.sort_values(['region', 'Year'], ascending=True, inplace=True)


In [35]:
# ✅ Feature engineering
# 🔹 Per-region moving average of 'Deaths' over a 3-row window; with
#    min_periods=1 the first rows of a region average whatever is available
data_long['Rolling_Avg_Deaths'] = (
    data_long
    .groupby('region')['Deaths']
    .transform(lambda deaths: deaths.rolling(window=3, min_periods=1).mean())
)


In [36]:
# 🔹 Lag Feature (Previous Year's Deaths)
# shift(1) takes the previous row within each region, so the first row of
# every region has no lag (NaN).
data_long['Lag_Deaths'] = data_long.groupby('region')['Deaths'].shift(1)

# 🔹 Growth Rate Calculation (Year-over-Year % change)
# A lag of 0 is replaced by NaN so the ratio is NaN rather than +/-inf.
data_long['Growth_Rate'] = (data_long['Deaths'] - data_long['Lag_Deaths']) / data_long['Lag_Deaths'].replace(0, np.nan) * 100

# 🔹 Cumulative Deaths per Region (running total in current row order)
data_long['Cumulative_Deaths'] = data_long.groupby('region')['Deaths'].cumsum()

# ✅ Save transformed dataset (relative path: written to the working directory)
output_path = "Transformed_Cancer_Mortality.csv"
data_long.to_csv(output_path, index=False)
print(f"✅ Transformed dataset saved as: {output_path}")

# ✅ Display sample output
print(data_long.head())


✅ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              904.0  


In [37]:
import pandas as pd
import numpy as np
import os

# Column labels that count as a year: exactly four digits, optional padding.
# Raw string so "\d" is a regex token rather than an invalid string escape.
YEAR_LABEL_PATTERN = r'^\s*(\d{4})\s*$'


def build_mortality_features(wide, window=3):
    """Reshape the wide State/UT-by-year table to long form and add features.

    Parameters
    ----------
    wide : pandas.DataFrame
        Must contain a 'State/UT' column; every other column is melted, and
        only columns whose label is a four-digit year are kept.
    window : int, default 3
        Number of rows used for the per-region rolling mean.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) sorted by region and Year,
        with columns: region, Year, Deaths, Rolling_Avg_Deaths, Lag_Deaths,
        Growth_Rate, Cumulative_Deaths.

    Raises
    ------
    KeyError
        If 'State/UT' is missing from ``wide``.
    """
    long = (
        wide.melt(id_vars=['State/UT'], var_name='Year', value_name='Deaths')
        .rename(columns={'State/UT': 'region'})
        # Non-year labels (e.g. "Unnamed: 11") become NaN and are dropped next,
        # instead of having stray digits read as a year.
        .assign(Year=lambda frame: frame['Year'].astype(str).str.extract(YEAR_LABEL_PATTERN, expand=False))
        .dropna(subset=['Year'])
        .assign(
            Year=lambda frame: frame['Year'].astype(int),
            # Unparseable death counts become NaN and are dropped next.
            Deaths=lambda frame: pd.to_numeric(frame['Deaths'], errors='coerce'),
        )
        .dropna(subset=['Deaths'])
        # The lag / rolling / cumulative features below are positional within
        # each region, so rows must be in Year order first.
        .sort_values(by=['region', 'Year'])
    )

    deaths_by_region = long.groupby('region')['Deaths']
    lag = deaths_by_region.shift(1)  # previous row in the same region

    return long.assign(
        Rolling_Avg_Deaths=deaths_by_region.transform(
            lambda deaths: deaths.rolling(window=window, min_periods=1).mean()
        ),
        Lag_Deaths=lag,
        # A zero lag is mapped to NaN so the ratio is NaN, not +/-inf.
        Growth_Rate=(long['Deaths'] - lag) / lag.replace(0, np.nan) * 100,
        Cumulative_Deaths=deaths_by_region.cumsum(),
    )


# In a notebook kernel __name__ is "__main__", so this cell still runs the
# whole pipeline; the guard only keeps the file I/O from running when the
# helper above is imported from elsewhere.
if __name__ == "__main__":
    # ✅ Define the correct file path
    file_path = r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv"

    # ✅ Check if file exists
    if os.path.exists(file_path):
        print("✅ File found!")
    else:
        raise FileNotFoundError(f"❌ File NOT found! Check the file path: {file_path}")

    # ✅ Load dataset with error handling
    try:
        data = pd.read_csv(file_path)
        print("📂 Dataset Loaded Successfully!")
    except Exception as e:
        # Chain the original exception so the real cause stays in the traceback.
        raise RuntimeError(f"❌ Error loading file: {e}") from e

    # ✅ Columns Check
    print("📌 Dataset Columns:", data.columns)

    # ✅ Drop unnamed empty columns
    data = data.drop(columns=[col for col in data.columns if "Unnamed" in col], errors='ignore')

    # ✅ Reshape to long format and apply feature engineering
    data_long = build_mortality_features(data)

    # ✅ Save transformed dataset (relative path: written to the working directory)
    output_path = "Transformed_Cancer_Mortality.csv"
    data_long.to_csv(output_path, index=False)
    print(f"✅ Transformed dataset saved as: {output_path}")

    # ✅ Display sample output
    print(data_long.head())


✅ File found!
📂 Dataset Loaded Successfully!
📌 Dataset Columns: Index(['State/UT', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')
✅ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              904.0  
