In [13]:
import pandas as pd
import numpy as np
import os

# Location of the source CSV on disk.
# Raw string: every backslash is literal, so each path separator is written
# exactly once (a doubled backslash inside r"..." would stay doubled).
file_path = r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv"

# Report whether the file is present; this cell only prints and never raises.
if os.path.exists(file_path):
    print("‚úÖ File found!")
else:
    print("‚ùå File NOT found! Check the file path.")

‚úÖ File found!


In [26]:
# Read the CSV into `data`.
# The path is a raw string so that "\C" and "\E" are not parsed as (invalid)
# escape sequences; the resulting path text is unchanged.
# Any failure is re-raised as RuntimeError, explicitly chained to the
# underlying exception so its traceback stays attached.
try:
    data = pd.read_csv(r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv")
    print("üìÇ Dataset Loaded Successfully!")
except Exception as e:
    raise RuntimeError(f"‚ùå Error loading file: {e}") from e

    
   

üìÇ Dataset Loaded Successfully!


In [24]:
# Inspect the column labels of the loaded dataset.
column_labels = data.columns
print("üìå Dataset Columns:", column_labels)


üìå Dataset Columns: Index(['State/UT', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')


In [25]:
# Collect every column whose label contains "Unnamed" and remove them all at once.
# errors='ignore' keeps the call from raising if a listed label is absent.
unnamed_columns = [label for label in data.columns if "Unnamed" in label]
data = data.drop(columns=unnamed_columns, errors='ignore')


In [27]:
# Reshape from wide to long: 'State/UT' stays as the identifier, each remaining
# column label becomes a 'Year' entry and its cell value becomes 'Deaths'.
data_long = pd.melt(
    data,
    id_vars=['State/UT'],
    var_name='Year',
    value_name='Deaths',
)


In [28]:
# Use 'region' as the name of the state / union-territory column.
region_column_map = {'State/UT': 'region'}
data_long = data_long.rename(columns=region_column_map)


In [31]:
# Reduce 'Year' to a four-digit year and discard rows that do not contain one.
# - The pattern is a raw string, so "\d" is not read as an invalid string escape.
# - Requiring four digits means a label such as "Unnamed: 11" yields NaN (and is
#   dropped) instead of being turned into the bogus year 11.
# - expand=False makes str.extract return a Series rather than a one-column DataFrame.
data_long['Year'] = data_long['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
data_long.dropna(subset=['Year'], inplace=True)  # rows without a year are removed
data_long['Year'] = data_long['Year'].astype(int)  # digit strings -> integers


In [32]:
# Coerce 'Deaths' to a numeric dtype; entries that cannot be parsed become NaN.
deaths_as_numbers = pd.to_numeric(data_long['Deaths'], errors='coerce')
data_long['Deaths'] = deaths_as_numbers


In [None]:
# Remove, in place, every row whose 'Deaths' value is missing (NaN).
columns_required = ['Deaths']
data_long.dropna(subset=columns_required, inplace=True)


In [None]:
# Order rows by region first, then chronologically within each region.
sort_keys = ['region', 'Year']
data_long = data_long.sort_values(by=sort_keys)


In [35]:
# Feature engineering: moving average of deaths within each region.
def _trailing_mean(deaths_in_region):
    """Mean over the current row and up to two preceding rows of one region's series."""
    return deaths_in_region.rolling(window=3, min_periods=1).mean()


# transform() returns a result aligned to the original rows, so it can be assigned directly.
deaths_by_region = data_long.groupby('region')['Deaths']
data_long['Rolling_Avg_Deaths'] = deaths_by_region.transform(_trailing_mean)


In [36]:
# Lag feature: the 'Deaths' value of the preceding row within the same region
# (NaN for the first row of each region).
data_long['Lag_Deaths'] = data_long.groupby('region')['Deaths'].shift(1)

# Growth rate: percentage change relative to the lagged value.
# A lag of 0 is replaced by NaN first, so the result is NaN rather than +/-inf.
data_long['Growth_Rate'] = (data_long['Deaths'] - data_long['Lag_Deaths']) / data_long['Lag_Deaths'].replace(0, np.nan) * 100

# Running total of deaths within each region, in current row order.
data_long['Cumulative_Deaths'] = data_long.groupby('region')['Deaths'].cumsum()

# Write the transformed table to a CSV in the current working directory,
# without the DataFrame index.
output_path = "Transformed_Cancer_Mortality.csv"
data_long.to_csv(output_path, index=False)
print(f"‚úÖ Transformed dataset saved as: {output_path}")

# Show the first rows as a quick sanity check.
print(data_long.head())


‚úÖ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              904.0  


In [37]:
import pandas as pd
import numpy as np
import os

# End-to-end pipeline: locate the CSV, load it, reshape it to long format,
# clean 'Year' / 'Deaths', derive per-region features and save the result.

# Location of the source CSV (raw string, so backslashes are literal).
file_path = r"C:\Cancer\Estimated Mortality of Cancer Cases in India.csv"

# Fail fast with FileNotFoundError when the file is missing.
if os.path.exists(file_path):
    print("‚úÖ File found!")
else:
    raise FileNotFoundError(f"‚ùå File NOT found! Check the file path: {file_path}")

# Load the dataset. Any failure is re-raised as RuntimeError, explicitly
# chained to the underlying exception so its traceback stays attached.
try:
    data = pd.read_csv(file_path)
    print("üìÇ Dataset Loaded Successfully!")
except Exception as e:
    raise RuntimeError(f"‚ùå Error loading file: {e}") from e

# Show the column labels as read from the file.
print("üìå Dataset Columns:", data.columns)

# Drop every column whose label contains "Unnamed".
data = data.drop(columns=[col for col in data.columns if "Unnamed" in col], errors='ignore')

# Wide -> long: one row per (State/UT, year column) pair.
data_long = data.melt(id_vars=['State/UT'], var_name='Year', value_name='Deaths')

# Use 'region' as the name of the state / union-territory column.
data_long.rename(columns={'State/UT': 'region'}, inplace=True)

# Reduce 'Year' to a four-digit year and discard rows that do not contain one.
# The pattern is a raw string (so "\d" is not an invalid string escape), and
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
data_long['Year'] = data_long['Year'].astype(str).str.extract(r'(\d{4})', expand=False)
data_long.dropna(subset=['Year'], inplace=True)  # rows without a year are removed
data_long['Year'] = data_long['Year'].astype(int)  # digit strings -> integers

# Coerce 'Deaths' to numeric; unparseable entries become NaN ...
data_long['Deaths'] = pd.to_numeric(data_long['Deaths'], errors='coerce')

# ... and rows with a missing 'Deaths' value are removed.
data_long.dropna(subset=['Deaths'], inplace=True)

# Order rows by region, then by year; the row-based features below rely on this order.
data_long.sort_values(by=['region', 'Year'], inplace=True)

# Moving average over the current row and up to two preceding rows of each region.
data_long['Rolling_Avg_Deaths'] = data_long.groupby('region')['Deaths'].transform(lambda x: x.rolling(window=3, min_periods=1).mean())

# Lag feature: 'Deaths' of the preceding row within the same region (NaN for the first row).
data_long['Lag_Deaths'] = data_long.groupby('region')['Deaths'].shift(1)

# Growth rate in percent; a lag of 0 is replaced by NaN to avoid division by zero.
data_long['Growth_Rate'] = (data_long['Deaths'] - data_long['Lag_Deaths']) / data_long['Lag_Deaths'].replace(0, np.nan) * 100

# Running total of deaths within each region.
data_long['Cumulative_Deaths'] = data_long.groupby('region')['Deaths'].cumsum()

# Write the transformed table to a CSV in the current working directory, without the index.
output_path = "Transformed_Cancer_Mortality.csv"
data_long.to_csv(output_path, index=False)
print(f"‚úÖ Transformed dataset saved as: {output_path}")

# Show the first rows as a quick sanity check.
print(data_long.head())


‚úÖ File found!
üìÇ Dataset Loaded Successfully!
üìå Dataset Columns: Index(['State/UT', '2014', '2015', '2016', '2017', '2018', '2019', '2020',
       '2021', '2022', '2023', 'Unnamed: 11', 'Unnamed: 12'],
      dtype='object')
‚úÖ Transformed dataset saved as: Transformed_Cancer_Mortality.csv
                        region  Year  Deaths  Rolling_Avg_Deaths  Lag_Deaths   
36   Andaman & Nicobar Islands  2014   170.0          170.000000         NaN  \
75   Andaman & Nicobar Islands  2015   175.0          172.500000       170.0   
114  Andaman & Nicobar Islands  2016   180.0          175.000000       175.0   
153  Andaman & Nicobar Islands  2017   186.0          180.333333       180.0   
192  Andaman & Nicobar Islands  2018   193.0          186.333333       186.0   

     Growth_Rate  Cumulative_Deaths  
36           NaN              170.0  
75      2.941176              345.0  
114     2.857143              525.0  
153     3.333333              711.0  
192     3.763441              9