# Data Understanding 

In [43]:
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px



In [44]:
df = pd.read_csv("global_cancer_patients_2015_2024.csv")

In [45]:
# Checking Data Types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Patient_ID             50000 non-null  object 
 1   Age                    50000 non-null  int64  
 2   Gender                 49900 non-null  object 
 3   Country_Region         49001 non-null  object 
 4   Year                   50000 non-null  int64  
 5   Genetic_Risk           50000 non-null  float64
 6   Air_Pollution          50000 non-null  float64
 7   Alcohol_Use            50000 non-null  float64
 8   Smoking                50000 non-null  float64
 9   Obesity_Level          49001 non-null  float64
 10  Cancer_Type            49900 non-null  object 
 11  Cancer_Stage           50000 non-null  object 
 12  Treatment_Cost_USD     49001 non-null  float64
 13  Survival_Years         50000 non-null  float64
 14  Target_Severity_Score  50000 non-null  float64
dtypes:

In [46]:
# Checking  summary statistics for numerical
df.describe().round(2)

Unnamed: 0,Age,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,49001.0,49001.0,50000.0,50000.0
mean,54.42,2019.48,5.0,5.01,5.01,4.99,4.99,52484.68,5.01,4.95
std,20.22,2.87,2.89,2.89,2.89,2.88,2.89,27347.46,2.88,1.2
min,20.0,2015.0,0.0,0.0,0.0,0.0,0.0,5000.05,0.0,0.9
25%,37.0,2017.0,2.5,2.5,2.5,2.5,2.5,28702.23,2.5,4.12
50%,54.0,2019.0,5.0,5.0,5.0,5.0,5.0,52532.93,5.0,4.95
75%,72.0,2022.0,7.5,7.5,7.5,7.5,7.5,76207.16,7.5,5.78
max,89.0,2024.0,10.0,10.0,10.0,10.0,10.0,99999.84,10.0,9.16


In [47]:
# Checking  summary statistics for Categorical

df.describe(include= 'object')

Unnamed: 0,Patient_ID,Gender,Country_Region,Cancer_Type,Cancer_Stage
count,50000,49900,49001,49900,50000
unique,50000,3,10,8,5
top,PT0000000,Male,Australia,Colon,Stage II
freq,1,16763,4987,6360,10124


In [48]:
#checking duplicates
df.duplicated().sum()

np.int64(0)

In [49]:
#checking NAN (Missing Values)
df.isna().sum()

Patient_ID                 0
Age                        0
Gender                   100
Country_Region           999
Year                       0
Genetic_Risk               0
Air_Pollution              0
Alcohol_Use                0
Smoking                    0
Obesity_Level            999
Cancer_Type              100
Cancer_Stage               0
Treatment_Cost_USD       999
Survival_Years             0
Target_Severity_Score      0
dtype: int64

In [50]:
#checking NAN (Missing Values) Perectange to take the right descision (dropping or imputting ) 

df.isna().mean().round(4) * 100

Patient_ID               0.0
Age                      0.0
Gender                   0.2
Country_Region           2.0
Year                     0.0
Genetic_Risk             0.0
Air_Pollution            0.0
Alcohol_Use              0.0
Smoking                  0.0
Obesity_Level            2.0
Cancer_Type              0.2
Cancer_Stage             0.0
Treatment_Cost_USD       2.0
Survival_Years           0.0
Target_Severity_Score    0.0
dtype: float64

# Data Cleaning 

In [51]:
cat_cols = df.select_dtypes(include= 'object').drop('Patient_ID', axis= 1).columns
cat_cols

Index(['Gender', 'Country_Region', 'Cancer_Type', 'Cancer_Stage'], dtype='object')

In [52]:
for col in cat_cols:

    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    print('-' * 100)

Gender
3
['Male' 'Female' 'Other' nan]
----------------------------------------------------------------------------------------------------
Country_Region
10
['UK' 'China' 'Pakistan' 'Brazil' 'Germany' 'Canada' 'USA' 'India'
 'Australia' 'Russia' nan]
----------------------------------------------------------------------------------------------------
Cancer_Type
8
['Lung' 'Leukemia' 'Breast' 'Colon' 'Skin' 'Cervical' 'Prostate' 'Liver'
 nan]
----------------------------------------------------------------------------------------------------
Cancer_Stage
5
['Stage III' 'Stage 0' 'Stage II' 'Stage I' 'Stage IV']
----------------------------------------------------------------------------------------------------


### In depth check for numerical columns

In [53]:
num_cols = df.select_dtypes(include= 'number').columns
num_cols

Index(['Age', 'Year', 'Genetic_Risk', 'Air_Pollution', 'Alcohol_Use',
       'Smoking', 'Obesity_Level', 'Treatment_Cost_USD', 'Survival_Years',
       'Target_Severity_Score'],
      dtype='object')

In [54]:
for col in num_cols:

    px.histogram(data_frame= df, x= col, title= col).show()

# Feature Engineering

In [55]:
df.head()

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62


In [56]:
df['Target_Severity_Score'].info

<bound method Series.info of 0        4.92
1        4.65
2        5.84
3        3.12
4        3.62
         ... 
49995    3.63
49996    6.03
49997    6.02
49998    6.54
49999    3.76
Name: Target_Severity_Score, Length: 50000, dtype: float64>

In [57]:
def target_acheived(x):
    if x > 5: 
        return 'Achieved'
    else:
        return 'Not Achieved'

df['Target_Severity_Score_Status'] = df['Target_Severity_Score'].apply(target_acheived)


print(df[['Target_Severity_Score', 'Target_Severity_Score_Status']])


       Target_Severity_Score Target_Severity_Score_Status
0                       4.92                 Not Achieved
1                       4.65                 Not Achieved
2                       5.84                     Achieved
3                       3.12                 Not Achieved
4                       3.62                 Not Achieved
...                      ...                          ...
49995                   3.63                 Not Achieved
49996                   6.03                     Achieved
49997                   6.02                     Achieved
49998                   6.54                     Achieved
49999                   3.76                 Not Achieved

[50000 rows x 2 columns]


In [58]:
bins = [0, 30, 50, 70, float('inf')]
labels = ['<30', '30-50', '50-70', '70+']

df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
df['Age_Group']

0          70+
1        30-50
2          70+
3        30-50
4        30-50
         ...  
49995      70+
49996    30-50
49997      70+
49998      <30
49999      <30
Name: Age_Group, Length: 50000, dtype: category
Categories (4, object): ['<30' < '30-50' < '50-70' < '70+']

In [59]:
def years_grouping(year):
    if 2015 <= year <= 2018:
        return '2015-2018'
    elif 2019 <= year <= 2021:
        return '2019-2021'
    elif 2022 <= year <= 2024:
        return '2022-2024'
    else:
        return 'Unknown'

df['Era'] = df['Year'].apply(years_grouping)
df['Era'] 


0        2019-2021
1        2019-2021
2        2022-2024
3        2015-2018
4        2015-2018
           ...    
49995    2022-2024
49996    2015-2018
49997    2015-2018
49998    2015-2018
49999    2022-2024
Name: Era, Length: 50000, dtype: object

In [60]:
df

Unnamed: 0,Patient_ID,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score,Target_Severity_Score_Status,Age_Group,Era
0,PT0000000,71,Male,UK,2021,6.4,2.8,9.5,0.9,8.7,Lung,Stage III,62913.44,5.9,4.92,Not Achieved,70+,2019-2021
1,PT0000001,34,Male,China,2021,1.3,4.5,3.7,3.9,6.3,Leukemia,Stage 0,12573.41,4.7,4.65,Not Achieved,30-50,2019-2021
2,PT0000002,80,Male,Pakistan,2023,7.4,7.9,2.4,4.7,0.1,Breast,Stage II,6984.33,7.1,5.84,Achieved,70+,2022-2024
3,PT0000003,40,Male,UK,2015,1.7,2.9,4.8,3.5,2.7,Colon,Stage I,67446.25,1.6,3.12,Not Achieved,30-50,2015-2018
4,PT0000004,43,Female,Brazil,2017,5.1,2.8,2.3,6.7,0.5,Skin,Stage III,77977.12,2.9,3.62,Not Achieved,30-50,2015-2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,PT0049995,80,Male,Pakistan,2023,2.3,7.5,2.8,3.8,2.9,Leukemia,Stage 0,71463.70,1.0,3.63,Not Achieved,70+,2022-2024
49996,PT0049996,40,Female,USA,2018,6.4,3.5,2.9,9.0,9.8,Colon,Stage I,49619.66,0.4,6.03,Achieved,30-50,2015-2018
49997,PT0049997,74,Male,Pakistan,2015,6.2,1.6,8.7,4.7,4.0,Breast,Stage 0,5338.25,2.7,6.02,Achieved,70+,2015-2018
49998,PT0049998,21,Male,Brazil,2018,4.0,6.5,7.6,8.6,8.1,Liver,Stage IV,45100.47,7.5,6.54,Achieved,<30,2015-2018


# Data Analysis

In [62]:
%%writefile Cancer_data.py
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px

st.set_page_config(
    layout='wide',
    page_title="Global Cancer Patients 2015–2024",
    page_icon=":cancer:"
)


st.markdown(
    """<h1 style='color: white; text-align: center;'> Global Cancer Patients 2015–2024 Analysis </h1>""",
    unsafe_allow_html=True
)

tab_1 , tab_2 , tab_3 , tab_4 , tab_5 = st.tabs(["Purpose and Introduction","Used Data Frame",
        "Univariate Analysis",
        "Bivariate analysis",
        "Multivariate Analysis"])

with tab_1 : 
    col1, col2 = st.columns([1, 2]) 
    with col2:
        st.image("images.jpeg", width=550)
        
  

    st.markdown("""<h1 style='color: white; text-align: center;'> Purpose </h1>""",unsafe_allow_html=True)
    st.markdown("""
<div style='text-align: center'>
    <p style="font-size:18px;">
    The central purpose of analyzing the Global Cancer Patient Dataset is to extract meaningful insights about the patterns and determinants of cancer over a ten-year period (2015–2024). By simulating global cancer-related data, this analysis empowers a wide range of stakeholders—including medical researchers, clinicians, policymakers, and AI developers—to understand how various factors interact in the onset, treatment, and survival outcomes of cancer.
    </p>
    <p style="font-size:18px;">
    Through this simulated dataset, the project aims to:
    <ul style="text-align: left; display: inline-block;">
        <li>Identify trends in cancer incidence and mortality across different regions.</li>
        <li>Evaluate the effectiveness and reach of various treatment protocols.</li>
        <li>Investigate the impact of socio-demographic, genetic, and environmental variables on patient outcomes.</li>
        <li>Provide a foundation for predictive modeling and decision support tools powered by AI.</li>
    </ul>
    </p>
    <p style="font-size:18px;">
    Ultimately, the insights derived from this analysis can help inform global health strategies, improve clinical practices, and guide future research in oncology.
    </p>
</div>
""", unsafe_allow_html=True)

    st.markdown("""<h1 style='color: white; text-align: center;'> Introduction </h1>""",unsafe_allow_html=True)
    st.markdown("""
<div style='text-align: center'>
    <p style="font-size:18px;">
    Welcome to this analytical overview of cancer—a disease that continues to challenge healthcare systems, researchers, and policymakers worldwide.
    </p>
    <p style="font-size:18px;">
    In this report, we begin with a foundational understanding of what cancer is at a biological level—how abnormal cell growth disrupts normal body functions and leads to life-threatening conditions if not detected and managed in time.
    </p>
    <p style="font-size:18px;">
    We then categorize cancer into its major types—including carcinomas, sarcomas, leukemias, lymphomas, and more—highlighting how each originates in different tissues and organs, with unique clinical and epidemiological profiles.
    </p>
    <p style="font-size:18px;">
    This analysis also covers the staging system, a critical framework used globally to classify cancer progression from Stage 0 to Stage IV. We’ll discuss how staging impacts prognosis, treatment planning, and survival outcomes.
    </p>
    <p style="font-size:18px;">
    Drawing on global datasets and simulated trends from 2015 to 2024, this video will help you contextualize how cancer types and stages have evolved across populations, and what patterns are emerging that could inform future medical research, resource allocation, and AI-driven prediction models.
    </p>
    <p style="font-size:18px;">
    Let’s dive into the data and insights shaping our understanding of cancer today.
    </p>
</div>
""", unsafe_allow_html=True)


    st.markdown("""<h3 style='color: white; text-align: center;'> What is Cancer? </h3>""",unsafe_allow_html=True)
    st.video("https://www.youtube.com/watch?v=LEpTTolebqo")

    st.markdown("""<h3 style='color: white; text-align: center;'> Explainer Video on Cancer Staging </h3>""",unsafe_allow_html=True)
    st.video("https://www.youtube.com/watch?v=yvHe3AiY3jg")
    st.markdown("""<h3 style='color: white; text-align: center;'> Explainer Video on Cancer Types </h3>""",unsafe_allow_html=True)
    st.video("https://www.youtube.com/watch?v=dEBi-yvSWmQ")


with tab_2:
    df = pd.read_csv("global_cancer_patients_2015_2024.csv")
    st.dataframe(df, height=800, width=1200)


with tab_3:
    st.markdown(
        """<h2 style='color: white; text-align: center;'> Univariate Analaysis </h2>""",
        unsafe_allow_html=True
    )

    df = pd.read_csv("global_cancer_patients_2015_2024.csv")
    df = df.drop("Patient_ID", axis=1)

    df["Year"] = pd.to_datetime(df["Year"], format='%Y')

    Start_Date = st.sidebar.date_input(
        "Start Date", value=df.Year.min(), min_value=df.Year.min(), max_value=df.Year.max()
    )
    End_Date = st.sidebar.date_input(
        "End Date", value=df.Year.min(), min_value=df.Year.min(), max_value=df.Year.max()
    )

    Aspect_Percentage = st.sidebar.radio(
        "Select the Aspect for getting the percentage of each Category",
        ['Gender', 'Country_Region', 'Year', 'Cancer_Type', 'Cancer_Stage'])
    st.markdown(
        """<h3 style='color: white; text-align: center;'> Selected Aspect Percentage from the Entire Database </h3>""",
        unsafe_allow_html=True)
    st.plotly_chart(px.pie(data_frame=df, names=Aspect_Percentage))
    st.markdown(
        """<h3 style='color: white; text-align: center;'> Which countries or regions have the highest number of reported cancer cases?</h3>""",
        unsafe_allow_html=True)
    st.plotly_chart(px.box(data_frame= df , x="Country_Region"))
    bins = [0, 30, 50, 70, float('inf')]
    labels = ['<30', '30-50', '50-70', '70+']
    df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, right=False)
    st.markdown("""<h3 style='color: white; text-align: center;'>What is the least repeated Age group from 2015 to 2024?</h3>""",unsafe_allow_html=True)
    st.plotly_chart(px.bar(data_frame= df , x="Age_Group"))



with tab_4:
    st.markdown(
        """<h2 style='color: white; text-align: center;'> Bivariate Analysis </h2>""",
        unsafe_allow_html=True
    )

    df_filtered = df[
        (df["Year"] >= pd.to_datetime(Start_Date)) &
        (df["Year"] <= pd.to_datetime(End_Date))
    ]

    st.markdown(
        """<h3 style='color: white; text-align: center;'> What is the distribution of Air pullation between the chosen Date Range </h3>""",
        unsafe_allow_html=True
    )

    st.markdown(
        """<h6 style='color: white; text-align: center;'> Make sure to Choose start and End Date to Analyze Air Pollution Through the Years </h6>""",
        unsafe_allow_html=True
    )

    st.plotly_chart(px.histogram(data_frame=df_filtered,x="Air_Pollution",labels={"Air_Pollution": "Air Pollution Distribution"}))

    st.markdown(
        """<h3 style='color: white; text-align: center;'> Is there a correlation between the all aspects? </h3>""",
        unsafe_allow_html=True
    )

    correlation = df.select_dtypes(include='number').corr()

    st.plotly_chart(px.imshow(correlation,height=800,width=1200,text_auto=True))


    st.markdown(
        """<h3 style='color: white; text-align: center;'> Among patients of the selected age, how does the average treatment cost vary across different cancer types? </h3>""",
        unsafe_allow_html=True
    )
    st.markdown(
        """<h5 style='color: white; text-align: center;'> Select an Age from the Age Slider </h5>""",
        unsafe_allow_html=True
    )
    Age_Value = st.sidebar.slider("Age Slider", min_value=int(df["Age"].min()), max_value=int(df["Age"].max()))

    filtered_df = df[df["Age"] == Age_Value]
    avg_cost_by_cancer_type = filtered_df.groupby("Cancer_Type")["Treatment_Cost_USD"].mean().reset_index()
    st.plotly_chart(px.bar(data_frame=avg_cost_by_cancer_type,x="Cancer_Type",y="Treatment_Cost_USD"))







with tab_5:
    st.markdown(
        """<h2 style='color: white; text-align: center;'> Multi Analysis </h2>""",
        unsafe_allow_html=True
    )

    st.markdown(
        """<h3 style='color: white; text-align: center;'> How does cancer severity vary with smoking and genetic risk? </h3>""",
        unsafe_allow_html=True
    )

    df['Genetic_Risk_Bin'] = pd.cut(
        df['Genetic_Risk'],
        bins=[0, 0.3, 0.7, 1.0],
        labels=['Low', 'Medium', 'High']
    )
    df = df.dropna(subset=['Genetic_Risk_Bin'])

    grouped = df.groupby(["Smoking", "Genetic_Risk_Bin"]).agg(
        Avg_Severity=("Target_Severity_Score", "mean"),
        Count=("Target_Severity_Score", "count")
    ).reset_index()

    fig = px.scatter(
        grouped,
        x="Smoking",
        y="Avg_Severity",
        color="Genetic_Risk_Bin",
        labels={"Avg_Severity": "Avg. Cancer Severity Score"},
    )

    st.plotly_chart(fig)

    st.markdown(
        """<h3 style='color: white; text-align: center;'> How does the treatment cost vary across different cancer types and stages? </h3>""",
        unsafe_allow_html=True
    )

    df_cost = df.groupby(['Cancer_Type', 'Cancer_Stage'])['Treatment_Cost_USD'].sum().reset_index()

    fig= px.bar(
        df_cost,
        x="Cancer_Type",
        y="Treatment_Cost_USD",
        color="Cancer_Stage",
        barmode="group",
        labels={"Cancer_Type": "Cancer Type"}
    )

    st.plotly_chart(fig)

    st.markdown(
        """<h3 style='color: white; text-align: center;'> How Cancer Stage, Cost, and Region Relate to Survival Years</h3>""",
        unsafe_allow_html=True
    )
    fig2 = px.histogram(df,x="Cancer_Stage",y="Survival_Years",color="Country_Region",barmode="group",title="Survival Years vs Treatment Cost (Colored by Cancer Stage)")
    st.plotly_chart(fig2)

Overwriting Cancer_data.py


In [32]:
!streamlit run Cancer_data.py --server.port 8502


[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8502[0m
[34m  Network URL: [0m[1mhttp://192.168.1.10:8502[0m
[0m
[34m[1m  For better performance, install the Watchdog module:[0m

  $ xcode-select --install
  $ pip install watchdog
            [0m


^C
[34m  Stopping...[0m
Exception ignored in: <module 'threading' from '/opt/anaconda3/envs/Machine_Learning/lib/python3.9/threading.py'>
Traceback (most recent call last):
  File "/opt/anaconda3/envs/Machine_Learning/lib/python3.9/threading.py", line 1447, in _shutdown
    atexit_call()
  File "/opt/anaconda3/envs/Machine_Learning/lib/python3.9/concurrent/futures/thread.py", line 31, in _python_exit
    t.join()
  File "/opt/anaconda3/envs/Machine_Learning/lib/python3.9/threading.py", line 1060, in join
    self._wait_for_tstate_lock()
  File "/opt/anaconda3/envs/Machine_Learning/lib/python3.9/threading.py", line 1080, in _wait_for_tstate_lock
    if lock.a

# Deployment

## Data Preprocessing

In [63]:
df.drop('Patient_ID', axis= 1, inplace= True)

In [64]:
# we Must recheck Dups after dropping columns 
df.duplicated().sum()

np.int64(0)

### Handling Outliers

In [65]:
Needed_cols = df.select_dtypes(include="number").drop("Year" , axis=  1)
from datasist.structdata import detect_outliers
outliers_indicies = detect_outliers (data=df , n=0 , features= Needed_cols )
df.iloc[outliers_indicies]

df.drop(outliers_indicies , axis= 0  , inplace = True)
df.reset_index(inplace= True , drop=True)

In [66]:
#Checking the Number of remaining rows after handeling the Ourliers making sure that we aren't losing alot of data 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49849 entries, 0 to 49848
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype   
---  ------                        --------------  -----   
 0   Age                           49849 non-null  int64   
 1   Gender                        49749 non-null  object  
 2   Country_Region                48856 non-null  object  
 3   Year                          49849 non-null  int64   
 4   Genetic_Risk                  49849 non-null  float64 
 5   Air_Pollution                 49849 non-null  float64 
 6   Alcohol_Use                   49849 non-null  float64 
 7   Smoking                       49849 non-null  float64 
 8   Obesity_Level                 48856 non-null  float64 
 9   Cancer_Type                   49749 non-null  object  
 10  Cancer_Stage                  49849 non-null  object  
 11  Treatment_Cost_USD            48856 non-null  float64 
 12  Survival_Years                49849 non-null  

### Split Data into Input Features and Target Variable

In [67]:
x = df.drop('Target_Severity_Score', axis= 1)
y = df['Target_Severity_Score']

### Split Data into Train and Test and train

In [68]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.2, random_state= 42)




In [69]:
x_train.isna().sum()

Age                               0
Gender                           75
Country_Region                  787
Year                              0
Genetic_Risk                      0
Air_Pollution                     0
Alcohol_Use                       0
Smoking                           0
Obesity_Level                   787
Cancer_Type                      78
Cancer_Stage                      0
Treatment_Cost_USD              787
Survival_Years                    0
Target_Severity_Score_Status      0
Age_Group                         0
Era                               0
dtype: int64

In [70]:
x_test.isna().sum()

Age                               0
Gender                           25
Country_Region                  206
Year                              0
Genetic_Risk                      0
Air_Pollution                     0
Alcohol_Use                       0
Smoking                           0
Obesity_Level                   206
Cancer_Type                      22
Cancer_Stage                      0
Treatment_Cost_USD              206
Survival_Years                    0
Target_Severity_Score_Status      0
Age_Group                         0
Era                               0
dtype: int64

In [71]:
#Droping NAN values  less than 5% from the dataset (droping the whole row)

x_train.dropna(subset= ["Cancer_Type", "Gender" ], inplace= True , ignore_index= True)
x_test.dropna(subset= ["Cancer_Type", "Gender" ], inplace= True , ignore_index= True)


In [72]:
x_train.duplicated().sum()

np.int64(0)

In [73]:
x_test.duplicated().sum()

np.int64(0)

## Numerical

### Impute Missing

In [74]:
x_train.isna().sum()

Age                               0
Gender                            0
Country_Region                  709
Year                              0
Genetic_Risk                      0
Air_Pollution                     0
Alcohol_Use                       0
Smoking                           0
Obesity_Level                   709
Cancer_Type                       0
Cancer_Stage                      0
Treatment_Cost_USD              709
Survival_Years                    0
Target_Severity_Score_Status      0
Age_Group                         0
Era                               0
dtype: int64

In [75]:
x_test.isna().sum()

Age                               0
Gender                            0
Country_Region                  183
Year                              0
Genetic_Risk                      0
Air_Pollution                     0
Alcohol_Use                       0
Smoking                           0
Obesity_Level                   183
Cancer_Type                       0
Cancer_Stage                      0
Treatment_Cost_USD              183
Survival_Years                    0
Target_Severity_Score_Status      0
Age_Group                         0
Era                               0
dtype: int64

In [76]:
num_cols = ["Treatment_Cost_USD" , "Obesity_Level"]


In [77]:
from sklearn.impute import KNNImputer

knn = KNNImputer()

x_train[num_cols] = knn.fit_transform(x_train[num_cols])

x_test[num_cols] = knn.transform(x_test[num_cols])


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul


divide by zero encountered in matmul


overflow encountered in matmul


invalid value encountered in matmul



In [78]:
x_train["Treatment_Cost_USD"].isna().sum()


np.int64(0)

In [79]:
x_test["Treatment_Cost_USD"].isna().sum()

np.int64(0)

In [80]:
px.histogram(data_frame= x_train, x= 'Obesity_Level')

In [None]:
# Scaling is a step should be done here however mt dataset doesn't need this step

## Categorical

### Impute Missing

In [81]:
cat_cols = ["Country_Region"]


In [82]:
Country_mode =  df.Country_Region.mode()[0]

In [83]:
x_train.Country_Region.fillna(Country_mode , inplace= True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [84]:
x_test.Country_Region.fillna(Country_mode , inplace= True)


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





In [85]:
x_train.Country_Region.isna().sum()

np.int64(0)

In [86]:
x_test.Country_Region.isna().sum()

np.int64(0)

In [87]:
x_train.isna().sum()

Age                             0
Gender                          0
Country_Region                  0
Year                            0
Genetic_Risk                    0
Air_Pollution                   0
Alcohol_Use                     0
Smoking                         0
Obesity_Level                   0
Cancer_Type                     0
Cancer_Stage                    0
Treatment_Cost_USD              0
Survival_Years                  0
Target_Severity_Score_Status    0
Age_Group                       0
Era                             0
dtype: int64

###  Encoding for Nominal Using one Hot Encoder

In [88]:
x_train.select_dtypes(include= 'object')

Unnamed: 0,Gender,Country_Region,Cancer_Type,Cancer_Stage,Target_Severity_Score_Status,Era
0,Male,India,Prostate,Stage IV,Not Achieved,2015-2018
1,Other,Canada,Skin,Stage II,Not Achieved,2015-2018
2,Female,Pakistan,Leukemia,Stage IV,Not Achieved,2015-2018
3,Male,USA,Liver,Stage 0,Achieved,2022-2024
4,Other,Brazil,Breast,Stage II,Not Achieved,2015-2018
...,...,...,...,...,...,...
39721,Other,Russia,Prostate,Stage III,Achieved,2015-2018
39722,Other,Canada,Colon,Stage II,Not Achieved,2022-2024
39723,Male,Brazil,Leukemia,Stage III,Not Achieved,2022-2024
39724,Male,Brazil,Leukemia,Stage II,Achieved,2022-2024


In [89]:
x_train.describe().info()

<class 'pandas.core.frame.DataFrame'>
Index: 8 entries, count to max
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Age                 8 non-null      float64
 1   Year                8 non-null      float64
 2   Genetic_Risk        8 non-null      float64
 3   Air_Pollution       8 non-null      float64
 4   Alcohol_Use         8 non-null      float64
 5   Smoking             8 non-null      float64
 6   Obesity_Level       8 non-null      float64
 7   Treatment_Cost_USD  8 non-null      float64
 8   Survival_Years      8 non-null      float64
dtypes: float64(9)
memory usage: 640.0+ bytes


In [90]:
for col in  x_train.select_dtypes(include= 'object').drop(["Country_Region" , "Cancer_Type" , "Cancer_Stage" , "Era"], axis=1).columns:
    
    print(col)
    print(x_train[col].nunique())

Gender
3
Target_Severity_Score_Status
2


In [91]:
ohe_cols = x_train.select_dtypes(include= 'object').drop(["Country_Region" ,"Cancer_Type"  ,"Cancer_Stage" ,"Era" , ] , axis = 1).columns
ohe_cols

Index(['Gender', 'Target_Severity_Score_Status'], dtype='object')

In [92]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse_output= False, drop= 'first')

ohe_train = ohe.fit_transform(x_train[ohe_cols])

ohe_test = ohe.transform(x_test[ohe_cols])

In [93]:
ohe.get_feature_names_out()

array(['Gender_Male', 'Gender_Other',
       'Target_Severity_Score_Status_Not Achieved'], dtype=object)

In [94]:
ohe_train_df = pd.DataFrame(ohe_train, columns= ohe.get_feature_names_out())

ohe_test_df = pd.DataFrame(ohe_test, columns= ohe.get_feature_names_out())

In [95]:
x_train.reset_index(inplace= True, drop= True)

x_test.reset_index(inplace= True, drop= True)

y_train.reset_index(inplace= True, drop= True)

y_test.reset_index(inplace= True, drop= True)

In [96]:
x_train = pd.concat([x_train, ohe_train_df], axis= 1)

x_test = pd.concat([x_test, ohe_test_df], axis= 1)

In [97]:
x_train

Unnamed: 0,Age,Gender,Country_Region,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Type,Cancer_Stage,Treatment_Cost_USD,Survival_Years,Target_Severity_Score_Status,Age_Group,Era,Gender_Male,Gender_Other,Target_Severity_Score_Status_Not Achieved
0,56,Male,India,2015,3.1,6.2,8.5,0.2,6.3,Prostate,Stage IV,56850.35,3.3,Not Achieved,50-70,2015-2018,1.0,0.0,1.0
1,38,Other,Canada,2017,6.5,3.3,4.9,1.8,5.5,Skin,Stage II,37692.90,1.3,Not Achieved,30-50,2015-2018,0.0,1.0,1.0
2,44,Female,Pakistan,2015,9.1,5.0,1.3,4.9,1.7,Leukemia,Stage IV,56357.84,0.7,Not Achieved,30-50,2015-2018,0.0,0.0,1.0
3,36,Male,USA,2024,9.2,2.4,7.7,4.2,7.5,Liver,Stage 0,55547.96,2.7,Achieved,30-50,2022-2024,1.0,0.0,0.0
4,80,Other,Brazil,2015,1.5,3.1,6.0,7.1,3.7,Breast,Stage II,90084.26,1.0,Not Achieved,70+,2015-2018,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39721,32,Other,Russia,2015,0.9,8.3,9.9,6.8,9.6,Prostate,Stage III,43578.01,9.1,Achieved,30-50,2015-2018,0.0,1.0,0.0
39722,64,Other,Canada,2022,4.8,3.6,1.3,3.0,8.4,Colon,Stage II,14273.90,0.8,Not Achieved,50-70,2022-2024,0.0,1.0,1.0
39723,66,Male,Brazil,2024,2.5,2.1,7.3,1.6,9.0,Leukemia,Stage III,91880.40,8.2,Not Achieved,50-70,2022-2024,1.0,0.0,1.0
39724,21,Male,Brazil,2022,3.8,1.4,4.2,7.6,8.2,Leukemia,Stage II,34971.61,3.1,Achieved,<30,2022-2024,1.0,0.0,0.0


###  Encoding for Nominal Using Binary Encoder

In [98]:
BE_cols = x_train.select_dtypes(include= 'object').drop(["Gender" ,"Target_Severity_Score_Status"  , "Cancer_Stage"] , axis = 1).columns
BE_cols

Index(['Country_Region', 'Cancer_Type', 'Era'], dtype='object')

In [99]:
from category_encoders import BinaryEncoder

be = BinaryEncoder()

be_train_df = be.fit_transform(x_train[BE_cols ])

be_test_df = be.transform(x_test[BE_cols ])

In [100]:
x_train = pd.concat([x_train, be_train_df], axis= 1).drop(BE_cols , axis = 1)

x_test = pd.concat([x_test, be_test_df], axis= 1).drop(BE_cols , axis = 1)

In [101]:
x_train

Unnamed: 0,Age,Gender,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Stage,Treatment_Cost_USD,...,Country_Region_0,Country_Region_1,Country_Region_2,Country_Region_3,Cancer_Type_0,Cancer_Type_1,Cancer_Type_2,Cancer_Type_3,Era_0,Era_1
0,56,Male,2015,3.1,6.2,8.5,0.2,6.3,Stage IV,56850.35,...,0,0,0,1,0,0,0,1,0,1
1,38,Other,2017,6.5,3.3,4.9,1.8,5.5,Stage II,37692.90,...,0,0,1,0,0,0,1,0,0,1
2,44,Female,2015,9.1,5.0,1.3,4.9,1.7,Stage IV,56357.84,...,0,0,1,1,0,0,1,1,0,1
3,36,Male,2024,9.2,2.4,7.7,4.2,7.5,Stage 0,55547.96,...,0,1,0,0,0,1,0,0,1,0
4,80,Other,2015,1.5,3.1,6.0,7.1,3.7,Stage II,90084.26,...,0,1,0,1,0,1,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39721,32,Other,2015,0.9,8.3,9.9,6.8,9.6,Stage III,43578.01,...,1,0,1,0,0,0,0,1,0,1
39722,64,Other,2022,4.8,3.6,1.3,3.0,8.4,Stage II,14273.90,...,0,0,1,0,0,1,1,0,1,0
39723,66,Male,2024,2.5,2.1,7.3,1.6,9.0,Stage III,91880.40,...,0,1,0,1,0,0,1,1,1,0
39724,21,Male,2022,3.8,1.4,4.2,7.6,8.2,Stage II,34971.61,...,0,1,0,1,0,0,1,1,1,0


###  Encoding for Ordinal Using Ordinal Encoding

In [102]:
x_train["Cancer_Stage"]

0         Stage IV
1         Stage II
2         Stage IV
3          Stage 0
4         Stage II
           ...    
39721    Stage III
39722     Stage II
39723    Stage III
39724     Stage II
39725    Stage III
Name: Cancer_Stage, Length: 39726, dtype: object

In [103]:
from sklearn.preprocessing import OrdinalEncoder

ord = OrdinalEncoder(categories=[["Stage 0", "Stage I", "Stage II", "Stage III", "Stage IV"]])

x_train["Cancer_Stage"] = x_train["Cancer_Stage"].astype(str).str.strip()
x_test["Cancer_Stage"] = x_test["Cancer_Stage"].astype(str).str.strip()

x_train["Cancer_Stage"] = ord.fit_transform(x_train[["Cancer_Stage"]])
x_test["Cancer_Stage"] = ord.transform(x_test[["Cancer_Stage"]])


In [104]:
x_train["Cancer_Stage"]

0        4.0
1        2.0
2        4.0
3        0.0
4        2.0
        ... 
39721    3.0
39722    2.0
39723    3.0
39724    2.0
39725    3.0
Name: Cancer_Stage, Length: 39726, dtype: float64

In [105]:
x_train.head()

Unnamed: 0,Age,Gender,Year,Genetic_Risk,Air_Pollution,Alcohol_Use,Smoking,Obesity_Level,Cancer_Stage,Treatment_Cost_USD,...,Country_Region_0,Country_Region_1,Country_Region_2,Country_Region_3,Cancer_Type_0,Cancer_Type_1,Cancer_Type_2,Cancer_Type_3,Era_0,Era_1
0,56,Male,2015,3.1,6.2,8.5,0.2,6.3,4.0,56850.35,...,0,0,0,1,0,0,0,1,0,1
1,38,Other,2017,6.5,3.3,4.9,1.8,5.5,2.0,37692.9,...,0,0,1,0,0,0,1,0,0,1
2,44,Female,2015,9.1,5.0,1.3,4.9,1.7,4.0,56357.84,...,0,0,1,1,0,0,1,1,0,1
3,36,Male,2024,9.2,2.4,7.7,4.2,7.5,0.0,55547.96,...,0,1,0,0,0,1,0,0,1,0
4,80,Other,2015,1.5,3.1,6.0,7.1,3.7,2.0,90084.26,...,0,1,0,1,0,1,0,1,0,1


In [106]:
y_train

0        4.36
1        4.69
2        4.79
3        5.83
4        3.65
         ... 
39874    6.36
39875    4.85
39876    3.29
39877    5.24
39878    3.78
Name: Target_Severity_Score, Length: 39879, dtype: float64