In [33]:
# portfolio Project
import boto3
import pandas as pd
import io

In [34]:
# Location of the dataset: S3 bucket and object key
bucket_name = 'bihani-portfolio-project-1'
file_key = 'oral_cancer_prediction_dataset.csv'

# S3 client used to download the object
s3 = boto3.client('s3')

# Download the object and decode its payload into text
csv_obj = s3.get_object(Key=file_key, Bucket=bucket_name)
body = csv_obj['Body']
csv_string = str(body.read(), 'utf-8')

# Parse the CSV text into a pandas DataFrame via an in-memory text buffer
with io.StringIO(csv_string) as csv_buffer:
    df = pd.read_csv(csv_buffer)

# Print the first five rows as a quick sanity check
print(df.head(5))

   ID       Country  Age  Gender Tobacco Use Alcohol Consumption  \
0   1         Italy   36  Female         Yes                 Yes   
1   2         Japan   64    Male         Yes                 Yes   
2   3            UK   37  Female          No                 Yes   
3   4     Sri Lanka   55    Male         Yes                 Yes   
4   5  South Africa   68    Male          No                  No   

  HPV Infection Betel Quid Use Chronic Sun Exposure Poor Oral Hygiene  \
0           Yes             No                   No               Yes   
1           Yes             No                  Yes               Yes   
2            No             No                  Yes               Yes   
3            No            Yes                   No               Yes   
4            No             No                   No               Yes   

  Diet (Fruits & Vegetables Intake) Family History of Cancer  \
0                               Low                       No   
1                       

Perform EDA

In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,ID,Age,Tumor Size (cm),Cancer Stage,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year)
count,84922.0,84922.0,84922.0,84922.0,84922.0,84922.0,84922.0
mean,42461.5,54.509444,1.747294,1.118756,79.503364,39109.881244,52.028391
std,24515.014117,10.014839,2.028199,1.340418,26.483746,44710.687054,60.389796
min,1.0,15.0,0.0,0.0,10.00075,0.0,0.0
25%,21231.25,48.0,0.0,0.0,65.233425,0.0,0.0
50%,42461.5,55.0,0.0,0.0,100.0,0.0,0.0
75%,63691.75,61.0,3.480075,2.0,100.0,76468.4375,104.0
max,84922.0,101.0,5.999899,4.0,100.0,159988.0,179.0


In [36]:
# Dimensions of the DataFrame as (rows, columns).
df.shape

(84922, 25)

In [37]:
# Data type pandas assigned to each column when the CSV was parsed.
df.dtypes

ID                                            int64
Country                                      object
Age                                           int64
Gender                                       object
Tobacco Use                                  object
Alcohol Consumption                          object
HPV Infection                                object
Betel Quid Use                               object
Chronic Sun Exposure                         object
Poor Oral Hygiene                            object
Diet (Fruits & Vegetables Intake)            object
Family History of Cancer                     object
Compromised Immune System                    object
Oral Lesions                                 object
Unexplained Bleeding                         object
Difficulty Swallowing                        object
White or Red Patches in Mouth                object
Tumor Size (cm)                             float64
Cancer Stage                                  int64
Treatment Ty

In [38]:
# Number of missing (null) values in each column.
print(df.isnull().sum())

ID                                          0
Country                                     0
Age                                         0
Gender                                      0
Tobacco Use                                 0
Alcohol Consumption                         0
HPV Infection                               0
Betel Quid Use                              0
Chronic Sun Exposure                        0
Poor Oral Hygiene                           0
Diet (Fruits & Vegetables Intake)           0
Family History of Cancer                    0
Compromised Immune System                   0
Oral Lesions                                0
Unexplained Bleeding                        0
Difficulty Swallowing                       0
White or Red Patches in Mouth               0
Tumor Size (cm)                             0
Cancer Stage                                0
Treatment Type                              0
Survival Rate (5-Year, %)                   0
Cost of Treatment (USD)           

In [39]:
# Column labels of the DataFrame; the encoding cells refer to columns by these exact names.
df.columns

Index(['ID', 'Country', 'Age', 'Gender', 'Tobacco Use', 'Alcohol Consumption',
       'HPV Infection', 'Betel Quid Use', 'Chronic Sun Exposure',
       'Poor Oral Hygiene', 'Diet (Fruits & Vegetables Intake)',
       'Family History of Cancer', 'Compromised Immune System', 'Oral Lesions',
       'Unexplained Bleeding', 'Difficulty Swallowing',
       'White or Red Patches in Mouth', 'Tumor Size (cm)', 'Cancer Stage',
       'Treatment Type', 'Survival Rate (5-Year, %)',
       'Cost of Treatment (USD)', 'Economic Burden (Lost Workdays per Year)',
       'Early Diagnosis', 'Oral Cancer (Diagnosis)'],
      dtype='object')

In [40]:
# Convert binary categorical (Yes/No) variables to numeric 1/0.
binary_cols = ['Tobacco Use', 'Alcohol Consumption', 'HPV Infection', 'Betel Quid Use',
               'Chronic Sun Exposure', 'Poor Oral Hygiene', 'Family History of Cancer',
               'Compromised Immune System', 'Oral Lesions', 'Unexplained Bleeding',
               'Difficulty Swallowing', 'White or Red Patches in Mouth', 'Early Diagnosis',
               'Oral Cancer (Diagnosis)']
yes_no_codes = {'Yes': 1, 'No': 0}
for col in binary_cols:
    # Skip columns that are already numeric: Series.map would otherwise turn the
    # 0/1 codes (which are not keys of the mapping) into NaN when this cell is
    # run a second time.
    if pd.api.types.is_numeric_dtype(df[col]):
        continue
    encoded = df[col].map(yes_no_codes)
    # Series.map yields NaN for any label missing from the mapping; report such
    # labels instead of silently losing the data. Values that were already
    # missing before the mapping are left untouched.
    unexpected = df.loc[df[col].notna() & encoded.isna(), col].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in {col!r}: {list(unexpected)}")
    df[col] = encoded

In [46]:
# Show the distinct values currently stored in the diet column.
# NOTE(review): the saved output of this cell shows numeric codes, and its
# execution count is higher than the ordinal-encoding cell placed after it,
# so it was last run out of order. A top-to-bottom run will print whatever
# the column holds at this point instead.
print("\nDiet categories:")
diet_categories = df['Diet (Fruits & Vegetables Intake)'].unique()
print(diet_categories)


Diet categories:
[1 3 2]


In [41]:
# Ordinal encoding for Diet: Low < Moderate < High becomes 1 < 2 < 3.
diet_col = 'Diet (Fruits & Vegetables Intake)'
diet_mapping = {'Low': 1, 'Moderate': 2, 'High': 3}
# Only encode while the column still holds text labels. Once it is numeric,
# Series.map would turn every code into NaN (1/2/3 are not keys of the
# mapping), so re-running this cell must be a no-op.
if not pd.api.types.is_numeric_dtype(df[diet_col]):
    diet_codes = df[diet_col].map(diet_mapping)
    # Labels missing from the mapping come back as NaN; report them instead of
    # silently losing the data. Pre-existing missing values are left untouched.
    unexpected = df.loc[df[diet_col].notna() & diet_codes.isna(), diet_col].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected values in {diet_col!r}: {list(unexpected)}")
    df[diet_col] = diet_codes

In [42]:
# Store the cancer stage as a pandas categorical instead of a plain integer
stage_col = 'Cancer Stage'
stage_as_category = df[stage_col].astype('category')
df[stage_col] = stage_as_category

In [43]:
# Force the measurement columns to numeric dtypes, one column at a time.
# errors='coerce' replaces any value that cannot be parsed with NaN.
numeric_cols = [
    'Age',
    'Tumor Size (cm)',
    'Survival Rate (5-Year, %)',
    'Cost of Treatment (USD)',
    'Economic Burden (Lost Workdays per Year)',
]
for numeric_col in numeric_cols:
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

In [44]:
# Drop ID column (no analytical use).
# errors='ignore' keeps this cell re-runnable: once 'ID' has been removed, a
# second run is a no-op instead of raising KeyError. Rebinding `df` (rather
# than inplace=True) follows the pandas recommendation to avoid in-place drops.
df = df.drop(columns=['ID'], errors='ignore')

In [45]:
# Preview the first five rows of df at this point in the notebook.
df.head()

Unnamed: 0,Country,Age,Gender,Tobacco Use,Alcohol Consumption,HPV Infection,Betel Quid Use,Chronic Sun Exposure,Poor Oral Hygiene,Diet (Fruits & Vegetables Intake),Family History of Cancer,Compromised Immune System,Oral Lesions,Unexplained Bleeding,Difficulty Swallowing,White or Red Patches in Mouth,Tumor Size (cm),Cancer Stage,Treatment Type,"Survival Rate (5-Year, %)",Cost of Treatment (USD),Economic Burden (Lost Workdays per Year),Early Diagnosis,Oral Cancer (Diagnosis)
0,Italy,36,Female,1,1,1,0,0,1,1,0,0,0,0,0,0,0.0,0,No Treatment,100.0,0.0,0,0,0
1,Japan,64,Male,1,1,1,0,1,1,3,0,0,0,1,0,0,1.782186,1,No Treatment,83.340103,77772.5,177,0,1
2,UK,37,Female,0,1,0,0,1,1,2,0,0,0,0,0,1,3.523895,2,Surgery,63.222871,101164.5,130,1,1
3,Sri Lanka,55,Male,1,1,0,1,0,1,2,0,0,1,0,0,0,0.0,0,No Treatment,100.0,0.0,0,1,0
4,South Africa,68,Male,0,0,0,0,0,1,3,0,0,0,0,0,0,2.834789,3,No Treatment,44.293199,45354.75,52,0,1


In [47]:
# Show the distinct treatment labels present in the dataset
print("Treatment Type categories:")
treatment_categories = df['Treatment Type'].unique()
print(treatment_categories)

Treatment Type categories:
['No Treatment' 'Surgery' 'Radiation' 'Targeted Therapy' 'Chemotherapy']
