In [None]:
import pandas as pd

In [None]:
# 1. Load the risk mapping from the text file
risk_map = {}
with open(r"C:\Frank\UoT 2024-2025\MIE8888 Project\M13\risk_levels_RUIWU.txt", 'r') as f:
    for line in f:
        line = line.strip()
        # Skip headers and blank or separator lines
        if not line or line.startswith("procedure_code") or line.startswith("#####"):
            continue
        parts = line.split(',')
        if len(parts) == 2:
            code = parts[0].strip()
            try:
                risk = float(parts[1].strip())
            except ValueError:
                risk = 0.0
            risk_map[code] = risk

In [7]:
risk_map

{'11101': -3.0,
 '11107': -2.0,
 '11111': -1.0,
 '11112': -3.0,
 '11113': -2.0,
 '11114': -1.0,
 '11115': -3.0,
 '11116': -2.0,
 '12111': 0.0,
 '16511': -2.0,
 '16401': -1.0,
 '16202': -1.0,
 '16201': -1.0,
 '16102': 0.0,
 '16101': 0.0,
 '33141': 6.0,
 '33135': 6.0,
 '33134': 5.0,
 '33133': 6.0,
 '33131': 6.0,
 '33125': 6.0,
 '33124': 6.0,
 '33122': 5.0,
 '33121': 4.0,
 '33115': 6.0,
 '33114': 6.0,
 '33113': 5.0,
 '33112': 6.0,
 '33111': 4.0,
 '33105': 6.0,
 '33104': 6.0,
 '33102': 6.0,
 '33101': 4.0}

In [37]:
# 2. Load the full patient data from CSV
# df = pd.read_csv(r"C:\Frank\UoT 2024-2025\MIE8888 Project\Data\Complete Data\21_treatements_backup_joined_patients_procedure_na_removed.csv")
df = pd.read_csv(r"C:\Frank\UoT 2024-2025\MIE8888 Project\Data\Complete Data\Mytestdataset.csv")
df['procedure_year'] = pd.to_datetime(df['procedure_date']).dt.year
df['age_at_filling'] = df['procedure_year'] - pd.to_datetime(df['birth_date']).dt.year

In [39]:
# 3. Map each treatment's procedure code to its risk value.
df['risk'] = df['procedure_code_x'].astype(str).map(risk_map)
# fill missing risk values with 0 (if any procedure code is not found in risk_map).
df['risk'] = df['risk'].fillna(0)

In [40]:
df.head()

Unnamed: 0,patient_id,birth_date,first_visit,last_visit,procedure_code_x,procedure_date,procedure_year,age_at_filling,risk
0,1,1955-06-10 00:00:00+00:00,1991-04-30T00:00:00,2013-04-30T00:00:00,449,2004-04-19 00:00:00+00:00,2004,49.0,0.0
1,1,1955-06-10 00:00:00+00:00,1991-04-30T00:00:00,2013-04-30T00:00:00,713,2004-04-19 00:00:00+00:00,2004,49.0,0.0
2,1,1955-06-10 00:00:00+00:00,1991-04-30T00:00:00,2013-04-30T00:00:00,786,2004-04-19 00:00:00+00:00,2004,49.0,0.0
3,1,1955-06-10 00:00:00+00:00,1991-04-30T00:00:00,2013-04-30T00:00:00,1357,2004-04-19 00:00:00+00:00,2004,49.0,0.0
4,1,1955-06-10 00:00:00+00:00,1991-04-30T00:00:00,2013-04-30T00:00:00,5212,2007-07-11 00:00:00+00:00,2007,52.0,0.0


In [48]:
# Get a txt doc of procedure_code with 0 risk level
zero_risk_procedure_codes = []
for i, r in enumerate(df['risk']):
    if r == 0:
        zero_risk_procedure_codes.append(df['procedure_code_x'][i])

with open("zero_risk_procedure_codes.txt", "w") as file:
    for code in zero_risk_procedure_codes:
        file.write(str(code) + "\n")

In [49]:
# Get a txt doc of procedure_code with 0 risk level
non_zero_risk_procedure_codes = []
for i, r in enumerate(df['risk']):
    if r != 0:
        non_zero_risk_procedure_codes.append(df['procedure_code_x'][i])

with open("non_zero_risk_procedure_codes.txt", "w") as file:
    for code in non_zero_risk_procedure_codes:
        file.write(str(code) + "\n")

In [43]:
# 4. Define a function to compute the final normalized risk for a single patient.
def compute_patient_risk(patient_df):
    # Determine the treatment timeline for the patient
    min_year = patient_df['procedure_year'].min()
    max_year = patient_df['procedure_year'].max()
    normalized_risks = []
    
    # Slide the window year by year from the first to the last treatment year.
    for current_year in range(min_year, max_year + 1):
        start_window = current_year - 5
        end_window = current_year + 2
        # Filter treatments that fall within the current window.
        window_df = patient_df[(patient_df['procedure_year'] >= start_window) & (patient_df['procedure_year'] <= end_window)]
        risk_sum = window_df['risk'].sum()
        # Normalize the risk sum to the 6-point scale.
        normalized_risk = risk_sum / 6
        normalized_risks.append(normalized_risk)
    
    # The final risk is the average of normalized risks from each window.
    if normalized_risks:
        final_risk = sum(normalized_risks) / len(normalized_risks)
    else:
        final_risk = 0
    return final_risk

In [44]:
patient_risks = df.groupby('patient_id').apply(compute_patient_risk)
patient_risk_df = patient_risks.reset_index()
patient_risk_df.columns = ['patient_id', 'final_risk']
print(patient_risk_df.head())

   patient_id  final_risk
0           1         0.0
1           2         0.0
2           3         0.0
3           4         0.0
4           5         0.0


  patient_risks = df.groupby('patient_id').apply(compute_patient_risk)


In [45]:
patient_risk_df

Unnamed: 0,patient_id,final_risk
0,1,0.0
1,2,0.0
2,3,0.0
3,4,0.0
4,5,0.0
...,...,...
10851,13580,0.0
10852,13583,0.0
10853,13584,0.0
10854,13586,0.0
