In [1]:
import pandas as pd
import numpy as np

In [2]:
raw_df = pd.read_csv('../../data/raw/dataset.csv')
print('shape : ',raw_df.shape)
print('Columns : ',raw_df.columns)

shape :  (40000, 13)
Columns :  Index(['age_c', 'assess_c', 'cancer_c', 'compfilm_c', 'density_c', 'famhx_c',
       'hrt_c', 'prvmam_c', 'biophx_c', 'mammtype', 'CaTypeO', 'bmi_c',
       'ptid'],
      dtype='object')


In [3]:
#droping patient id
raw_df=raw_df.drop('ptid',axis=1)
raw_df.shape

(40000, 12)

In [4]:
#Replacing splcodes with NaN
raw_df.replace({
    'compfilm_c': {9:np.nan},
    'famhx_c': {9:np.nan},
    'hrt_c': {9:np.nan}, 
    'prvmam_c': {9:np.nan}, 
    'biophx_c': {9:np.nan},
    'bmi_c' : {-99.:np.nan},
},inplace = True)

print('Uniques values of each col ')
for column in raw_df.columns:
    unique_values = raw_df[column].unique()
    print(f"{column} : {unique_values}")

print('\ncounts of missing values')
print(raw_df.isna().sum())

Uniques values of each col 
age_c : [62 65 69 64 63 75 66 78 70 67 74 61 71 77 60 73 81 79 72 80 68 83 85 82
 84 86 76 87 89 88]
assess_c : [1 0 2 3 4 5]
cancer_c : [0 1]
compfilm_c : [ 1.  0. nan]
density_c : [2 4 3 1]
famhx_c : [ 0.  1. nan]
hrt_c : [ 0. nan  1.]
prvmam_c : [ 1.  0. nan]
biophx_c : [ 0.  1. nan]
mammtype : [1 2]
CaTypeO : [8 2 1]
bmi_c : [24.0235443        nan 29.0524292 ... 29.8625793 19.6293335 35.680542 ]

counts of missing values
age_c             0
assess_c          0
cancer_c          0
compfilm_c     4680
density_c         0
famhx_c         228
hrt_c          1772
prvmam_c        578
biophx_c        815
mammtype          0
CaTypeO           0
bmi_c         23209
dtype: int64


In [5]:
'''50% of the bmi's are missing....removing the bmi_c ''' 
raw_df = raw_df.drop('bmi_c', axis =1)
print(raw_df.shape)

(40000, 11)


In [6]:
#missing values handling....maximum likelihood estimation

# Function to initialize missing values
def initialize_missing_values(raw_df):
    return raw_df.apply(lambda x: x.fillna(x.mean()), axis=0)

# E-step: Expectation of missing values
def expectation(raw_df, means, variances):
    for col in raw_df.columns:
        mask = raw_df[col].isna()
        raw_df.loc[mask, col] = means[col]
    return raw_df

# M-step: Maximization step to update parameters
def maximization(raw_df):
    means = raw_df.mean()
    variances = raw_df.var()
    return means, variances

# EM algorithm for MLE-based imputation
def em_algorithm(raw_df, max_iter=100, tol=1e-6):
    df_filled = initialize_missing_values(raw_df.copy())
    prev_means, prev_variances = maximization(df_filled)
    
    for i in range(max_iter):
        df_filled = expectation(df_filled, prev_means, prev_variances)
        means, variances = maximization(df_filled)
        
        if np.all(np.abs(prev_means - means) < tol) and np.all(np.abs(prev_variances - variances) < tol):
            break
        
        prev_means, prev_variances = means, variances
    
    return df_filled, means, variances

# Run the EM algorithm
imputed_df, final_means, final_variances = em_algorithm(raw_df)

print("Imputed Data:")
imputed_df.to_csv('../../data/interim/dataset.csv')
print(imputed_df.head())
print("\nFinal Means:")
print(final_means)
print("\nFinal Variances:")
print(final_variances)

  raw_df.loc[mask, col] = means[col]
  raw_df.loc[mask, col] = means[col]
  raw_df.loc[mask, col] = means[col]
  raw_df.loc[mask, col] = means[col]
  raw_df.loc[mask, col] = means[col]
  raw_df.loc[mask, col] = means[col]


Imputed Data:
   age_c  assess_c  cancer_c  compfilm_c  density_c  famhx_c  hrt_c  prvmam_c  \
0   62.0       1.0       0.0         1.0        2.0      0.0    0.0       1.0   
1   65.0       1.0       0.0         1.0        4.0      0.0    0.0       1.0   
2   69.0       0.0       0.0         1.0        2.0      0.0    0.0       1.0   
3   64.0       2.0       0.0         1.0        2.0      0.0    0.0       1.0   
4   63.0       3.0       0.0         1.0        2.0      0.0    0.0       1.0   

   biophx_c  mammtype  CaTypeO  
0       0.0       1.0      8.0  
1       0.0       1.0      8.0  
2       0.0       1.0      8.0  
3       0.0       1.0      8.0  
4       1.0       1.0      8.0  

Final Means:
age_c         69.555850
assess_c       1.203450
cancer_c       0.006475
compfilm_c     0.963137
density_c      2.229725
famhx_c        0.169541
hrt_c          0.111149
prvmam_c       0.992492
biophx_c       0.266735
mammtype       1.500000
CaTypeO        7.959750
dtype: float64

Final V