In [None]:
import pandas as pd
import numpy as np 

from sklearn.impute import SimpleImputer

def preprocess(inpath: str) -> pd.DataFrame:
    """Build one row per MLO image, paired with a CC image of the same breast.

    Reads ``<inpath>/train.csv``, drops the columns listed below, fills
    missing values with each column's most frequent value, keeps only CC and
    MLO views, and attaches to every MLO row the path of a CC image that
    shares its ``breast_id`` (``<patient_id>_<laterality>``).

    Args:
        inpath: Directory containing ``train.csv`` and ``train_images/``.
            Anything whose ``str()`` is that directory path is accepted.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns ``breast_id``,
        ``laterality`` (indicator for ``'R'``), ``age``, ``implant``,
        ``site_id``, ``machine_id``, ``path_CC``, ``path_MLO`` and ``cancer``
        (float). MLO rows whose breast has no CC image are omitted.

    Raises:
        KeyError: If the CSV lacks one of the columns this function drops
            or selects.
    """
    train = pd.read_csv(f"{inpath}/train.csv")
    image_root = f"{inpath}/train_images/"

    # Columns that only occur in train.
    train = train.drop(
        columns=['density', 'biopsy', 'invasive', 'BIRADS', 'difficult_negative_case']
    )

    # Path of the DICOM file behind each row.
    train['path'] = (
        image_root
        + train['patient_id'].astype(str)
        + "/"
        + train['image_id'].astype(str)
        + ".dcm"
    )

    # Most-frequent-value imputation done column by column in pandas, so the
    # numeric columns keep their dtypes instead of being coerced to object by
    # a round trip through a NumPy object array. Series.mode() is sorted, so
    # ties resolve to the smallest value.
    for column in [c for c in train.columns if train[c].isna().any()]:
        modes = train[column].mode(dropna=True)
        if not modes.empty:  # an all-NaN column has nothing to impute with
            train[column] = train[column].fillna(modes.iloc[0])

    # Keep only the two standard views; copy so the column assignments below
    # act on an independent frame rather than on a slice.
    train = train[train['view'].isin(['CC', 'MLO'])].copy()

    train['breast_id'] = (
        train['patient_id'].astype(str) + "_" + train['laterality'].astype(str)
    )
    # Each row carries its own path in the column matching its view and NaN
    # in the other one.
    train['path_CC'] = train['path'].where(train['view'] == 'CC')
    train['path_MLO'] = train['path'].where(train['view'] == 'MLO')

    # One-hot encode laterality, keeping only the 'R' indicator. reindex()
    # makes this work even when one of the two sides is absent from the data.
    train['laterality'] = (
        pd.get_dummies(train['laterality']).reindex(columns=['R'], fill_value=0)['R']
    )

    # Give every row the CC path of its breast: the next CC row in file order
    # when there is one, otherwise the previous one.
    train['path_CC'] = train.groupby('breast_id')['path_CC'].bfill()
    train['path_CC'] = train.groupby('breast_id')['path_CC'].ffill()

    # CC rows still have NaN in path_MLO (their path now lives on the matching
    # MLO rows), and MLO rows of a breast without any CC image still have NaN
    # in path_CC; both kinds of row are dropped here.
    train = train.dropna()

    # Remove the columns now covered by breast_id / path_CC / path_MLO.
    train = train.drop(columns=['patient_id', 'image_id', 'view', 'path'])
    train = train.reset_index(drop=True)

    ordered_columns = [
        'breast_id', 'laterality', 'age', 'implant', 'site_id',
        'machine_id', 'path_CC', 'path_MLO', 'cancer',
    ]
    # astype() returns a new frame, so the label conversion never writes into
    # a slice of ``train``.
    return train[ordered_columns].astype({'cancer': float})

