### Data Preprocessing and Cleaning for Generative AI

In [1]:
import pandas as pd
import numpy as np 

In [2]:
def load_sample_data():
    data = {
        'image_id': ['img1', 'img2', 'img3', 'img4', 'img5'],
        'category': ['cat', 'dog', np.nan, 'cat', 'dog'],
        'size_kb': [1200, 850, 900, np.nan, 1150],
        'resolution': ['1024x768', '800x600', '1024x768', '640x480', np.nan],
        'label': ['outdoor', 'indoor', 'indoor', np.nan, 'outdoor']
    }

    df = pd.DataFrame(data)
    print(df)
    return df

In [3]:
def handle_missing_data(df):
    df['category'] = df['category'].fillna('unknown')
    df['label'] = df['label'].fillna('unknown')

    median_size = df['size_kb'].median()
    df['size_kb'] = df['size_kb'].fillna(median_size)

    mode_res = df['resolution'].mode()[0]
    df['resolution'] = df['resolution'].fillna(mode_res)

    return df

In [None]:
def normalize_numerical(df, cols):
    for col in cols:
        min_val = df[col].min()
        max_val = df[col].max()
        df[col] = (df[col] - min_val) / (max_val - min_val)
    return df

In [9]:
from sklearn.preprocessing import MinMaxScaler

def normalize_numerical_sklearn(df, cols):
    scaler = MinMaxScaler()
    df_normalized = df.copy()
    for col in cols:
        df_normalized[col] = scaler.fit_transform(df[[col]]).flatten()
    return df_normalized

In [5]:
def encode_categorical(df, cols):
    df_encoded = pd.get_dummies(df, columns=cols, drop_first=True)
    return df_encoded

In [10]:
def main():
    df = load_sample_data()
    print("Original Data:")
    print(df,"\n")

    df = handle_missing_data(df)
    print("After Handling Missing Data:")
    print(df,"\n")

    df = normalize_numerical_sklearn(df, cols=['size_kb'])
    print("After Normalizing Numerical Features:")
    print(df,"\n")

    df_encoded = encode_categorical(df, cols=['category','resolution','label'])
    print("After Encoding Categorical Variables:")
    print(df_encoded)

In [11]:
main()

  image_id category  size_kb resolution    label
0     img1      cat   1200.0   1024x768  outdoor
1     img2      dog    850.0    800x600   indoor
2     img3      NaN    900.0   1024x768   indoor
3     img4      cat      NaN    640x480      NaN
4     img5      dog   1150.0        NaN  outdoor
Original Data:
  image_id category  size_kb resolution    label
0     img1      cat   1200.0   1024x768  outdoor
1     img2      dog    850.0    800x600   indoor
2     img3      NaN    900.0   1024x768   indoor
3     img4      cat      NaN    640x480      NaN
4     img5      dog   1150.0        NaN  outdoor 

After Handling Missing Data:
  image_id category  size_kb resolution    label
0     img1      cat   1200.0   1024x768  outdoor
1     img2      dog    850.0    800x600   indoor
2     img3  unknown    900.0   1024x768   indoor
3     img4      cat   1025.0    640x480  unknown
4     img5      dog   1150.0   1024x768  outdoor 

After Normalizing Numerical Features:
  image_id category   size_kb re