In [12]:
import pandas as pd
import numpy as np

# Build a synthetic comfort-food survey: 140 seed rows, then random extras up to 700 rows.
np.random.seed(42)  # fixed seed: the global NumPy RNG yields the same draws on every run

# Category vocabularies shared by the seed rows and the extra rows
food_cycle = ['Ice Cream', 'Pizza', 'Chocolate', 'Burger', 'Sushi', 'Fries', 'Cake', 'Pasta', 'Soup', 'Steak']
reason_cycle = ['Stress', 'Sadness', 'Celebration', 'Boredom', 'Happiness', 'Loneliness', 'Fatigue', 'Socialization', 'Hunger', 'Nostalgia']
gender_options = ['Male', 'Female', 'Other']
exercise_options = ['Never', 'Rarely', 'Sometimes', 'Often']
weight_options = ['Gain', 'Loss', 'No change']

# Seed rows: foods and reasons repeat in a fixed cycle (14 passes over 10 labels),
# every other column is random. The columns are filled in this exact order because
# each draw advances the shared RNG; reordering would change the generated values.
seed_rows = 140
data = {}
data['Comfort_food'] = food_cycle * 14
data['Comfort_food_reasons'] = reason_cycle * 14
data['Calorie_level'] = np.random.randint(200, 1000, size=seed_rows)
data['Age'] = np.random.randint(18, 65, size=seed_rows)
data['Gender'] = np.random.choice(gender_options, size=seed_rows)
data['Exercise_frequency'] = np.random.choice(exercise_options, size=seed_rows)
data['Satisfaction_level'] = np.random.randint(1, 10, size=seed_rows)
data['Weight_change'] = np.random.choice(weight_options, size=seed_rows)
data['Height'] = np.random.uniform(150, 200, size=seed_rows).round(2)
data['Body_mass_index'] = np.random.uniform(18, 35, size=seed_rows).round(2)

# Tabular view of the seed rows
df = pd.DataFrame(data)

# Extra rows: top the dataset up to 700 rows. Here the food and reason labels are
# sampled at random from the seed columns instead of following the cycle.
new_rows = 700 - df.shape[0]
additional_data = {}
additional_data['Comfort_food'] = np.random.choice(data['Comfort_food'], size=new_rows)
additional_data['Comfort_food_reasons'] = np.random.choice(data['Comfort_food_reasons'], size=new_rows)
additional_data['Calorie_level'] = np.random.randint(200, 1000, size=new_rows)
additional_data['Age'] = np.random.randint(18, 65, size=new_rows)
additional_data['Gender'] = np.random.choice(gender_options, size=new_rows)
additional_data['Exercise_frequency'] = np.random.choice(exercise_options, size=new_rows)
additional_data['Satisfaction_level'] = np.random.randint(1, 10, size=new_rows)
additional_data['Weight_change'] = np.random.choice(weight_options, size=new_rows)
additional_data['Height'] = np.random.uniform(150, 200, size=new_rows).round(2)
additional_data['Body_mass_index'] = np.random.uniform(18, 35, size=new_rows).round(2)

# Stack the seed rows and the extra rows under one fresh 0..699 index
df_additional = pd.DataFrame(additional_data)
df_full = pd.concat((df, df_additional), ignore_index=True)

# Introduce missing values and uncleaned data (e.g., some textual errors, NaN values)
def introduce_noise(df, noise_fraction=0.045, random_state=None):
    """Return a copy of ``df`` with missing values and misspelled categories.

    For every column, a fresh random sample of ``noise_fraction`` of the rows
    is blanked out with NaN, so the missing cells fall on different rows in
    each column. Afterwards a few category labels in ``'Comfort_food'`` and
    ``'Exercise_frequency'`` are swapped for deliberately misspelled variants.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied and never modified.
    noise_fraction : float, default 0.045
        Fraction of rows set to NaN in each column (4.5% by default).
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Source of randomness for the row sampling. ``None`` (the default)
        keeps the previous behaviour of drawing from NumPy's global random
        state, i.e. results are governed by ``np.random.seed``.

    Returns
    -------
    pandas.DataFrame
        Noisy copy with the same shape, index and column order as ``df``.
        Integer columns that receive NaN come back as float.
    """
    df_noisy = df.copy()

    # Passing the same integer seed to every ``sample`` call would blank the
    # same rows in every column, so turn it into one RNG shared by all columns.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    # Blank out ``noise_fraction`` of the rows, column by column
    for col in df_noisy.columns:
        nan_indices = df_noisy.sample(frac=noise_fraction, random_state=random_state).index
        # ``mask`` returns a new, upcast Series; assigning NaN into an integer
        # column through ``.loc`` is a deprecated silent upcast in recent pandas.
        df_noisy[col] = df_noisy[col].mask(df_noisy.index.isin(nan_indices))

    # Textual noise: misspell selected labels, skipping columns the frame lacks
    typo_map = {
        'Comfort_food': {'Ice Cream': 'ice_cream', 'Pizza': 'Pizzzza'},
        'Exercise_frequency': {'Often': 'Offten'},
    }
    for col, typos in typo_map.items():
        if col in df_noisy.columns:
            df_noisy[col] = df_noisy[col].replace(typos, regex=True)

    return df_noisy

# Run the noise injection over the combined frame built above
df_noisy = introduce_noise(df=df_full)

# Report (rows, columns) of the corrupted frame
print(df_noisy.shape)


(700, 10)


In [13]:
# Show the noisy frame as the cell's rich output to eyeball the injected NaNs and typos
df_noisy

Unnamed: 0,Comfort_food,Comfort_food_reasons,Calorie_level,Age,Gender,Exercise_frequency,Satisfaction_level,Weight_change,Height,Body_mass_index
0,ice_cream,Stress,302.0,18.0,Male,Never,3.0,Gain,197.48,33.80
1,Pizzzza,Sadness,635.0,52.0,Other,,1.0,No change,157.35,32.12
2,Chocolate,Celebration,470.0,54.0,Other,,1.0,Loss,196.33,34.41
3,Burger,Boredom,306.0,64.0,Male,Rarely,3.0,Loss,174.61,20.11
4,Sushi,Happiness,,,Male,Sometimes,5.0,Gain,162.91,30.42
...,...,...,...,...,...,...,...,...,...,...
695,Steak,Sadness,732.0,59.0,Male,Never,3.0,Loss,,34.43
696,Cake,Boredom,749.0,34.0,Other,Sometimes,8.0,Loss,191.38,34.06
697,Pizzzza,Boredom,987.0,54.0,,Sometimes,6.0,No change,150.03,23.69
698,Burger,Loneliness,704.0,56.0,Female,Sometimes,1.0,No change,165.39,27.70


In [14]:
# Persist the noisy dataset to disk; index=False keeps the row index out of the CSV
df_noisy.to_csv("food.csv", index=False)