In [None]:
!pip install graphviz



In [None]:
# Install Graphviz system package
!apt-get install -y graphviz

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-6ubuntu0.1).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.


In [None]:
import pandas as pd
import numpy as np
import re

In [None]:
from google.colab import files

# Upload the file
uploaded = files.upload()

Saving Dataset Final.xlsx to Dataset Final.xlsx


In [None]:
import os

# List files in the current directory
print(os.listdir())

['.config', 'Dataset Final.xlsx', 'sample_data']


In [None]:
file_path = 'Dataset Final.xlsx'

# Check available sheet names
xls = pd.ExcelFile(file_path)
print(xls.sheet_names)

['Meal Preferences', 'Morning Routine', 'Commute Patterns', 'Study Habits']


In [None]:
file_path = 'Dataset Final.xlsx'
df1 = pd.read_excel(file_path, sheet_name='Meal Preferences')
df2 = pd.read_excel(file_path, sheet_name='Morning Routine')
df3 = pd.read_excel(file_path, sheet_name='Commute Patterns')
df4 = pd.read_excel(file_path, sheet_name='Study Habits')

In [None]:
# Function to check data types and missing values
def check_data(df):
    print("Data Types:")
    print(df.dtypes)
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nPercentage of Missing Values:")
    print((df.isnull().sum() / len(df)) * 100)

In [None]:
print("Meal Preferences DataFrame:")
check_data(df1)

print("\nMorning Routine DataFrame:")
check_data(df2)

print("\nCommute Patterns DataFrame:")
check_data(df3)

print("\nStudy Habits DataFrame:")
check_data(df4)

Meal Preferences DataFrame:
Data Types:
Date                        datetime64[ns]
Time of Meal                        object
Type of Meal                        object
Main Dish                           object
Side Dish(es)                       object
Beverage                            object
Calories Consumed (kcal)           float64
Healthiness Rating                  object
Satisfaction Level                  object
Mood Before Meal                    object
Mood After Meal                     object
Location of Meal                    object
Reason for Meal Choice              object
Company During Meal                 object
dtype: object

Missing Values:
Date                         0
Time of Meal                 0
Type of Meal                 0
Main Dish                    0
Side Dish(es)               21
Beverage                    27
Calories Consumed (kcal)    36
Healthiness Rating           0
Satisfaction Level           0
Mood Before Meal             0
Mood After Meal  

In [None]:
# Fill missing values in the 'Side Dish(es)' column
df1['Side Dish(es)'] = df1['Side Dish(es)'].fillna('No Side Dish')  # Fill missing side dishes

# Fill missing values in the 'Beverage' column
df1['Beverage'] = df1['Beverage'].fillna('No Beverage')  # Fill missing beverages

# Fill missing values in 'Calories Consumed (kcal)' with the mean
df1['Calories Consumed (kcal)'] = df1['Calories Consumed (kcal)'].fillna(df1['Calories Consumed (kcal)'].mean())

print("\nAfter filling missing values in Meal Preferences DataFrame:")
check_data(df1)


After filling missing values in Meal Preferences DataFrame:
Data Types:
Date                        datetime64[ns]
Time of Meal                        object
Type of Meal                        object
Main Dish                           object
Side Dish(es)                       object
Beverage                            object
Calories Consumed (kcal)           float64
Healthiness Rating                  object
Satisfaction Level                  object
Mood Before Meal                    object
Mood After Meal                     object
Location of Meal                    object
Reason for Meal Choice              object
Company During Meal                 object
dtype: object

Missing Values:
Date                        0
Time of Meal                0
Type of Meal                0
Main Dish                   0
Side Dish(es)               0
Beverage                    0
Calories Consumed (kcal)    0
Healthiness Rating          0
Satisfaction Level          0
Mood Before Meal        

In [None]:
clean_df1 = df1.copy()

In [None]:
def convert_to_minutes(duration):
    if isinstance(duration, str):
        # Normalize the string by removing spaces and converting to lowercase
        duration = duration.strip().lower()

        # Extract digits from the string
        match = re.findall(r'\d+', duration)  # Finds all digit sequences in the string
        if match:
            number = int(match[0])  # Take the first found number
            if 'hour' in duration:
                return number * 60  # Convert hours to minutes
            elif 'min' in duration or 'minute' in duration:
                return number  # Already in minutes
            elif duration == 'all night':
                return 480  # Assuming 'All night' is 8 hours
        else:
            # Handle specific cases like '5mins'
            match = re.findall(r'\d+', duration.replace('mins', ''))
            if match:
                return int(match[0])  # Extract and return as minutes

    elif isinstance(duration, (int, float)):
        return int(duration)  # Return as-is if it's already a number
    return None  # For non-convertible formats

In [None]:
# Print the first 5 rows of df2
print(df2.head())


        Date Time Woken Up First Activity Duration of First Activity (min)  \
0 2024-08-12      06:00:00        Bathing                               20   
1 2024-08-13      06:15:00       Bathing                                20   
2 2024-08-14      06:05:00        Bathing                               15   
3 2024-08-15      06:15:00        Bathing                               19   
4 2024-08-16      06:03:00        Bathing                               15   

  Time Started Breakfast Type of Breakfast  Calories Consumed (kcal)  \
0               07:55:00          Fakcakes                     200.0   
1               06:45:00            Cereal                     250.0   
2               06:35:00  Left over dinner                       NaN   
3               06:30:00            Cereal                     200.0   
4               06:40:00  Left over dinner                       NaN   

  Healthiness Rating Exercise Performed Type of Exercise  \
0            Healthy                 N

In [None]:

print(df2['Healthiness Rating'].unique())


['Healthy' 'healthy' nan 'Moderately healthy' 'Moderate'
 'Moderately Healthy' 'Unhealthy']


In [None]:
# Step 1: Normalize the case (convert everything to lowercase)
df2['Healthiness Rating'] = df2['Healthiness Rating'].str.lower()

# Step 2: Define the mapping (treat 'healthy' and 'Moderately Healthy' as the same category)
rating_mapping = {
    'healthy': 1,
    'moderately healthy': 2,
    'unhealthy': 3
}

# Apply the mapping to the column
df2['Healthiness Rating'] = df2['Healthiness Rating'].map(rating_mapping)

# Step 3: Check for any missing values in the column after mapping
missing_values = df2['Healthiness Rating'].isnull().sum()
print(f"Missing values in 'Healthiness Rating' after cleaning and mapping: {missing_values}")

# Optional: If missing values are found, fill them with a default value (like the mean or a placeholder)
df2['Healthiness Rating'] = df2['Healthiness Rating'].fillna(df2['Healthiness Rating'].mean())

# Final check for missing values in the whole dataframe
missing_values_overall = df2.isnull().sum()
print("\nMissing values in the entire DataFrame after cleaning:")
print(missing_values_overall[missing_values_overall > 0])


Missing values in 'Healthiness Rating' after cleaning and mapping: 8

Missing values in the entire DataFrame after cleaning:
Time Started Breakfast         3
Type of Breakfast              3
Calories Consumed (kcal)       7
Type of Exercise              49
Duration of Exercise (min)     1
Time Started Work/Study       22
Time Left Home                 9
dtype: int64


In [None]:
# Fill missing values with the mean of the column
df2['Healthiness Rating'] = df2['Healthiness Rating'].fillna(df2['Healthiness Rating'].mode())

# Final check for missing values in the whole dataframe
missing_values_overall = df2.isnull().sum()
print("\nMissing values in the entire DataFrame after cleaning:")
print(missing_values_overall[missing_values_overall > 0])



Missing values in the entire DataFrame after cleaning:
Time Started Breakfast         3
Type of Breakfast              3
Calories Consumed (kcal)       7
Type of Exercise              49
Duration of Exercise (min)     1
Time Started Work/Study       22
Time Left Home                 9
dtype: int64


In [None]:
# Convert 'Duration of Exercise (min)' to numeric, forcing errors to NaN
df2['Duration of Exercise (min)'] = pd.to_numeric(df2['Duration of Exercise (min)'], errors='coerce')

# Check for NaN values after conversion
print("\nMissing values in 'Duration of Exercise (min)' after conversion:")
print(df2['Duration of Exercise (min)'].isnull().sum())

# Now, fill missing values with the mean
df2['Duration of Exercise (min)'] = df2['Duration of Exercise (min)'].fillna(df2['Duration of Exercise (min)'].mean())

df2['Time Started Breakfast'] = df2['Time Started Breakfast'].fillna('Unknown')
df2['Type of Breakfast'] = df2['Type of Breakfast'].fillna('Unknown')

# Fill missing values in 'Time Started Work/Study' with a placeholder
df2['Time Started Work/Study'] = df2['Time Started Work/Study'].fillna('Unknown')

# Fill missing values in 'Time Left Home' with a placeholder or the mean
df2['Time Left Home'] = df2['Time Left Home'].fillna('Unknown')  # or use mean if applicable

missing_values_df2 = df2.isnull().sum()
print("\nMissing values in Morning Routine DataFrame after cleaning:")
print(missing_values_df2[missing_values_df2 > 0])

# If no missing values, copy the cleaned DataFrame
if missing_values_df2.sum() == 0:
    clean_df2 = df2.copy()
    print("\nMorning Routine DataFrame cleaned and copied successfully.")
else:
    print("\nThere are still missing values in Morning Routine DataFrame.")



Missing values in 'Duration of Exercise (min)' after conversion:
13

Missing values in Morning Routine DataFrame after cleaning:
Calories Consumed (kcal)     7
Type of Exercise            49
dtype: int64

There are still missing values in Morning Routine DataFrame.


In [None]:
# Convert 'Duration of Exercise (min)' to numeric, forcing errors to NaN
df2['Duration of Exercise (min)'] = pd.to_numeric(df2['Duration of Exercise (min)'], errors='coerce')

# Check for NaN values after conversion
print("\nMissing values in 'Duration of Exercise (min)' after conversion:")
print(df2['Duration of Exercise (min)'].isnull().sum())

# Fill missing values in 'Duration of Exercise (min)' with the mean
df2['Duration of Exercise (min)'] = df2['Duration of Exercise (min)'].fillna(df2['Duration of Exercise (min)'].mean())

# Fill missing values in other columns
df2['Time Started Breakfast'] = df2['Time Started Breakfast'].fillna('Unknown')
df2['Type of Breakfast'] = df2['Type of Breakfast'].fillna('Unknown')
df2['Time Started Work/Study'] = df2['Time Started Work/Study'].fillna('Unknown')
df2['Time Left Home'] = df2['Time Left Home'].fillna('Unknown')

# Fill missing values in 'Calories Consumed (kcal)' with the mean
df2['Calories Consumed (kcal)'] = df2['Calories Consumed (kcal)'].fillna(df2['Calories Consumed (kcal)'].mean())

# Fill missing values in 'Type of Exercise' with a placeholder
df2['Type of Exercise'] = df2['Type of Exercise'].fillna('Unknown')

# Check for remaining missing values
missing_values_df2 = df2.isnull().sum()
print("\nMissing values in Morning Routine DataFrame after cleaning:")
print(missing_values_df2[missing_values_df2 > 0])

if missing_values_df2.sum() == 0:
    clean_df2 = df2.copy()
    print("\nMorning Routine DataFrame cleaned and copied successfully.")
else:
    print("\nThere are still missing values in Morning Routine DataFrame")




Missing values in 'Duration of Exercise (min)' after conversion:
0

Missing values in Morning Routine DataFrame after cleaning:
Series([], dtype: int64)

Morning Routine DataFrame cleaned and copied successfully.


In [None]:
print(df3['Stress Level During Commute'].unique())


['high' 'Low' 'medium' 'Moderate' 'High']


In [None]:
def clean_stress_level(stress):
    if isinstance(stress, str):
        stress = stress.strip().lower()  # Normalize to lowercase
        if stress in ['low', 'medium', 'high']:
            return stress.capitalize()  # Return in title case (e.g., 'Low', 'Medium', 'High')
    return pd.NA  # Return NA for non-recognized strings


In [None]:
# Apply the function to the 'Stress Level During Commute' column
df3['Stress Level During Commute'] = df3['Stress Level During Commute'].apply(clean_stress_level)

# Check unique values after cleaning
unique_stress_levels_cleaned = df3['Stress Level During Commute'].unique()
print(unique_stress_levels_cleaned)

['High' 'Low' 'Medium' <NA>]


In [None]:
# Fill missing values for 'Delays Encountered' with 0
df3['Delays Encountered'] = df3['Delays Encountered'].fillna(0)

# Fill missing values for 'Stress Level During Commute' with the mode
stress_mode = df3['Stress Level During Commute'].mode()[0]  # Get the most common value
df3['Stress Level During Commute'] = df3['Stress Level During Commute'].fillna(stress_mode)

# Check for missing values again
print("\nMissing Values in df3 (Commute Patterns) after filling:")
print(df3.isnull().sum())




Missing Values in df3 (Commute Patterns) after filling:
Date                           0
Mode of Transportation         0
Departure Time (am)            0
Arrival Time (am)              0
Duration (min)                 0
Distance Covered (KM)          0
Route Taken                    0
Traffic Condition              0
Weather                        0
Delays Encountered             0
Stress Level During Commute    0
Company During Commute         0
Reason for Commute             0
Commute Start Location         0
Commute End Location           0
dtype: int64


In [None]:
clean_df3 = df3.copy()
print("\nCommute Patterns DataFrame cleaned and copied successfully.")

print("\nMissing Values in clean_df3 (Commute Patterns):")
print(clean_df3.isnull().sum())


Commute Patterns DataFrame cleaned and copied successfully.

Missing Values in clean_df3 (Commute Patterns):
Date                           0
Mode of Transportation         0
Departure Time (am)            0
Arrival Time (am)              0
Duration (min)                 0
Distance Covered (KM)          0
Route Taken                    0
Traffic Condition              0
Weather                        0
Delays Encountered             0
Stress Level During Commute    0
Company During Commute         0
Reason for Commute             0
Commute Start Location         0
Commute End Location           0
dtype: int64


In [None]:
# Check unique values in the 'Duration (min)' column
print(df4['Duration (min)'].unique())

# Check unique values in the 'Duration of Breaks (min)' column
print(df4['Duration of Breaks (min)'].unique())


['2 hours' '3 hours' '4 hours' '5 hours' 120 90 60 150 'All night' 20 10
 45]
[15 0 10 30 25 40 20 50 80 60 5 '10 mins']


In [None]:
# Apply the conversion function
df4['Duration (min)'] = df4['Duration (min)'].apply(convert_to_minutes)
df4['Duration of Breaks (min)'] = df4['Duration of Breaks (min)'].apply(convert_to_minutes)

# Convert the Duration (min) column to integer type, handling NaN values
df4['Duration (min)'] = df4['Duration (min)'].astype('Int64')  # Using 'Int64' to allow NaN values

# Check unique values again
print("Unique values in 'Duration (min)':", df4['Duration (min)'].unique())
print("Unique values in 'Duration of Breaks (min)':", df4['Duration of Breaks (min)'].unique())



Unique values in 'Duration (min)': <IntegerArray>
[120, 180, 240, 300, 90, 60, 150, <NA>, 20, 10, 45]
Length: 11, dtype: Int64
Unique values in 'Duration of Breaks (min)': [15  0 10 30 25 40 20 50 80 60  5]


In [None]:
print("Remaining Missing Values After Conversion:")
print(df4[['Duration (min)', 'Duration of Breaks (min)']].isnull().sum())


Remaining Missing Values After Conversion:
Duration (min)              1
Duration of Breaks (min)    0
dtype: int64


In [None]:
# Check rows with missing values in 'Duration (min)'
missing_duration_rows = df4[df4['Duration (min)'].isnull()]
print(missing_duration_rows)


         Date Start Time  End Time  Duration (min)  Subject  \
20 2024-08-30   22:00 am  06:00:00            <NA>  R, BMAN   

   Type of Study Material Study Location Productivity Level  Number of Breaks  \
20         Lecture slides         School             Medium                 3   

    Duration of Breaks (min) Distractions Tools/Resources Used  \
20                        60        Noise               Laptop   

   Mood Before Study Mood After Study    Goal Achievement  
20           Focused        Satisfied  Coding         Yes  


In [None]:
# Fill the missing duration value based on the calculated duration
df4.loc[df4['Duration (min)'].isnull(), 'Duration (min)'] = 480


In [None]:
unique_subjects = df4['Subject'].unique()
print(unique_subjects)

['business  managemnt' 'r programming ' 'intelligient systems' 'All'
 'BMAN' 'IS' 'R' 'R, BMAN' 'Is' 'CSA' 'AWD']


In [None]:
# Define a mapping for common replacements and corrections
subject_corrections = {
    'business  managemnt': 'Business Management',
    'intelligient systems': 'Intelligent Systems',
    'r programming ': 'R Programming',
    'bman': 'Business Management',
    'r': 'R Programming',
    'is': 'Intelligent Systems',
    'csa': 'C.S.A.',
    'awd': 'Advanced Web Development',
    'r, bman': 'R Programming, Business Management',
    'all': 'All Subjects'
}

In [None]:
# Step 1: Convert the Subject column to lowercase
df4['Subject'] = df4['Subject'].str.lower()

# Step 2: Apply corrections
df4['Subject'] = df4['Subject'].replace(subject_corrections)

# Step 3: Trim whitespace
df4['Subject'] = df4['Subject'].str.strip()

# Optional Step 4: Remove unwanted entries or handle them as needed
# For example, you can drop rows with None or keep them if you prefer
df4 = df4[df4['Subject'].notna()]

# Check unique values after standardization
unique_subjects_standardized = df4['Subject'].unique()
print(unique_subjects_standardized)

['Business Management' 'R Programming' 'Intelligent Systems'
 'All Subjects' 'R Programming, Business Management' 'C.S.A.'
 'Advanced Web Development']


In [None]:
# Normalize the Subject column: lower case and strip whitespace
df4['Subject'] = df4['Subject'].str.lower().str.strip()

# Apply corrections to the Subject column
df4['Subject'] = df4['Subject'].replace(subject_corrections)

# Remove duplicates and get unique values
unique_subjects_standardized = df4['Subject'].unique()

# Convert back to a DataFrame to show unique values
unique_subjects_df = pd.DataFrame(unique_subjects_standardized, columns=['Unique Subjects'])

# Print the DataFrame of unique subjects
print(unique_subjects_df)

                      Unique Subjects
0                 business management
1                       r programming
2                 intelligent systems
3                        all subjects
4  r programming, business management
5                              c.s.a.
6            advanced web development


In [None]:
print("\nMissing Values in df4 (Study Habits) after filling:")
print(df4.isnull().sum())

# If there are still missing values, you can print a message or handle them accordingly.
if df4.isnull().sum().sum() == 0:
    # Create a cleaned copy if no missing values are left
    clean_df4 = df4.copy()
    print("\nStudy Habits DataFrame cleaned and copied successfully.")
else:
    print("\nThere are still missing values in Study Habits DataFrame.")


Missing Values in df4 (Study Habits) after filling:
Date                         0
Start Time                   0
End Time                     0
Duration (min)               0
Subject                      0
Type of Study Material       0
Study Location               0
Productivity Level           0
Number of Breaks             0
Duration of Breaks (min)     0
Distractions                16
Tools/Resources Used         0
Mood Before Study            0
Mood After Study             1
Goal                         0
Achievement                  0
dtype: int64

There are still missing values in Study Habits DataFrame.


In [None]:
df4['Distractions'] = df4['Distractions'].fillna('None')

# Fill missing values for Mood After Study with a placeholder
df4['Mood After Study'] = df4['Mood After Study'].fillna('Not Specified')
print("\nMissing Values in df4 (Study Habits) after filling:")
print(df4.isnull().sum())

if df4.isnull().sum().sum() == 0:
    # Create a cleaned copy if no missing values are left
    clean_df4 = df4.copy()
    print("\nStudy Habits DataFrame cleaned and copied successfully.")
else:
    print("\nThere are still missing values in Study Habits DataFrame.")


Missing Values in df4 (Study Habits) after filling:
Date                        0
Start Time                  0
End Time                    0
Duration (min)              0
Subject                     0
Type of Study Material      0
Study Location              0
Productivity Level          0
Number of Breaks            0
Duration of Breaks (min)    0
Distractions                0
Tools/Resources Used        0
Mood Before Study           0
Mood After Study            0
Goal                        0
Achievement                 0
dtype: int64

Study Habits DataFrame cleaned and copied successfully.


In [None]:
# Check for missing values in each cleaned DataFrame
missing_values_df1 = clean_df1.isnull().sum()
missing_values_df2 = clean_df2.isnull().sum()
missing_values_df3 = clean_df3.isnull().sum()
missing_values_df4 = clean_df4.isnull().sum()

# Print missing values for each DataFrame
print("Missing Values in Cleaned Meal Preferences DataFrame (df1):")
print(missing_values_df1[missing_values_df1 > 0])

print("\nMissing Values in Cleaned Morning Routine DataFrame (df2):")
print(missing_values_df2[missing_values_df2 > 0])

print("\nMissing Values in Cleaned Commute Patterns DataFrame (df3):")
print(missing_values_df3[missing_values_df3 > 0])

print("\nMissing Values in Cleaned Study Habits DataFrame (df4):")
print(missing_values_df4[missing_values_df4 > 0])


Missing Values in Cleaned Meal Preferences DataFrame (df1):
Series([], dtype: int64)

Missing Values in Cleaned Morning Routine DataFrame (df2):
Series([], dtype: int64)

Missing Values in Cleaned Commute Patterns DataFrame (df3):
Series([], dtype: int64)

Missing Values in Cleaned Study Habits DataFrame (df4):
Series([], dtype: int64)


In [None]:
# Print column names for each cleaned DataFrame
print("Column names in Cleaned Meal Preferences DataFrame (df1):")
print(clean_df1.columns.tolist())

print("\nColumn names in Cleaned Morning Routine DataFrame (df2):")
print(clean_df2.columns.tolist())

print("\nColumn names in Cleaned Commute Patterns DataFrame (df3):")
print(clean_df3.columns.tolist())

print("\nColumn names in Cleaned Study Habits DataFrame (df4):")
print(clean_df4.columns.tolist())


Column names in Cleaned Meal Preferences DataFrame (df1):
['Date', 'Time of Meal', 'Type of Meal', 'Main Dish', 'Side Dish(es)', 'Beverage', 'Calories Consumed (kcal)', 'Healthiness Rating', 'Satisfaction Level', 'Mood Before Meal', 'Mood After Meal', 'Location of Meal', 'Reason for Meal Choice', 'Company During Meal']

Column names in Cleaned Morning Routine DataFrame (df2):
['Date', 'Time Woken Up', 'First Activity', 'Duration of First Activity (min)', 'Time Started Breakfast', 'Type of Breakfast', 'Calories Consumed (kcal)', 'Healthiness Rating', 'Exercise Performed', 'Type of Exercise', 'Duration of Exercise (min)', 'Time Started Work/Study', 'Mood After Morning Routine', 'Weather', 'Time Spent Preparing (min)', 'Time Left Home']

Column names in Cleaned Commute Patterns DataFrame (df3):
['Date', 'Mode of Transportation', 'Departure Time (am)', 'Arrival Time (am)', 'Duration (min)', 'Distance Covered (KM)', 'Route Taken', 'Traffic Condition', 'Weather', 'Delays Encountered', 'Stres

In [None]:
# Merging the DataFrames on the 'Date' column
combined_df = clean_df1.merge(clean_df2, on='Date', how='outer', suffixes=('_meal', '_morning'))
combined_df = combined_df.merge(clean_df3, on='Date', how='outer', suffixes=('', '_commute'))
combined_df = combined_df.merge(clean_df4, on='Date', how='outer', suffixes=('', '_study'))


In [None]:
print("\nCombined DataFrame Shape:", combined_df.shape)
print("\nCombined DataFrame Columns:")
print(combined_df.columns.tolist())

# Check for missing values in the combined DataFrame
print("\nMissing Values in Combined DataFrame:")
print(combined_df.isnull().sum())


Combined DataFrame Shape: (1111, 58)

Combined DataFrame Columns:
['Date', 'Time of Meal', 'Type of Meal', 'Main Dish', 'Side Dish(es)', 'Beverage', 'Calories Consumed (kcal)_meal', 'Healthiness Rating_meal', 'Satisfaction Level', 'Mood Before Meal', 'Mood After Meal', 'Location of Meal', 'Reason for Meal Choice', 'Company During Meal', 'Time Woken Up', 'First Activity', 'Duration of First Activity (min)', 'Time Started Breakfast', 'Type of Breakfast', 'Calories Consumed (kcal)_morning', 'Healthiness Rating_morning', 'Exercise Performed', 'Type of Exercise', 'Duration of Exercise (min)', 'Time Started Work/Study', 'Mood After Morning Routine', 'Weather', 'Time Spent Preparing (min)', 'Time Left Home', 'Mode of Transportation', 'Departure Time (am)', 'Arrival Time (am)', 'Duration (min)', 'Distance Covered (KM)', 'Route Taken', 'Traffic Condition', 'Weather_commute', 'Delays Encountered', 'Stress Level During Commute', 'Company During Commute', 'Reason for Commute', 'Commute Start Loca

In [None]:
# Fill missing values in the combined DataFrame
for column in combined_df.columns:
    if combined_df[column].dtype == 'object':  # Categorical columns
        combined_df[column] = combined_df[column].fillna(combined_df[column].mode()[0])
    else:  # Numerical columns
        # Calculate the mean and convert it to an integer if the dtype is Int64
        mean_value = combined_df[column].mean()
        if pd.api.types.is_integer_dtype(combined_df[column]):
            mean_value = int(mean_value)  # Convert mean to integer if the column is integer type
        combined_df[column] = combined_df[column].fillna(mean_value)

# Check for remaining missing values
print("\nMissing Values After Filling:")
print(combined_df.isnull().sum())



Missing Values After Filling:
Date                                0
Time of Meal                        0
Type of Meal                        0
Main Dish                           0
Side Dish(es)                       0
Beverage                            0
Calories Consumed (kcal)_meal       0
Healthiness Rating_meal             0
Satisfaction Level                  0
Mood Before Meal                    0
Mood After Meal                     0
Location of Meal                    0
Reason for Meal Choice              0
Company During Meal                 0
Time Woken Up                       0
First Activity                      0
Duration of First Activity (min)    0
Time Started Breakfast              0
Type of Breakfast                   0
Calories Consumed (kcal)_morning    0
Healthiness Rating_morning          0
Exercise Performed                  0
Type of Exercise                    0
Duration of Exercise (min)          0
Time Started Work/Study             0
Mood After Morning 

In [None]:
# Ensure the '/mnt/data' directory exists (for Colab users)
import os

if not os.path.exists('/mnt/data'):
    os.makedirs('/mnt/data')

# Save the combined DataFrame as an Excel file
combined_df.to_excel('/mnt/data/combined_df.xlsx', index=False)

# Print the file path for reference
print("Excel file saved at: /mnt/data/combined_df.xlsx")

# Download the Excel file
from google.colab import files
files.download('/mnt/data/combined_df.xlsx')



Excel file saved at: /mnt/data/combined_df.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>