In [3]:

import pandas as pd
import logging

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# Updated file path
file_path = r"C:\Users\hp\Desktop\BaveshSma\google.xlsx"

try:
    # Load the Excel file into a DataFrame
    df = pd.read_excel(file_path)
    logging.info("Excel file loaded successfully.")

    # Display the first few rows of the original data
    print("\n=== Original Data (First 5 Rows) ===")
    print(df.head())

    # Step 1: Remove duplicate rows
    df_cleaned = df.drop_duplicates()
    logging.info("Duplicate rows removed.")

    # Step 2: Handle missing values (Automatically fill with 'Unknown')
    df_cleaned = df_cleaned.fillna("Unknown")
    logging.info("Filled missing values with 'Unknown'.")

    # Step 3: Standardize column names
    df_cleaned.columns = df_cleaned.columns.str.lower().str.replace(' ', '_', regex=True)
    logging.info("Column names standardized.")

    # Display the cleaned data on screen
    print("\n=== Cleaned Data (First 5 Rows) ===")
    print(df_cleaned.head())

except Exception as e:
    logging.error(f"An error occurred: {e}")


INFO: Excel file loaded successfully.
INFO: Duplicate rows removed.
INFO: Filled missing values with 'Unknown'.
INFO: Column names standardized.



=== Original Data (First 5 Rows) ===
                                          NBa7we src           d4r55  \
0  https://lh3.googleusercontent.com/a-/ALV-UjV2P...        Sia Sahu   
1  https://lh3.googleusercontent.com/a-/ALV-UjW3C...   Girish Kamble   
2  https://lh3.googleusercontent.com/a-/ALV-UjXdU...   Deepak Kapure   
3  https://lh3.googleusercontent.com/a-/ALV-UjURT...       skcharagi   
4  https://lh3.googleusercontent.com/a-/ALV-UjUv6...  Alpha Predator   

                                      RfnDt        rsqaWe  \
0                                 7 reviews  3 months ago   
1    Local Guide · 164 reviews · 857 photos   3 years ago   
2  Local Guide · 323 reviews · 1,729 photos   4 years ago   
3       Local Guide · 46 reviews · 3 photos    a week ago   
4    Local Guide · 150 reviews · 974 photos   5 years ago   

                                              wiI7pd dSlJg Hzvxle  pkWtMe  \
0  Everytime I am visiting it I observe carelessn...        Like     3.0   
1  I got

In [5]:
import pandas as pd

# Load the Excel file
file_path = "./social_media_data_50.xlsx"  # Replace with your actual path
df = pd.read_excel(file_path)

print("\n--- Original Data ---")
print(df.head())

# Step 1: Remove duplicates
df = df.drop_duplicates()

# Step 2: Fill missing values
df = df.fillna("Unknown")

# Step 3: Standardize column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

print("\n--- Cleaned Data ---")
print(df.head())

# Save cleaned data
# df.to_excel("cleaned_social_media_data.xlsx", index=False)
print(df.head)



--- Original Data ---
  Username                               Post  Likes  Comments   Platform
0   user_8  Anyone up for a coding challenge?  945.0      98.0   Facebook
1   user_2            Feeling grateful today.  877.0      83.0  Instagram
2  user_17  Anyone up for a coding challenge?  845.0      19.0   LinkedIn
3  user_14                                NaN  546.0      51.0  Instagram
4   user_3       Can't believe this happened!   43.0      70.0   LinkedIn

--- Cleaned Data ---
  username                               post  likes comments   platform
0   user_8  Anyone up for a coding challenge?  945.0     98.0   Facebook
1   user_2            Feeling grateful today.  877.0     83.0  Instagram
2  user_17  Anyone up for a coding challenge?  845.0     19.0   LinkedIn
3  user_14                            Unknown  546.0     51.0  Instagram
4   user_3       Can't believe this happened!   43.0     70.0   LinkedIn
<bound method NDFrame.head of    username                               p

In [10]:
import pandas as pd

# Load dataset
df = pd.read_excel("./social_media_data_50.xlsx")

# Remove duplicates
df = df.drop_duplicates()

# Standardize column names
df.columns = df.columns.str.lower().str.replace(" ", "_")

# Fill missing values meaningfully
df["username"] = df["username"].fillna("anonymous_user")
df["post"] = df["post"].fillna("No content provided")
df["likes"] = df["likes"].fillna(0).astype(int)
df["comments"] = df["comments"].fillna(0).astype(int)
df["platform"] = df["platform"].fillna("Unknown platform")

# Show cleaned data
print("\n--- Cleaned Data Preview ---")
print(df.head())

# Save cleaned data
df.to_excel("cleaned_social_media_data_50.xlsx", index=False)



--- Cleaned Data Preview ---
         username                               post  likes  comments  \
0          user_8  Anyone up for a coding challenge?    945        98   
1          user_2            Feeling grateful today.      0        83   
2  anonymous_user  Anyone up for a coding challenge?    845        19   
3         user_14                No content provided    546        51   
4          user_3       Can't believe this happened!     43        70   

           platform  
0          Facebook  
1         Instagram  
2  Unknown platform  
3         Instagram  
4          LinkedIn  
