In [None]:
import pandas as pd
import re

# Step 1: Load dataset with correct encoding
df = pd.read_csv("C:/Users/bhumi/OneDrive/Desktop/professional/internship/Mobiles_Dataset.csv", encoding='latin1')

# Step 2: Drop exact duplicate rows
df = df.drop_duplicates()

# Step 3: Basic data inspection
print(df.head())
print(df.tail())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also emit "None").
df.info()
print(df.isnull().sum())

# Step 4: Fill missing Camera values
df['Camera'] = df['Camera'].fillna('Unknown')

# Steps 5-6: Strip every non-digit character (currency symbols, thousands
# separators) from the price columns, then turn cells that end up empty into
# missing values. The result is assigned back to the column rather than using
# `df[col].replace(..., inplace=True)`: that chained in-place form operates on
# an intermediate object, triggers a FutureWarning, and stops updating `df`
# in pandas 3.0.
price_columns = ['Actual price', 'Discount price']
for column in price_columns:
    digits_only = df[column].replace('[^0-9]', '', regex=True)
    df[column] = digits_only.replace('', pd.NA)

# Step 7: Drop rows with missing price values
df = df.dropna(subset=price_columns)

# Step 8: Convert prices to integer
for column in price_columns:
    df[column] = df[column].astype(int)

# Step 9: Normalize product names by removing color keywords
def clean_model_name(name):
    """Return a colour-independent, title-cased version of a product name.

    The name is lower-cased, every whole-word occurrence of a known colour
    keyword is removed (anywhere in the name, not only inside parentheses),
    the punctuation and whitespace left behind inside parentheses is tidied
    up, and the result is title-cased.

    Args:
        name: The raw product name. Non-string values (for example NaN or
            None from a missing cell) are returned unchanged.

    Returns:
        The cleaned name as a string, or ``name`` itself when it is not a
        string.
    """
    # Missing cells are not strings; pass them through instead of failing
    # on ``.lower()``.
    if not isinstance(name, str):
        return name

    color_words = (
        'black', 'blue', 'white', 'green', 'red', 'purple', 'grey', 'silver',
        'gold', 'pink', 'yellow', 'coral', 'midnight', 'deep', 'starlight',
        'titanium', 'graphite', 'bronze', 'orange', 'cyan',
    )

    # One alternation removes all colour keywords in a single pass; the
    # word boundaries keep substrings of longer words untouched.
    color_pattern = r"\b(?:" + "|".join(color_words) + r")\b"
    cleaned = re.sub(color_pattern, '', name.lower())

    # A colour removed from the middle of a list leaves ", ," behind.
    cleaned = re.sub(r',(?:\s*,)+', ',', cleaned)

    # Drop a leading or trailing comma inside parentheses, together with the
    # whitespace around it, so "(, 128 gb)" becomes "(128 gb)".
    cleaned = re.sub(r'\(\s*,\s*', '(', cleaned)
    cleaned = re.sub(r'\s*,\s*\)', ')', cleaned)

    # Trim whitespace just inside the parentheses ("( oasis" -> "(oasis").
    cleaned = re.sub(r'\(\s+', '(', cleaned)
    cleaned = re.sub(r'\s+\)', ')', cleaned)

    # Parentheses that held nothing but colours are now empty; remove them.
    cleaned = re.sub(r'\(\)', '', cleaned)

    # Remove multiple spaces and strip
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    return cleaned.title()

# Derive a colour-independent model name for every product
product_names = df['Product Name']
df['Model Name'] = product_names.apply(clean_model_name)

# Step 10: Keep only the first row seen for each cleaned model name
df = df.drop_duplicates(subset=['Model Name'], keep='first')

# Step 11: Write the cleaned dataset to disk without the index column
df.to_csv(
    "C:/Users/bhumi/OneDrive/Desktop/professional/internship/mobile_cleaned.csv",
    index=False,
)


                         Product Name Actual price Discount price  Stars  \
0     Apple iPhone 15 (Green, 128 GB)     ??79,600       ??65,999    4.6   
1      Apple iPhone 15 (Blue, 128 GB)     ??79,600       ??65,999    4.6   
2     Apple iPhone 15 (Black, 128 GB)     ??79,600       ??65,999    4.6   
3  OnePlus N20 SE (JADE WAVE, 128 GB)     ??19,999       ??11,489    4.0   
4  OnePlus N20 SE (BLUE OASIS, 64 GB)     ??16,999       ??12,999    4.0   

           Rating        Reviews RAM (GB) Storage (GB)  Display Size (inch)  \
0  44,793 Ratings  2,402 Reviews      NIL          128                 6.10   
1  44,793 Ratings  2,402 Reviews      NIL          128                 6.10   
2  44,793 Ratings  2,402 Reviews      NIL          128                 6.10   
3   1,005 Ratings     41 Reviews        4          128                 6.56   
4   1,005 Ratings     41 Reviews        4           64                 6.56   

        Camera                                        Description  \

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Actual price'].replace('', pd.NA, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Discount price'].replace('', pd.NA, inplace=True)
