In [None]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np

# Step 2: Load the dataset
# The path is relative, so the CSV is resolved against the kernel's working directory.
df = pd.read_csv("smartphones.csv")

# Step 3: Explore the dataset
print("---- Basic Info ----")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()
print("\n---- Summary Statistics ----")
# include="all" summarises non-numeric columns alongside the numeric ones.
print(df.describe(include="all"))
print("\n---- First 5 Rows ----")
print(df.head())


In [None]:
# Check missing values
print("\n---- Missing Values ----")
print(df.isnull().sum())

# Strategy:
# - int64/float64 columns: fill gaps with the column median
# - every other column: fill gaps with the most frequent value (mode)
# - a column with no non-missing values has no mode, so it is left as-is
for col in df.columns:
    column = df[col]
    if not column.isnull().any():
        # Nothing to fill; skipping also avoids computing an unused statistic.
        continue

    if column.dtype in ('int64', 'float64'):
        fill_value = column.median()
    else:
        modes = column.mode()
        if modes.empty:
            # All values are missing: indexing the empty mode would raise.
            continue
        fill_value = modes.iloc[0]

    # Assign the result back to the frame. Calling fillna(inplace=True) on the
    # df[col] selection is a chained assignment: it warns on pandas 2.x and
    # does not modify df at all once Copy-on-Write is enabled.
    df[col] = column.fillna(fill_value)


In [None]:
# Drop fully duplicated rows in place and report how many were removed.
before = len(df)
df.drop_duplicates(inplace=True)
after = len(df)
print(f"Removed {before - after} duplicate rows")


In [None]:
# Check datatypes
print(df.dtypes)

# Example conversions:
# If 'Price' is stored as text -> convert to numeric.
# The guard accepts both object columns and pandas' dedicated string dtype.
if 'Price' in df.columns and (
    pd.api.types.is_object_dtype(df['Price'])
    or pd.api.types.is_string_dtype(df['Price'])
):
    # Raw string: '\$' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    price_text = df['Price'].replace(r'[\$,]', '', regex=True)
    # errors='coerce' turns unparseable entries into NaN instead of aborting
    # the cell, matching how 'Launch Date' is handled below; astype(float)
    # keeps the result float64 even when every value is a whole number.
    price_numeric = pd.to_numeric(price_text, errors='coerce').astype(float)
    unparsed = int(price_numeric.isna().sum() - price_text.isna().sum())
    if unparsed:
        print(f"Could not parse {unparsed} 'Price' value(s); set to NaN")
    df['Price'] = price_numeric

# If 'Launch Date' column exists -> convert to datetime
# (values that cannot be parsed become NaT because of errors='coerce')
if 'Launch Date' in df.columns:
    df['Launch Date'] = pd.to_datetime(df['Launch Date'], errors='coerce')

# Convert the remaining object columns to category type
for col in df.select_dtypes(include='object').columns:
    df[col] = df[col].astype('category')


In [None]:
print("\n---- Final Dataset Info ----")
# DataFrame.info() prints its own report and returns None; calling it directly
# avoids the stray "None" line that print(df.info()) would add.
df.info()
print("\n---- Sample Rows After Cleaning ----")
print(df.head())
