In [1]:
#Data

In [16]:
import pandas as pd
import numpy as np

# Create a messy dataset example
messy_data = {
    'customer_id': [1, 2, 3, 4, 5, 1, 6, 7, 8, 9],
    'name': [
        'John Smith', 'Jane Doe', 'Bob Johnson', 'Alice Brown',
        'CHARLIE WILSON', 'John Smith', 'David Lee', 'emily davis',
        'Frank Miller', 'Grace Wilson'
    ],
    'email': [
        'john@email.com', 'jane.doe@company.com', 'bob@email.com',
        None, 'charlie@email.com', 'john@email.com', 'david@email.com',
        'emily.davis@company.com', 'frank@email.com', 'grace@email.com'
    ],
    'age': [32, 28, 45, 29, 52, 32, 38, 25, 41, 150],
    'city': [
        'New York', 'Los Angeles', 'Chicago', 'Houston',
        'NEW YORK', 'New York', 'Phoenix', 'Chicago',
        'Philadelphia', 'Miami'
    ],
    'salary': [50000, 75000, 60000, 55000, 80000, 50000, 70000, 65000, 72000, 68000],
    'signup_date': [
        '2022-01-15', '2021-03-22', '2023-05-10', '2022-11-30',
        '2021-07-18', '2022-01-15', '2023-02-14', '2022-09-05',
        '2021-12-20', '2024-01-01'  # Note: last one is a future date
    ],
    'phone': [
        '(555) 123-4567', '555-987-6543', '555.456.7890',
        '(555) 234-5678', '555-345-6789', '(555) 123-4567',
        '(555) 876-5432', '555-765-4321', '(555) 543-2109', 'invalid-phone'
    ]
}

# Create DataFrame
messy_df = pd.DataFrame(messy_data)

# Display the messy dataset
print("Messy Dataset:")
print(messy_df)


Messy Dataset:
   customer_id            name                    email  age          city  \
0            1      John Smith           john@email.com   32      New York   
1            2        Jane Doe     jane.doe@company.com   28   Los Angeles   
2            3     Bob Johnson            bob@email.com   45       Chicago   
3            4     Alice Brown                     None   29       Houston   
4            5  CHARLIE WILSON        charlie@email.com   52      NEW YORK   
5            1      John Smith           john@email.com   32      New York   
6            6       David Lee          david@email.com   38       Phoenix   
7            7     emily davis  emily.davis@company.com   25       Chicago   
8            8    Frank Miller          frank@email.com   41  Philadelphia   
9            9    Grace Wilson          grace@email.com  150         Miami   

   salary signup_date           phone  
0   50000  2022-01-15  (555) 123-4567  
1   75000  2021-03-22    555-987-6543  
2   60

In [10]:
#Missing Values

In [11]:
# Check for missing values
print("Missing Values:")
print(messy_df.isnull().sum())
print("\n")

# Percentage of missing values
print("Missing Values Percentage:")
print((messy_df.isnull().sum() / len(messy_df)) * 100)


Missing Values:
customer_id    0
name           0
email          1
age            0
city           0
salary         0
signup_date    0
phone          0
dtype: int64


Missing Values Percentage:
customer_id     0.0
name            0.0
email          10.0
age             0.0
city            0.0
salary          0.0
signup_date     0.0
phone           0.0
dtype: float64


In [None]:
#Check for duplicates

In [12]:
# Check for duplicate rows
print("Duplicate Rows:")
print(messy_df.duplicated().sum())
print("\n")

# Check for duplicates based on specific columns (customer_id)
print("Duplicate customer_id values:")
print(messy_df['customer_id'].duplicated().sum())


Duplicate Rows:
1


Duplicate customer_id values:
1


In [None]:
#Inconsistent Text Formatting

In [13]:
# Check for inconsistent casing in 'name' and 'city' columns
print("Unique name formats (first 10):")
print(messy_df['name'].unique()[:10])
print("\n")

print("Unique city formats:")
print(messy_df['city'].unique())


Unique name formats (first 10):
['John Smith' 'Jane Doe' 'Bob Johnson' 'Alice Brown' 'CHARLIE WILSON'
 'David Lee' 'emily davis' 'Frank Miller' 'Grace Wilson']


Unique city formats:
['New York' 'Los Angeles' 'Chicago' 'Houston' 'NEW YORK' 'Phoenix'
 'Philadelphia' 'Miami']


In [None]:
#Invalid Values

In [14]:
# Check for invalid ages (outside reasonable range)
print("Age statistics:")
print(messy_df['age'].describe())
print("\n")

# Identify invalid ages
invalid_ages = messy_df[(messy_df['age'] < 18) | (messy_df['age'] > 100)]
print("Invalid ages:")
print(invalid_ages[['name', 'age']])


Age statistics:
count     10.000000
mean      47.200000
std       37.078895
min       25.000000
25%       29.750000
50%       35.000000
75%       44.000000
max      150.000000
Name: age, dtype: float64


Invalid ages:
           name  age
9  Grace Wilson  150


In [17]:
#Feature dates

In [19]:
# Convert to datetime and check for future dates
messy_df['signup_date'] = pd.to_datetime(messy_df['signup_date'])

# Identify future dates (after current date)
future_dates = messy_df[messy_df['signup_date'] > pd.Timestamp.now()]

print("Future signup dates:")
print(future_dates[['name', 'signup_date']])


Future signup dates:
Empty DataFrame
Columns: [name, signup_date]
Index: []


In [20]:
#Data profelling 

In [24]:
def basic_data_profile(df):
    """Generate a basic data profile to identify quality issues"""
    profile = pd.DataFrame({
        'data_type': df.dtypes,
        'missing_count': df.isnull().sum(),
        'missing_percentage': (df.isnull().sum() / len(df)) * 100,
        'unique_values': df.nunique(),
        'sample_values': [df[col].dropna().unique()[:3] for col in df.columns]
    })
    return profile


# Generate and display the profile
print("Data Profile:")
print(basic_data_profile(messy_df))


Data Profile:
                  data_type  missing_count  missing_percentage  unique_values  \
customer_id           int64              0                 0.0              9   
name                 object              0                 0.0              9   
email                object              1                10.0              8   
age                   int64              0                 0.0              9   
city                 object              0                 0.0              8   
salary                int64              0                 0.0              9   
signup_date  datetime64[ns]              0                 0.0              9   
phone                object              0                 0.0              9   

                                                 sample_values  
customer_id                                          [1, 2, 3]  
name                       [John Smith, Jane Doe, Bob Johnson]  
email        [john@email.com, jane.doe@company.com, bob@ema..

In [27]:
import pandas as pd
import numpy as np

# ---- 1. Fix missing values ----
# Fill missing emails with placeholder
messy_df['email'] = messy_df['email'].fillna('unknown@email.com')

# ---- 2. Remove duplicate rows ----
messy_df = messy_df.drop_duplicates()

# ---- 3. Normalize casing ----
messy_df['name'] = messy_df['name'].str.title()   # e.g., "CHARLIE WILSON" → "Charlie Wilson"
messy_df['city'] = messy_df['city'].str.title()   # e.g., "NEW YORK" → "New York"

# ---- 4. Handle invalid ages ----
median_age = messy_df[(messy_df['age'] >= 18) & (messy_df['age'] <= 100)]['age'].median()
messy_df.loc[(messy_df['age'] < 18) | (messy_df['age'] > 100), 'age'] = median_age

# ---- 5. Handle future signup dates ----
messy_df['signup_date'] = pd.to_datetime(messy_df['signup_date'])
messy_df.loc[messy_df['signup_date'] > pd.Timestamp.now(), 'signup_date'] = pd.Timestamp.now()

# ---- 6. Clean phone numbers (keep only digits) ----
messy_df['phone'] = messy_df['phone'].str.replace(r'\D', '', regex=True)
messy_df['phone'] = messy_df['phone'].apply(lambda x: x if len(x) == 10 else np.nan)  # invalid if not 10 digits

# ---- Final Cleaned Dataset ----
print("Cleaned Dataset:")
print(messy_df)


Cleaned Dataset:
   customer_id            name                    email  age          city  \
0            1      John Smith           john@email.com   32      New York   
1            2        Jane Doe     jane.doe@company.com   28   Los Angeles   
2            3     Bob Johnson            bob@email.com   45       Chicago   
3            4     Alice Brown        unknown@email.com   29       Houston   
4            5  Charlie Wilson        charlie@email.com   52      New York   
6            6       David Lee          david@email.com   38       Phoenix   
7            7     Emily Davis  emily.davis@company.com   25       Chicago   
8            8    Frank Miller          frank@email.com   41  Philadelphia   
9            9    Grace Wilson          grace@email.com   35         Miami   

   salary signup_date       phone  
0   50000  2022-01-15  5551234567  
1   75000  2021-03-22  5559876543  
2   60000  2023-05-10  5554567890  
3   55000  2022-11-30  5552345678  
4   80000  2021-07-18 

In [28]:
# ---- 1. Missing values and percentages ----
missing_counts = messy_df.isnull().sum()
missing_percent = (missing_counts / len(messy_df)) * 100
print("Missing Values:\n", missing_counts)
print("\nMissing Values Percentage:\n", missing_percent)
print("\n" + "-"*50)

# ---- 2. Duplicate customer_id check ----
duplicate_customer_ids = messy_df['customer_id'][messy_df['customer_id'].duplicated()]
print("Duplicate customer_id values:")
print(duplicate_customer_ids)
print("\n" + "-"*50)

# ---- 3. Invalid ages check ----
invalid_ages = messy_df[(messy_df['age'] < 18) | (messy_df['age'] > 100)]
print("Invalid ages (if any):")
print(invalid_ages[['name', 'age']])
print("\n" + "-"*50)

# ---- 4. Future signup dates check ----
future_dates = messy_df[messy_df['signup_date'] > pd.Timestamp.now()]
print("Future signup dates (if any):")
print(future_dates[['name', 'signup_date']])
print("\n" + "-"*50)

# ---- 5. Invalid phone numbers (NaN after cleaning) ----
invalid_phones = messy_df[messy_df['phone'].isnull()]
print("Invalid phone numbers (if any):")
print(invalid_phones[['name', 'phone']])


Missing Values:
 customer_id    0
name           0
email          0
age            0
city           0
salary         0
signup_date    0
phone          1
dtype: int64

Missing Values Percentage:
 customer_id     0.000000
name            0.000000
email           0.000000
age             0.000000
city            0.000000
salary          0.000000
signup_date     0.000000
phone          11.111111
dtype: float64

--------------------------------------------------
Duplicate customer_id values:
Series([], Name: customer_id, dtype: int64)

--------------------------------------------------
Invalid ages (if any):
Empty DataFrame
Columns: [name, age]
Index: []

--------------------------------------------------
Future signup dates (if any):
Empty DataFrame
Columns: [name, signup_date]
Index: []

--------------------------------------------------
Invalid phone numbers (if any):
           name phone
9  Grace Wilson   NaN


In [33]:
from IPython.display import FileLink

# Create a link to download the file
FileLink('cleaned_dataset.csv')
