<a href="https://colab.research.google.com/github/2303a51885/B2_PFDS_1885/blob/main/PFDS_LAB06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np

# Employee data-cleaning walkthrough.
# Builds a small, deliberately messy employee table and cleans it in stages:
# de-duplicate -> fill missing text -> coerce Salary to numeric -> coerce
# Joining_Date to datetime -> rename columns. The cleaned result is left in `df`.

# Step 1: Create a raw Employee dataset with issues
# (one fully repeated row, missing Name/Department/Salary values, a
# non-numeric salary 'abc' and an unparseable date 'Invalid')
data = {
    'Emp_ID': [101, 102, 103, 104, 105, 102, 106],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Bob', None],
    'Department': ['HR', 'IT', 'Finance', None, 'IT', 'IT', 'Finance'],
    'Salary': ['50000', '60000', None, '70000', 'abc', '60000', '55000'],  # salary issues
    'Joining_Date': ['2022-01-15', '2021-07-20', '2023-03-10', 'Invalid', '2020-11-05', '2021-07-20', '2022-08-25']
}

df = pd.DataFrame(data)

print("Original Raw Dataset:\n", df, "\n")

# -------------------------------------------------------
# Step 2: Drop duplicate rows
# Only rows identical in every column are removed; the original index labels
# are kept, so the surviving rows are labelled 0-4 and 6.
df = df.drop_duplicates()
print("After Removing Duplicates:\n", df, "\n")

# -------------------------------------------------------
# Step 3: Handle missing values
# Fill missing 'Name' with 'Unknown'
df['Name'] = df['Name'].fillna('Unknown')

# Fill missing Department with mode (most frequent).
# mode() returns its values sorted, so on a tie the first one is used.
df['Department'] = df['Department'].fillna(df['Department'].mode()[0])

# -------------------------------------------------------
# Step 4: Fix Salary column (convert to numeric, coerce errors to NaN)
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')

# Replace missing/invalid Salary with median (median() skips NaN by default)
df['Salary'] = df['Salary'].fillna(df['Salary'].median())

# -------------------------------------------------------
# Step 5: Fix Joining_Date column (convert to datetime, handle errors)
df['Joining_Date'] = pd.to_datetime(df['Joining_Date'], errors='coerce')

# Fill invalid/missing dates with today's date.
# normalize() truncates the timestamp to midnight: without it the fill value
# carries the current time of day, which makes the whole column print with a
# microsecond-precision time part and changes on every run within a day.
df['Joining_Date'] = df['Joining_Date'].fillna(pd.Timestamp.today().normalize())

# -------------------------------------------------------
# Step 6: Rename columns for consistency
# 'Salary' already has its final name, so it needs no entry in the mapping.
df = df.rename(columns={
    'Emp_ID': 'EmployeeID',
    'Name': 'EmployeeName',
    'Department': 'Dept',
    'Joining_Date': 'JoiningDate'
})

# -------------------------------------------------------
# Final Cleaned Dataset
print("Final Cleaned Employee Dataset:\n", df)
print("\nData Types:\n", df.dtypes)


Original Raw Dataset:
    Emp_ID     Name Department Salary Joining_Date
0     101    Alice         HR  50000   2022-01-15
1     102      Bob         IT  60000   2021-07-20
2     103  Charlie    Finance   None   2023-03-10
3     104    David       None  70000      Invalid
4     105      Eva         IT    abc   2020-11-05
5     102      Bob         IT  60000   2021-07-20
6     106     None    Finance  55000   2022-08-25 

After Removing Duplicates:
    Emp_ID     Name Department Salary Joining_Date
0     101    Alice         HR  50000   2022-01-15
1     102      Bob         IT  60000   2021-07-20
2     103  Charlie    Finance   None   2023-03-10
3     104    David       None  70000      Invalid
4     105      Eva         IT    abc   2020-11-05
6     106     None    Finance  55000   2022-08-25 

Final Cleaned Employee Dataset:
    EmployeeID EmployeeName     Dept   Salary                JoiningDate
0         101        Alice       HR  50000.0 2022-01-15 00:00:00.000000
1         102     

In [1]:
import pandas as pd

# Sales de-duplication walkthrough.
# Step 1: Build a small sales table in which transactions 102 and 104 were
# each recorded twice (the last two rows repeat earlier rows exactly).
data = {
    'TransactionID': [101, 102, 103, 104, 105, 102, 104],
    'Customer': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Bob', 'David'],
    'Product': ['Laptop', 'Mobile', 'Tablet', 'Laptop', 'Mobile', 'Mobile', 'Laptop'],
    'Amount': [800, 500, 300, 800, 500, 500, 800],
    'Date': [
        '2025-09-01', '2025-09-02', '2025-09-03', '2025-09-04', '2025-09-05',
        '2025-09-02', '2025-09-04',
    ],
}
sales_df = pd.DataFrame(data)

print("Original Sales Dataset:", sales_df, sep="\n")

# Step 2: Flag every row that repeats an earlier row in all columns.
# The first occurrence of each row is not flagged.
repeat_mask = sales_df.duplicated()
duplicates = sales_df[repeat_mask]
print("\nDuplicate Records:", duplicates, sep="\n")

# Step 3: Keep only the rows that were not flagged as repeats.
cleaned_sales = sales_df.loc[~repeat_mask]
print("\nCleaned Sales Dataset (Duplicates Removed):", cleaned_sales, sep="\n")


Original Sales Dataset:
   TransactionID Customer Product  Amount        Date
0            101    Alice  Laptop     800  2025-09-01
1            102      Bob  Mobile     500  2025-09-02
2            103  Charlie  Tablet     300  2025-09-03
3            104    David  Laptop     800  2025-09-04
4            105      Eva  Mobile     500  2025-09-05
5            102      Bob  Mobile     500  2025-09-02
6            104    David  Laptop     800  2025-09-04

Duplicate Records:
   TransactionID Customer Product  Amount        Date
5            102      Bob  Mobile     500  2025-09-02
6            104    David  Laptop     800  2025-09-04

Cleaned Sales Dataset (Duplicates Removed):
   TransactionID Customer Product  Amount        Date
0            101    Alice  Laptop     800  2025-09-01
1            102      Bob  Mobile     500  2025-09-02
2            103  Charlie  Tablet     300  2025-09-03
3            104    David  Laptop     800  2025-09-04
4            105      Eva  Mobile     500  2025

In [2]:
import pandas as pd

# Data-type correction walkthrough.
# Step 1: Build an employee table whose Salary and Joining_Date columns are
# held as plain strings rather than as numbers and dates.
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Salary': ['50000', '60000', '55000', '70000', '65000'],
    'Joining_Date': ['2022-01-15', '2021-07-20', '2023-03-10', '2020-11-05', '2022-08-25'],
}
employee_df = pd.DataFrame(data)

print("Original Employee Dataset:", employee_df.dtypes, employee_df, sep="\n")

# Steps 2 and 3: Parse each string column with the matching pandas converter,
# Salary first (to numeric), then Joining_Date (to datetime).
column_parsers = (
    ('Salary', pd.to_numeric),
    ('Joining_Date', pd.to_datetime),
)
for column_name, parse in column_parsers:
    employee_df[column_name] = parse(employee_df[column_name])

print("\nCleaned Employee Dataset with Correct Data Types:", employee_df.dtypes, employee_df, sep="\n")


Original Employee Dataset:
EmployeeID       int64
Name            object
Salary          object
Joining_Date    object
dtype: object
   EmployeeID     Name Salary Joining_Date
0           1    Alice  50000   2022-01-15
1           2      Bob  60000   2021-07-20
2           3  Charlie  55000   2023-03-10
3           4    David  70000   2020-11-05
4           5      Eva  65000   2022-08-25

Cleaned Employee Dataset with Correct Data Types:
EmployeeID               int64
Name                    object
Salary                   int64
Joining_Date    datetime64[ns]
dtype: object
   EmployeeID     Name  Salary Joining_Date
0           1    Alice   50000   2022-01-15
1           2      Bob   60000   2021-07-20
2           3  Charlie   55000   2023-03-10
3           4    David   70000   2020-11-05
4           5      Eva   65000   2022-08-25
