In [2]:
#  TASK 1: Data Redundancy Removal System

import pandas as pd

# Columns that together identify a record. The "id" column is deliberately
# excluded: two rows with the same name and email are treated as the same
# record regardless of the id they carry.
KEY_COLUMNS = ["name", "email"]


def validate_new_records(existing, incoming, keys=None):
    """Return the rows of *incoming* that may be added to *existing*.

    A row is kept only if its key (the values in *keys*) is not already
    present in *existing* AND it is the first row with that key inside
    *incoming* itself. Without the second condition a batch that repeats
    the same new record would reintroduce redundancy.

    Args:
        existing: DataFrame of records already stored.
        incoming: DataFrame of candidate records; must contain *keys*.
        keys: Column names forming the record key. Defaults to KEY_COLUMNS.

    Returns:
        A DataFrame with the accepted rows, keeping their original index
        labels from *incoming*.
    """
    # Always hand pandas a list: a tuple would be read as ONE column label.
    keys = list(KEY_COLUMNS if keys is None else keys)

    # Collapse repeats inside the batch first, keeping the first occurrence.
    batch = incoming.drop_duplicates(subset=keys, keep="first")

    # Compare keys as index entries so multi-column keys match row-wise.
    already_present = batch.set_index(keys).index.isin(
        existing.set_index(keys).index
    )
    return batch[~already_present]


# -------------------------------
# STEP 1: Load Dataset (simulate redundancy)
# -------------------------------
data = {
    "id": [1, 2, 3, 4, 2, 3],
    "name": ["Alice", "Bob", "Charlie", "David", "Bob", "Charlie"],
    "email": [
        "alice@mail.com",
        "bob@mail.com",
        "charlie@mail.com",
        "david@mail.com",
        "bob@mail.com",
        "charlie@mail.com",
    ],
}

df = pd.DataFrame(data)

print(" Original Data (with duplicates):")
print(df)

# -------------------------------
# STEP 2: Identify Duplicates
# -------------------------------
# keep=False flags every member of a duplicate group, not just the repeats,
# so the report shows the original row next to its copies.
duplicates = df[df.duplicated(subset=KEY_COLUMNS, keep=False)]
print("\n Duplicates Found:")
print(duplicates)

# -------------------------------
# STEP 3: Remove Redundancy
# -------------------------------
# The first occurrence of each key wins; later copies are dropped.
unique_df = df.drop_duplicates(subset=KEY_COLUMNS, keep="first")
print("\n Cleaned Data (Only Unique Records):")
print(unique_df)

# -------------------------------
# STEP 4: Validation Before Adding New Data
# -------------------------------
# Suppose new incoming data
new_data = pd.DataFrame([
    {"id": 5, "name": "Eve", "email": "eve@mail.com"},          # ✅ Unique
    {"id": 6, "name": "Bob", "email": "bob@mail.com"},          # ❌ Duplicate
    {"id": 7, "name": "Frank", "email": "frank@mail.com"},      # ✅ Unique
])

print("\n New Incoming Data:")
print(new_data)

# Validation: add only rows that are neither already stored nor repeated
# within the incoming batch itself.
validated_data = validate_new_records(unique_df, new_data)

print("\n Data After Validation (only unique kept):")
print(validated_data)

# -------------------------------
# STEP 5: Append Unique Data into Final Database
# -------------------------------
# reset_index gives the merged table a clean 0..n-1 index instead of the
# leftover labels from the two source frames.
final_df = pd.concat([unique_df, validated_data]).reset_index(drop=True)

print("\n Final Database (with no redundancy):")
print(final_df)

# -------------------------------
# STEP 6: Save to CSV (simulate cloud storage)
# -------------------------------
# index=False keeps the synthetic row index out of the saved file.
final_df.to_csv("cleaned_database.csv", index=False)
print("\n Data saved to cleaned_database.csv")


 Original Data (with duplicates):
   id     name             email
0   1    Alice    alice@mail.com
1   2      Bob      bob@mail.com
2   3  Charlie  charlie@mail.com
3   4    David    david@mail.com
4   2      Bob      bob@mail.com
5   3  Charlie  charlie@mail.com

 Duplicates Found:
   id     name             email
1   2      Bob      bob@mail.com
2   3  Charlie  charlie@mail.com
4   2      Bob      bob@mail.com
5   3  Charlie  charlie@mail.com

 Cleaned Data (Only Unique Records):
   id     name             email
0   1    Alice    alice@mail.com
1   2      Bob      bob@mail.com
2   3  Charlie  charlie@mail.com
3   4    David    david@mail.com

 New Incoming Data:
   id   name           email
0   5    Eve    eve@mail.com
1   6    Bob    bob@mail.com
2   7  Frank  frank@mail.com

 Data After Validation (only unique kept):
   id   name           email
0   5    Eve    eve@mail.com
2   7  Frank  frank@mail.com

 Final Database (with no redundancy):
   id     name             email
0   1  