# Cleaning and Structuring the Data

## Task 1: Identify Issues in the Data

In [18]:
print(open("data1.json").read())


{
    "users": [
        {"id": 1, "name": "Amit", "friends": [2, 3], "liked_pages": [101]},
        {"id": 2, "name": "Priya", "friends": [1, 4], "liked_pages": [102]},
        {"id": 3, "name": "", "friends": [1], "liked_pages": [101, 103]},
        {"id": 4, "name": "Sara", "friends": [2, 2], "liked_pages": [104]},
        {"id": 5, "name": "Amit", "friends": [], "liked_pages": []}
    ],
    "pages": [
        {"id": 101, "name": "Python Developers"},
        {"id": 102, "name": "Data Science Enthusiasts"},
        {"id": 103, "name": "AI & ML Community"},
        {"id": 104, "name": "Web Dev Hub"},
        {"id": 104, "name": "Web Development"}
    ]
}


### Problems:
- User ID 3 has an empty name.
- User ID 4 has a duplicate friend entry.
- User ID 5 has no connections or liked pages (inactive user).
- The pages list contains duplicate page IDs.

## Task 2: Clean the Data
- Remove users with missing names.
- Remove duplicate friend entries.
- Remove inactive users (users with no friends and no liked pages).
- Deduplicate pages based on IDs.

# Actual working on raw data file :-

### Step 0: For Json file working lib use

In [27]:
import json
data = "data1.json"

In [28]:
type (data)

str

### Step 1: Funtion for load the external json data (str into dict)

In [29]:
def load_data (filename):
    with open (filename, "r") as f:
        data = json.load (f)
    return data

In [30]:
load_data("data1.json")

{'users': [{'id': 1, 'name': 'Amit', 'friends': [2, 3], 'liked_pages': [101]},
  {'id': 2, 'name': 'Priya', 'friends': [1, 4], 'liked_pages': [102]},
  {'id': 3, 'name': '', 'friends': [1], 'liked_pages': [101, 103]},
  {'id': 4, 'name': 'Sara', 'friends': [2, 2], 'liked_pages': [104]},
  {'id': 5, 'name': 'Amit', 'friends': [], 'liked_pages': []}],
 'pages': [{'id': 101, 'name': 'Python Developers'},
  {'id': 102, 'name': 'Data Science Enthusiasts'},
  {'id': 103, 'name': 'AI & ML Community'},
  {'id': 104, 'name': 'Web Dev Hub'},
  {'id': 104, 'name': 'Web Development'}]}

In [31]:
type (data)

str

### Step 2: Remove users with missing names

In [24]:
def clean_data(data):

    # Remove users with missing names
    clean_user = []
    for user in data["users"]:
      if user.get("name","").strip():
        clean_user.append(user)
      data["user"]= clean_user

    # Remove duplicate friends
    for user in data["users"]:
        user["friends"] = list(set(user["friends"]))

    # Remove inactive users
    data["users"] = [user for user in data["users"] if user["friends"] or user["liked_pages"]]

    # Remove duplicate pages
    unique_pages = {}
    for page in data["pages"]:
        unique_pages[page["id"]] = page
    data["pages"] = list(unique_pages.values())

change in str to dic

In [25]:
data = clean_data(data)
json.dump(data, open("cleaned_codebook_data.json", "w"), indent=4)
print("Data cleaned successfully!")

TypeError: string indices must be integers, not 'str'

In [13]:
print(open("data1.json").read())


{
    "users": [
        {"id": 1, "name": "Amit", "friends": [2, 3], "liked_pages": [101]},
        {"id": 2, "name": "Priya", "friends": [1, 4], "liked_pages": [102]},
        {"id": 3, "name": "", "friends": [1], "liked_pages": [101, 103]},
        {"id": 4, "name": "Sara", "friends": [2, 2], "liked_pages": [104]},
        {"id": 5, "name": "Amit", "friends": [], "liked_pages": []}
    ],
    "pages": [
        {"id": 101, "name": "Python Developers"},
        {"id": 102, "name": "Data Science Enthusiasts"},
        {"id": 103, "name": "AI & ML Community"},
        {"id": 104, "name": "Web Dev Hub"},
        {"id": 104, "name": "Web Development"}
    ]
}
