# Task
Convert the CSV data from the file 'Kamshad-Kooshan.csv' into a JSON format, where the 'images_clean' column values are transformed into URLs. The desired JSON structure for each record is: `{"id": "...", "cleaned_bio": "...", "images": ["url1", "url2", ...]}`.

## Load the csv data

### Subtask:
Load the CSV file into a pandas DataFrame.


**Reasoning**:
The first step is to load the CSV file into a pandas DataFrame as instructed.



In [None]:
import pandas as pd

# Location of the source CSV inside the Colab runtime.
CSV_PATH = '/content/final_peaple_dataframe.csv'

# Read the whole file into a DataFrame; later cells refer to it as `df`.
df = pd.read_csv(CSV_PATH)

# Last expression of the cell: (rows, columns) of the loaded frame.
df.shape

(2111, 4)

## Define a function to process each row

### Subtask:
Create a function that takes a row of the DataFrame and transforms it into the desired JSON structure. This function should also handle generating the image URLs.


**Reasoning**:
Define a function to transform a row of the DataFrame into the desired JSON structure.



# Task
Convert a CSV file located at "/Shared drives/shared/biography.csv" to a JSON format, where the image paths in the CSV are converted to Google Drive shareable URLs. The output JSON should have the structure `{"id": "...", "cleaned_bio": "...", "images": ["..."]}` and be saved to "/Shared drives/shared/biography.json".

## Mount google drive

### Subtask:
Mount your Google Drive to access the shared folder.


**Reasoning**:
Mount Google Drive to access the shared folder containing the CSV file.



In [None]:
from google.colab import drive

# Directory in the Colab filesystem under which Google Drive is mounted.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# --- Step 2: Install the Google API client and auth libraries ---
# (These packages provide the `googleapiclient` module imported in a later cell.)
!pip install google-api-python-client google-auth-httplib2 google-auth-oauthlib




In [None]:
# Authenticate the current Colab user before creating the Drive client.
# NOTE(review): presumably `build` below picks up these credentials
# implicitly — confirm against the google.colab / googleapiclient docs.
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build

# Build the Drive API (v3) service; `list_files_recursive` uses this
# notebook-level `drive_service` object for its files().list() calls.
drive_service = build('drive', 'v3')


In [None]:
def list_files_recursive(folder_id, service=None):
    """Collect every non-folder item below a Drive folder as ``{name: file_id}``.

    The folder is listed page by page; each sub-folder encountered is
    traversed recursively and its entries are merged into the result.

    Args:
        folder_id: ID of the Drive folder to scan.
        service: Object exposing ``files().list(...).execute()`` in the shape
            used below. Defaults to the notebook-level ``drive_service`` so
            existing single-argument calls keep working; passing it explicitly
            allows scanning with another client (or a stub).

    Returns:
        dict mapping file name to file ID. The mapping is keyed by name only,
        so when two files under the tree share a name the one visited last
        silently replaces the earlier one.
    """
    if service is None:
        # Fall back to the client created in the authentication cell.
        service = drive_service

    folder_mime = 'application/vnd.google-apps.folder'
    collected = {}
    page_token = None

    while True:
        result = service.files().list(
            q=f"'{folder_id}' in parents and trashed=false",
            pageSize=1000,
            fields="nextPageToken, files(id, name, mimeType)",
            pageToken=page_token
        ).execute()

        for entry in result.get('files', []):
            if entry['mimeType'] == folder_mime:
                # Recurse into the sub-folder with the same client.
                collected.update(list_files_recursive(entry['id'], service))
            else:
                collected[entry['name']] = entry['id']

        # A missing/empty token means the last page has been consumed.
        page_token = result.get('nextPageToken')
        if not page_token:
            break

    return collected

# Drive folder to scan — replace with your own "images_clean" folder ID.
FOLDER_ID = "1wuCk3Po7E9foHfJJbRYoIxUYWSLpuDDa"

# Name -> file ID lookup; the conversion cell uses it to build image URLs.
file_id_map = list_files_recursive(FOLDER_ID)

# Report how many files were found and preview the first ten (name, id) pairs.
total_files = len(file_id_map)
first_entries = list(file_id_map.items())[:10]
print("‚úÖ Found", total_files, "files")
print(first_entries)

‚úÖ Found 6238 files
[('250px-Ahmad_Shamlou_-_2_1.png', '1ZPoYJHr8z2EaRA7oAqndxvhOsNp88zl2'), ('250px-Rahi_moaieri.png', '1dU7a7xLYNPnhSvYYNF7iQrK3kgo-2OXn'), ('250px-Kamal_Khojandi_1.png', '1Nygio1QUx-vkznb5zGiuD7INO4ajxh66'), ('40px-Wikiquote-logo.svg_4.png', '1FsXgw5H7tCmrYdBkh6b1m5i_AydjJIpI'), ('250px-Jami_Rose_Garden.png', '1ouxqmpBIKlMiU21Pzmnr7q8uXgLR2ymk'), ('250px-%D9%85%D9%88%D9%84%D8%A7%D9%86%D8%A7_%D8%A7%D8%AB%D8%B1_%D8%AD%D8%B3%DB%8C%D9%86_%D8%A8%D9%87%D8%B2%D8%A7%D8%AF_%2_1.png', '1bE32Xr0_dOlJ0IH5FJ-z11wwgbE4j-f4'), ('250px-Sadi_in_a_Rose_garden.png', '19_jACZAQZiMJQJ3ulqbdVri7pIPaKrdX'), ('250px-Nizami_Rug_Crop.png', '1TWGGcSM6VwXjCpUpgUj6gcyRswDx7_T0'), ('330px-%D8%AA%D9%86%D8%AF%DB%8C%D8%B3_%D9%88_%D9%85%D9%82%D8%A8%D8%B1%D9%87_%D8%B9%D8%B7%D8%A7%D8%B1_%D9%86%DB%8C%D8%B4%D.png', '1keyyE-sz7hfIP8ohMcW29geMDJ8bUz6-'), ('250px-Khwaja_Abdullah_Ansari_portrait_1.png', '1kuNEccLUaIHaFM-T9ySmhEWPbmlLAgrS')]


In [None]:
import pandas as pd
import json
import requests
from tqdm import tqdm   # progress bar

def _none_if_missing(value):
    """Return None for a pandas missing value (NaN/None/NA), else the value itself.

    Keeps bare ``NaN`` tokens (not valid JSON) out of the output file; missing
    fields are written as ``null`` instead.
    """
    return None if pd.isna(value) else value


def _image_filenames(cell):
    """Split one ``images_clean`` cell into bare file names.

    The cell holds "|"-separated entries, each optionally prefixed with
    ``images_clean\\``. A missing cell yields no names (instead of the literal
    string "nan"), and empty fragments left by stray delimiters are skipped.
    """
    if pd.isna(cell):
        return []
    stripped = (
        part.strip().replace("images_clean\\", "")
        for part in str(cell).split("|")
    )
    return [name for name in stripped if name]


output = []
valid_count = 0    # image references resolved to a Drive file ID
invalid_count = 0  # image references with no entry in file_id_map

for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing rows"):
    image_urls = []
    for filename in _image_filenames(row["images_clean"]):
        # Resolve the file name to a Drive file ID. The URL itself is not
        # fetched or verified; "valid" only means the name exists in the map.
        file_id = file_id_map.get(filename)
        if file_id:
            image_urls.append(f"https://drive.google.com/uc?id={file_id}")
            valid_count += 1
        else:
            invalid_count += 1

    output.append({
        "id": _none_if_missing(row["name"]),
        "cleaned_bio": _none_if_missing(row["final_paragraph"]),
        "images": image_urls,  # zero or more resolved image URLs
    })

# Save JSON
json_path = "output.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(output, f, ensure_ascii=False, indent=2)

print("‚úÖ JSON saved to:", json_path)
print(f"‚úîÔ∏è Valid image URLs: {valid_count}")
print(f"‚ùå Invalid/missing image URLs: {invalid_count}")
print(f"üìä Total rows processed: {len(df)}")


Processing rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2111/2111 [00:00<00:00, 10073.19it/s]

‚úÖ JSON saved to: output.json
‚úîÔ∏è Valid image URLs: 2227
‚ùå Invalid/missing image URLs: 0
üìä Total rows processed: 2111





In [None]:
# Upgrade the packaging toolchain first, then install the pinned packages.
!pip install --upgrade pip setuptools wheel
!pip install numpy==1.26.4
!pip install hazm==0.7.0
# rapidfuzz is the only package here installed without a version pin.
!pip install rapidfuzz
# NOTE(review): the recorded install log shows hazm==0.7.0 collecting
# nltk==3.3, so this later pin replaces the version hazm declares —
# confirm that the mismatch is intended.
!pip install "nltk==3.6.7"


Collecting hazm==0.7.0
  Using cached hazm-0.7.0-py3-none-any.whl.metadata (3.7 kB)
Collecting nltk==3.3 (from hazm==0.7.0)
  Using cached nltk-3.3.0.zip (1.4 MB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting libwapiti>=0.2.1 (from hazm==0.7.0)
  Using cached libwapiti-0.2.1.tar.gz (233 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading hazm-0.7.0-py3-none-any.whl (316 kB)
Building wheels for collected packages: nltk, libwapiti
[33m  DEPRECATION: Building 'nltk' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'nltk'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Cr

In [None]:
import json
def _read_json(path):
    """Parse the UTF-8 JSON file at `path` and return the decoded object."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


# data1: records written by the CSV -> JSON cell; data2: the second dataset.
data1 = _read_json("/content/output.json")
data2 = _read_json("/content/updated_persons.json")


In [None]:

import json
import random
from rapidfuzz import fuzz, process

def normalize_name(name: str) -> str:
    """Trim a name and collapse each run of whitespace to a single space.

    Anything that is not a ``str`` normalizes to the empty string.
    """
    # str.split() with no argument already drops leading/trailing whitespace.
    return " ".join(name.split()) if isinstance(name, str) else ""

# Merge dataset1 (data1) into dataset2 (data2) by person name.
#
# For each dataset1 record, look for a dataset2 record with the same
# normalized name, falling back to the best fuzzy match at or above
# FUZZY_THRESHOLD. Matched pairs become one record that keeps dataset2's id
# and bio; unmatched dataset1 records are kept as-is, and dataset2 records
# that were never matched are appended at the end.

FUZZY_THRESHOLD = 90   # minimum fuzz.ratio score accepted as the same person
IMAGE_CHOICE_SEED = 42  # fixed seed so re-running the cell gives the same file
merged_path = "merged_data.json"

# Lookup for dataset2, keyed by normalized name (a later duplicate name
# replaces an earlier one).
data2_lookup = {normalize_name(d["id"]): d for d in data2}
merged = []
used_data2 = set()
match_count = 0  # dataset1 records paired with a dataset2 record
image_rng = random.Random(IMAGE_CHOICE_SEED)

for rec1 in data1:
    name1 = normalize_name(rec1["id"])

    if name1 in data2_lookup:
        matched_key = name1  # direct match
    else:
        match = process.extractOne(name1, data2_lookup.keys(), scorer=fuzz.ratio)
        matched_key = match[0] if match and match[1] >= FUZZY_THRESHOLD else None

    if matched_key is None:
        merged.append(rec1)
        continue

    # NOTE(review): several dataset1 records can match the same dataset2
    # record, in which case that dataset2 record appears more than once.
    rec2 = data2_lookup[matched_key]
    used_data2.add(matched_key)
    match_count += 1

    images1 = rec1.get("images")
    images2 = rec2.get("images")
    if images1 and images2:
        # Both sides have images: keep one of the two lists, picked at random.
        chosen_images = image_rng.choice([images1, images2])
    else:
        # Keep whichever side has images, or an empty list when neither does.
        chosen_images = images1 or images2 or []

    merged.append({
        "id": rec2["id"],  # prefer dataset2 name
        "cleaned_bio": rec2["cleaned_bio"],  # prefer dataset2 bio
        "images": chosen_images
    })

# Add remaining records from dataset2
for key, rec2 in data2_lookup.items():
    if key not in used_data2:
        merged.append(rec2)

# Save merged dataset
with open(merged_path, "w", encoding="utf-8") as f:
    json.dump(merged, f, ensure_ascii=False, indent=2)

print(f"‚úÖ Merged dataset saved to {merged_path}")
print(f"üî¢ Total matches found: {match_count}")
print(f"üìä Final merged dataset length: {len(merged)}")


‚úÖ Merged dataset saved to merged.json
üî¢ Total matches found: 1160
üìä Final merged dataset length: 2467


In [None]:

# Number of records loaded from /content/updated_persons.json (dataset2).
len(data2)


1549

In [None]:
# Number of records loaded from /content/output.json (dataset1).
len(data1)

2111

In [None]:
import json
import csv

# Read merged_data.json back in; the CSV export cell iterates over `json_data`.
json_path = "/content/merged_data.json"
with open(json_path, mode="r", encoding="utf-8") as merged_file:
    json_data = json.load(merged_file)

# Last expression of the cell: number of top-level entries that were loaded.
len(json_data)


2467

In [None]:
import json
import csv


# Destination of the flattened CSV export.
csv_path = "/content/output_merged.csv"

with open(csv_path, "w", newline="", encoding="utf-8") as out_file:
    writer = csv.writer(out_file)

    # Header row (column names are kept exactly as emitted before).
    writer.writerow(["id", "bio", "imag"])

    for entry in json_data:
        # Pull the three fields, defaulting absent keys to empty values.
        fields = {
            "id": entry.get("id", ""),
            "cleaned_bio": entry.get("cleaned_bio", ""),
            "images": entry.get("images", []),
        }

        # Report every field that is empty/falsy for this record.
        absent = [key for key, value in fields.items() if not value]
        if absent:
            print(f"‚ùó Warning: Row with id='{fields['id']}' is missing: {', '.join(absent)}")

        # The image list is stored as its Python string representation.
        writer.writerow([fields["id"], fields["cleaned_bio"], str(fields["images"])])

print(f"‚úÖ JSON data successfully converted to CSV and saved to {csv_path}")

‚úÖ JSON data successfully converted to CSV and saved to /content/output_merged.csv
