# Learning Outcomes for this Module on Big Data (6CS030):
#### LO1 – Apply appropriate theory, tools and techniques to problems associated with big data.
#### LO2 – Synthesise solutions to problems from the big data domain.
#### LO3 – Analyse and evaluate solutions to big data problems.
#### LO4 – Present results of solutions using appropriate methods.

# Gathering dataset sources
- Datasetlist - www.datasetlist.com
- Kaggle - www.kaggle.com

## Get list of datasets from datasetlist.com


In [3]:
import json
import csv
import re

# Convert the dataset catalogue embedded in the datasetlist.com JavaScript
# file into a flat CSV file.
file_path = "./source/datasets.js"

with open(file_path, "r", encoding="utf-8") as file:
    js_content = file.read()

# The catalogue is assigned as `var datasetList = [...];`. The match is
# non-greedy, so it ends at the first "];" after the opening bracket.
match = re.search(r"var datasetList = (\[.*?\]);", js_content, re.DOTALL)
if match:
    dataset_json = match.group(1)  # The array literal on its own
    dataset_list = json.loads(dataset_json)  # Parsed into a Python list

    # Define CSV output file
    csv_filename = "./csv_files/dataset_list.csv"

    # (CSV column, source key) pairs; "Category" is read from the "type" key.
    # The PDF column is handled separately below.
    columns = [
        ("ID", "id"),
        ("Category", "type"),
        ("Name", "name"),
        ("Description", "description"),
        ("Year", "year"),
        ("Link", "link"),
        ("License", "license"),
    ]

    with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)

        # Write CSV header
        writer.writerow([header for header, _ in columns] + ["PDF"])

        # Write dataset rows
        for d in dataset_list:
            # A missing key becomes "N/A" instead of raising KeyError and
            # leaving a half-written CSV behind. Values that are present are
            # written unchanged, even when empty.
            row = [d.get(key, "N/A") for _, key in columns]
            # The PDF link is also replaced when it is present but empty.
            row.append(d.get("pdf") or "N/A")
            writer.writerow(row)

    print(f"✅ Successfully saved {len(dataset_list)} datasets to {csv_filename}")

else:
    print("❌ Dataset list not found in the JavaScript file.")


✅ Successfully saved 295 datasets to ./csv_files/dataset_list.csv


## Preprocess dataset list from Kaggle

### 1. Computer vision

In [1]:
import json
import csv
import re

# Flatten the saved Kaggle response for computer-vision datasets into a CSV
# file with the columns ID, Category, Name, Description, Year, Link, License
# and PDF.
file_path = "./source/kaggle_cv_datasets.json"  # Update if needed

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)  # Load JSON

# Items are read from datasetList.items. The `or` fallbacks also cover keys
# that are present but null, which .get(key, default) would return as None.
dataset_list = (data.get("datasetList") or {}).get("items") or []

# Define CSV output file
csv_filename = "./csv_files/kaggle_cv_datasets.csv"

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Write CSV header
    writer.writerow(["ID", "Category", "Name", "Description", "Year", "Link", "License", "PDF"])

    # Write dataset rows
    for d in dataset_list:
        # Nested objects may be absent or null; treat both as empty.
        vote_button = d.get("voteButton") or {}
        datasource = d.get("datasource") or {}

        dataset_id = vote_button.get("datasetId", "N/A")  # Dataset ID
        category = "cv"  # Every row in this file is tagged "cv"
        name = datasource.get("title", "N/A")  # Dataset name
        description = datasource.get("overview", "N/A")  # Description

        # Year is the first run of four digits in `dateUpdated`; a null date
        # is treated like a missing one instead of crashing re.search.
        year_match = re.search(r"(\d{4})", d.get("dateUpdated") or "")
        year = year_match.group(1) if year_match else "N/A"

        # Link is the Kaggle host followed by `datasetUrl`; a null URL must
        # not be rendered as the text "None".
        paper_link = f"https://www.kaggle.com{d.get('datasetUrl') or ''}"

        license_name = d.get("licenseName", "N/A")  # License

        # The PDF column holds the constant "Kaggle" rather than a URL.
        pdf_link = "Kaggle"

        # Write row to CSV
        writer.writerow([dataset_id, category, name, description, year, paper_link, license_name, pdf_link])

print(f"✅ Successfully saved {len(dataset_list)} Kaggle datasets to {csv_filename}")


✅ Successfully saved 60 Kaggle datasets to ./csv_files/kaggle_cv_datasets.csv


### 2. Natural Language Processing (NLP)

In [3]:
# Flatten the saved Kaggle response for NLP datasets into a CSV file with the
# columns ID, Category, Name, Description, Year, Link, License and PDF.
file_path = "./source/kaggle_nlp_datasets.json"  # Update if needed

with open(file_path, "r", encoding="utf-8") as file:
    data = json.load(file)  # Load JSON

# Items are read from datasetList.items. The `or` fallbacks also cover keys
# that are present but null, which .get(key, default) would return as None.
dataset_list = (data.get("datasetList") or {}).get("items") or []

# Define CSV output file
csv_filename = "./csv_files/kaggle_nlp_datasets.csv"

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)

    # Write CSV header
    writer.writerow(["ID", "Category", "Name", "Description", "Year", "Link", "License", "PDF"])

    # Write dataset rows
    for d in dataset_list:
        # Nested objects may be absent or null; treat both as empty.
        vote_button = d.get("voteButton") or {}
        datasource = d.get("datasource") or {}

        dataset_id = vote_button.get("datasetId", "N/A")  # Dataset ID
        category = "nlp"  # Every row in this file is tagged "nlp"
        name = datasource.get("title", "N/A")  # Dataset name
        description = datasource.get("overview", "N/A")  # Description

        # Year is the first run of four digits in `dateUpdated`; a null date
        # is treated like a missing one instead of crashing re.search.
        year_match = re.search(r"(\d{4})", d.get("dateUpdated") or "")
        year = year_match.group(1) if year_match else "N/A"

        # Link is the Kaggle host followed by `datasetUrl`; a null URL must
        # not be rendered as the text "None".
        paper_link = f"https://www.kaggle.com{d.get('datasetUrl') or ''}"

        license_name = d.get("licenseName", "N/A")  # License

        # The PDF column holds the constant "Kaggle" rather than a URL.
        pdf_link = "Kaggle"

        # Write row to CSV
        writer.writerow([dataset_id, category, name, description, year, paper_link, license_name, pdf_link])

print(f"✅ Successfully saved {len(dataset_list)} Kaggle datasets to {csv_filename}")


✅ Successfully saved 80 Kaggle datasets to ./csv_files/kaggle_nlp_datasets.csv


## Next steps
On Elasticsearch:
- Implement semantic search on the extracted dataset
- Test the functionality
- Implement features (ranking, suggestions)
- Add more data to the index (DB)