# Data Exploration Notebook

This notebook contains exploratory analysis of the various datasets used in the project.
It includes logic from `check_unique_jobs.py` and `inspect_extra_data.py`.

In [None]:
import pandas as pd
import yaml
import os
import sys

# Treat the parent of the current working directory as the project root and
# put it on sys.path (once) so that `src` can be imported if needed.
_parent_of_cwd = os.path.join(os.getcwd(), os.pardir)
project_root = os.path.abspath(_parent_of_cwd)
if project_root not in sys.path:
    sys.path.append(project_root)

def load_config(config_path="../config/config.yaml"):
    """Load the project configuration from a YAML file.

    Args:
        config_path: Path to the YAML file. The default is a relative path,
            so it is resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content
        (a dict when the document is a mapping).

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # encoding, so non-ASCII values load the same way on every machine.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

# Read the configuration once; the cells below look dataset paths up in
# its "datasets" section.
config = load_config()
datasets_config = config["datasets"]
print("Config loaded.")

In [None]:
# Job Description Analysis
# Summarise the job-description CSV: row count, number of distinct titles,
# and the ten most frequent titles.
print("--- Job Description Analysis ---")
try:
    # Everything stays inside the try so that a missing config key, an
    # unreadable file, or a missing column is reported instead of raising.
    job_path = datasets_config['job_descriptions']
    df_jobs = pd.read_csv(job_path)
    print(f"Total Rows: {len(df_jobs)}")
    job_titles = df_jobs['Job Title']
    print(f"Unique Titles: {job_titles.nunique()}")
    print("Top 10 Titles:")
    title_counts = job_titles.value_counts()
    print(title_counts.head(10))
except Exception as err:
    print(f"Error loading job descriptions: {err}")

In [None]:
# Inspect other datasets
# For each configured dataset, print its columns and first row and show a
# three-row preview. Problems with one dataset are reported and do not stop
# the inspection of the remaining ones.
targets = ['student_reco', 'indian_colleges', 'world_universities']

for key in targets:
    if key not in datasets_config:
        print(f"Key {key} not found in config.")
        continue

    path = datasets_config[key]
    print(f"\n--- Inspecting {key} ---")
    try:
        df = pd.read_csv(path)
        print(f"Columns: {df.columns.tolist()}")
        if df.empty:
            # df.iloc[0] raises IndexError on a frame without rows; without
            # this branch a header-only CSV is misreported as a load error
            # and its preview is skipped.
            print("First row: <no rows>")
        else:
            print(f"First row: {df.iloc[0].to_dict()}")
        # `display` is not imported in this file; presumably the builtin
        # injected by IPython/Jupyter, so this cell expects a notebook kernel.
        display(df.head(3))
    except Exception as e:
        # Best-effort exploration: report the failure and move on.
        print(f"Error loading {key}: {e}")