In [None]:
import pandas as pd

# Load the raw resume and essay datasets. The paths are relative to the current
# working directory. Later cells read the `content` and `id` columns of both
# frames, and additionally `category` for the resume records.
data_df_resume = pd.read_csv("csv/resume.csv")
data_df_essay = pd.read_csv("csv/essay.csv")

In [None]:
from nlp.models import model, util


def dedup_and_nona(df, threshold=0.85):
    """Fill missing ``content`` values and drop near-duplicate rows.

    Missing ``content`` entries are replaced with the string ``"none"``. Every
    row's content is then embedded with ``model.encode`` and compared pairwise
    with ``util.cos_sim``. Rows are scanned in order; a row is dropped when its
    similarity to an earlier, still-kept row is strictly greater than
    ``threshold``. Each kept/dropped pair is printed with its score.

    Args:
        df: DataFrame with a ``content`` column. It is not modified.
        threshold: Similarity above which the later row of a pair is dropped.

    Returns:
        A new DataFrame with ``content`` filled, near-duplicates removed, and
        the index reset to ``0..n-1``.
    """
    # Work on a copy so the caller's frame is left untouched by the fill below.
    filled_df = df.copy()
    filled_df["content"] = filled_df["content"].fillna("none")

    # A plain list keeps the encoder input independent of the frame's index labels.
    texts = filled_df["content"].tolist()
    n = len(texts)
    to_drop = set()  # positional indices of the rows to remove

    print("\n🔍 Dropped near-duplicate rows:\n")
    # With fewer than two rows there is nothing to compare, so skip the encoding.
    if n > 1:
        embeddings = model.encode(texts, convert_to_tensor=True)
        cos_sim_matrix = util.cos_sim(embeddings, embeddings)

        for i in range(n):
            if i in to_drop:
                continue
            for j in range(i + 1, n):
                if j in to_drop:
                    continue
                sim = cos_sim_matrix[i][j].item()
                if sim > threshold:
                    print("---")
                    print(f"✅ Keeping row {i} (original):\n  \"{texts[i]}\"")
                    print(f"❌ Dropping row {j} (similar):\n  \"{texts[j]}\"")
                    print(f"📈 Similarity score: {sim:.4f}")
                    print("---\n")
                    to_drop.add(j)

    # `to_drop` holds positions, not index labels, so select the survivors by
    # position; a label-based drop is only correct for a default RangeIndex.
    keep_positions = [pos for pos in range(n) if pos not in to_drop]
    deduped_df = filled_df.iloc[keep_positions].reset_index(drop=True)
    print("✅ Done.")
    return deduped_df

In [None]:
# Fill missing `content` and remove near-duplicate rows from each dataset,
# using the default similarity threshold of dedup_and_nona (0.85).
processed_df_resume = dedup_and_nona(data_df_resume)
processed_df_essay = dedup_and_nona(data_df_essay)

In [None]:
def add_unique_ids(df):
    """Return a copy of ``df`` whose ``id`` values carry the row's index label.

    Each ``id`` is rewritten as ``"<id>_<index label>"`` (both parts converted
    to strings). The input frame itself is not modified.
    """
    id_prefix = df["id"].astype(str)
    index_suffix = df.index.astype(str)
    # `assign` works on a copy of the frame and replaces the existing `id` column.
    return df.assign(id=id_prefix + "_" + index_suffix)

In [None]:
# Suffix every id with its row index, then convert each frame into a list of
# per-row dicts (one dict per record, keyed by column name).
# NOTE(review): `df_unique_id_rusume` is presumably a typo for "resume"; the
# name is left unchanged here because other cells may reference it.
df_unique_id_rusume = add_unique_ids(processed_df_resume)
df_unique_id_essay = add_unique_ids(processed_df_essay)
resume_data = df_unique_id_rusume.to_dict(orient="records")
essay_data = df_unique_id_essay.to_dict(orient="records")

In [None]:
from aixplain.factories import IndexFactory

# Create an index for the deduplicated resume records; the object returned by
# IndexFactory.create is kept in `resume_index`.
# NOTE(review): presumably this is a remote call that creates a new index on
# every run — confirm before re-executing this cell.
resume_index_name = "resume_data_deduped"
resume_index_description = "resumes without duplicates or NA"
resume_index = IndexFactory.create(resume_index_name, resume_index_description)

In [None]:
from aixplain.modules.model.record import Record

# Prepare the records: one text Record per deduplicated resume row, carrying
# the row's id and its category as an attribute.
records = [
    Record(
        value=item["content"],
        value_type="text",
        id=item["id"],
        uri="",
        attributes={"category": item["category"]},
    )
    for item in resume_data
]

# Upsert the resume records into the resume index. The bare name `index` is
# only bound by the later essay cell, so using it here either raises NameError
# on a fresh run or writes resumes into the essay index.
resume_index.upsert(records)

In [None]:
from aixplain.factories import IndexFactory

# Create an index for the deduplicated essay (free-form) records.
# NOTE(review): unlike the resume cell, these names carry no `essay_` prefix,
# so `index` here refers specifically to the essay index.
index_name = "essay_data_deduped"
index_description = "free form without duplicates or NA"
index = IndexFactory.create(index_name, index_description)