In [2]:
import pandas as pd

data_df_resume = pd.read_csv("csv/resumes.csv")
data_df_essay = pd.read_csv("csv/essays.csv")

In [3]:
from nlp.models import model, util


def dedup_and_nona(df, threshold=0.85):
    df["content"] = df["content"].fillna("none")
    embeddings = model.encode(df["content"].fillna("none"), convert_to_tensor=True)
    cos_sim_matrix = util.cos_sim(embeddings, embeddings)

    n = len(df)
    to_drop = set()

    print("\n🔍 Dropped near-duplicate rows:\n")
    for i in range(n):
        if i in to_drop:
            continue
        for j in range(i + 1, n):
            if j not in to_drop:
                sim = cos_sim_matrix[i][j].item()
                if sim > threshold:
                    print(f"---")
                    print(
                        f"✅ Keeping row {i} (original):\n  \"{df.iloc[i]['content']}\""
                    )
                    print(
                        f"❌ Dropping row {j} (similar):\n  \"{df.iloc[j]['content']}\""
                    )
                    print(f"📈 Similarity score: {sim:.4f}")
                    print(f"---\n")
                    to_drop.add(j)

    # Drop duplicates
    deduped_df = df.drop(list(to_drop)).reset_index(drop=True)
    print("✅ Done.")
    return deduped_df

In [4]:
processed_df_resume = dedup_and_nona(data_df_resume)
processed_df_essay = dedup_and_nona(data_df_essay)


🔍 Dropped near-duplicate rows:

---
✅ Keeping row 0 (original):
  "Rami I. Ibrahimi"
❌ Dropping row 8 (similar):
  "Rami I. Ibrahimi"
📈 Similarity score: 1.0000
---

---
✅ Keeping row 0 (original):
  "Rami I. Ibrahimi"
❌ Dropping row 16 (similar):
  "RAMI I. IBRAHIMI"
📈 Similarity score: 1.0000
---

---
✅ Keeping row 0 (original):
  "Rami I. Ibrahimi"
❌ Dropping row 24 (similar):
  "RAMI I. IBRAHIMI"
📈 Similarity score: 1.0000
---

---
✅ Keeping row 1 (original):
  "San Francisco, CA 94158 (415) 806-5906 riibrahimi@gmail.com linkedin.com/in/rami-ibrahimi
Dual Citizenship: United States & Jordan"
❌ Dropping row 9 (similar):
  "San Francisco, CA · (415) 806-5906 · riibrahimi@gmail.com · linkedin.com/in/rami-ibrahimi ·github.com/Averroes90
Dual Citizenship: United States & Jordan"
📈 Similarity score: 0.9124
---

---
✅ Keeping row 1 (original):
  "San Francisco, CA 94158 (415) 806-5906 riibrahimi@gmail.com linkedin.com/in/rami-ibrahimi
Dual Citizenship: United States & Jordan"
❌ Dropping 

In [5]:
def add_unique_ids(df):
    data_df_nona_unique_ids = df.copy()
    data_df_nona_unique_ids["id"] = (
        data_df_nona_unique_ids["id"].astype(str)
        + "_"
        + data_df_nona_unique_ids.index.astype(str)
    )
    return data_df_nona_unique_ids

In [6]:
df_unique_id_rusume = add_unique_ids(processed_df_resume)
df_unique_id_essay = add_unique_ids(processed_df_essay)
resume_data = df_unique_id_rusume.to_dict(orient="records")
essay_data = df_unique_id_essay.to_dict(orient="records")

In [7]:
from aixplain.factories import IndexFactory

# Create an index
resume_index_name = "resume_data_deduped"
resume_index_description = "resumes without duplicates or NA"
resume_index = IndexFactory.create(resume_index_name, resume_index_description)

INFO:root:Start service for GET Model  - https://platform-api.aixplain.com/sdk/models/66eae6656eb56311f2595011 - {'Authorization': 'Token 2fa34265d7ae3236f75962bef0cd4f4b5bb267b947354c80a60ed0ff40fd79e6', 'Content-Type': 'application/json'}
INFO:root:Model Creation: Model 66eae6656eb56311f2595011 instantiated.
INFO:root:Result of request: 201 - {'status': 'SUCCESS', 'completed': True, 'data': '67f7e9b97fdf75001d7733d9', 'runTime': 0.405, 'usedCredits': 1.9e-05}
INFO:root:Start service for GET Model  - https://platform-api.aixplain.com/sdk/models/67f7e9b97fdf75001d7733d9 - {'Authorization': 'Token 2fa34265d7ae3236f75962bef0cd4f4b5bb267b947354c80a60ed0ff40fd79e6', 'Content-Type': 'application/json'}
INFO:root:Model Creation: Model 67f7e9b97fdf75001d7733d9 instantiated.


In [8]:
from aixplain.modules.model.record import Record

# Prepare the records
records = [
    Record(
        value=item["content"],
        value_type="text",
        id=item["id"],
        uri="",
        attributes={"category": item["category"]},
    )
    for item in resume_data
]

# Upsert records to the index
resume_index.upsert(records)

INFO:root:Result of request: 201 - {'status': 'SUCCESS', 'completed': True, 'data': 'success', 'runTime': 0.93, 'usedCredits': 0.000159}


ModelResponse(status=SUCCESS, data='[{'data': 'Rami I. Ibrahimi', 'dataType': 'text', 'document_id': 'Ibrahimi_Rami_120722.docx_0', 'uri': '', 'attributes': {'category': 'name'}}, {'data': 'San Francisco, CA 94158 (415) 806-5906 riibrahimi@gmail.com linkedin.com/in/rami-ibrahimi\nDual Citizenship: United States & Jordan', 'dataType': 'text', 'document_id': 'Ibrahimi_Rami_120722.docx_1', 'uri': '', 'attributes': {'category': 'contact_info'}}, {'data': 'none', 'dataType': 'text', 'document_id': 'Ibrahimi_Rami_120722.docx_2', 'uri': '', 'attributes': {'category': 'technical_skills'}}, {'data': '2018-Present\tGoogle LLC\nProduct Operations T/ Program Manager – Phones (2019-Present) Mountain View, CA\nManaged oversees third party team of data scientists to deliver machine learning predictive models for return rates & manufacturing cell qualification resulting in reduction of 15% in qualification costs\nCreated and executed new process for management of OpEx product costs in tandem with cros

In [9]:
from aixplain.factories import IndexFactory

# Create an index
essay_index_name = "essay_data_deduped"
essay_index_description = "free form without duplicates or NA"
essay_index = IndexFactory.create(essay_index_name, essay_index_description)

INFO:root:Start service for GET Model  - https://platform-api.aixplain.com/sdk/models/66eae6656eb56311f2595011 - {'Authorization': 'Token 2fa34265d7ae3236f75962bef0cd4f4b5bb267b947354c80a60ed0ff40fd79e6', 'Content-Type': 'application/json'}
INFO:root:Model Creation: Model 66eae6656eb56311f2595011 instantiated.
INFO:root:Result of request: 201 - {'status': 'SUCCESS', 'completed': True, 'data': '67f7e9c77fdf75001d773417', 'runTime': 0.413, 'usedCredits': 1.8e-05}
INFO:root:Start service for GET Model  - https://platform-api.aixplain.com/sdk/models/67f7e9c77fdf75001d773417 - {'Authorization': 'Token 2fa34265d7ae3236f75962bef0cd4f4b5bb267b947354c80a60ed0ff40fd79e6', 'Content-Type': 'application/json'}
INFO:root:Model Creation: Model 67f7e9c77fdf75001d773417 instantiated.


In [10]:
from aixplain.modules.model.record import Record

# Prepare the records
records = [
    Record(
        value=item["content"],
        value_type="text",
        id=item["id"],
        uri="",
        attributes={"category": item["category"]},
    )
    for item in essay_data
]

# Upsert records to the index
essay_index.upsert(records)

INFO:root:Result of request: 201 - {'status': 'SUCCESS', 'completed': True, 'data': 'success', 'runTime': 6.997, 'usedCredits': 0.002159}


ModelResponse(status=SUCCESS, data='[{'data': 'I began by working on identifying the major loss contributors and areas for improvement and creating an action plan with targets for the next three months.  \n---\nWe struggled to keep up, especially given technical knowledge gaps in some areas. \n\n---\nShortly thereafter, my supervisor called me for a meeting to discuss the progress and results.\n---\nHi highlighted my inability to get all the team members fully engaged in the project as a major weakness.\n---\nHe said that my career progress thus far was due to my analytical rigor and hard skills and what he perceived to be as a potential for growth and explained that in order to progress further I have to address this inadequacy.', 'dataType': 'text', 'document_id': 'Rami,_Darden_E1.docx_0', 'uri': '', 'attributes': {'category': 'free-form'}}, {'data': 'He asked about the latest updates and commended all the extra efforts and commitment we have been putting in.\n---\nAs I received the 

In [11]:
from aixplain.factories import IndexFactory

index_list = IndexFactory.list()["results"]
for index in index_list:
    print(index.id, index.name)

INFO:root:Start service for POST Models Paginate - https://platform-api.aixplain.com/sdk/models/paginate - {'Authorization': 'Token 2fa34265d7ae3236f75962bef0cd4f4b5bb267b947354c80a60ed0ff40fd79e6', 'Content-Type': 'application/json'} - {"q": "", "pageNumber": 0, "pageSize": 20, "functions": ["search"]}
INFO:root:Listing Models: Status of getting Models on Page 0: 201


67f7e9c77fdf75001d773417 essay_data_deduped
67f7e9b97fdf75001d7733d9 resume_data_deduped
67cb60b0f467fb001da5c88f Test Job Docs
67f598607fdf75001d758141 combined_data_deduped
67f499df7fdf75001d74c847 combined_data_w/o_markers6
67f48ec5ff5458001d859230 combined_data_w/o_markers5
67f4856aff5458001d858012 combined_data_w/o_markers4
67f1a3d4ff5458001d82ae86 combined_data parsed
67f183a77fdf75001d71ccb1 Parsed combined
67f2f06f7fdf75001d72a9d1 combined_data_w/o_markers2
67f2e4717fdf75001d72a1e5 combined_data_w/o_markers
67f6d8d4ff5458001d873a68 Knowledge Base Index
67f1a17c7fdf75001d71dd0e Parsed combined3
67f183fcff5458001d829c49 Parsed combined2
67f477307fdf75001d7490c6 combined_data_w/o_markers3
67f091d87fdf75001d715215 Parsed Resumes
