In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.feature_engineering import combine_columns
from src.recruiter import find_top_applicants_with_filters, retrieve_top_applicants

In [17]:
df = pd.read_parquet('data/processed/prospects.parquet')

In [18]:
df['codigo_list'] = df['prospects'].apply(lambda x: [item['codigo'] for item in x])
df['situacao_candidato'] = df['prospects'].apply(lambda x: [item['situacao_candidado'] for item in x])

In [19]:
df_exploded = df.explode(['codigo_list','situacao_candidato'])
df_exploded

Unnamed: 0,prospects_id,titulo,modalidade,prospects,codigo_list,situacao_candidato
0,4530,CONSULTOR CONTROL M,,"[{'codigo': '25632', 'comentario': 'Encaminhad...",25632,Encaminhado ao Requisitante
0,4530,CONSULTOR CONTROL M,,"[{'codigo': '25632', 'comentario': 'Encaminhad...",25529,Encaminhado ao Requisitante
1,4531,2021-2607395-PeopleSoft Application Engine-Dom...,,"[{'codigo': '25364', 'comentario': 'Data de In...",25364,Contratado pela Decision
1,4531,2021-2607395-PeopleSoft Application Engine-Dom...,,"[{'codigo': '25364', 'comentario': 'Data de In...",25360,Encaminhado ao Requisitante
2,4532,,,[],,
...,...,...,...,...,...,...
14218,14219,,,[],,
14219,14220,Consultor Sênior Especialista SAP LES-TRA - 1433,,"[{'codigo': '16828', 'comentario': 'Recebeu a ...",16828,Desistiu
14219,14220,Consultor Sênior Especialista SAP LES-TRA - 1433,,"[{'codigo': '16828', 'comentario': 'Recebeu a ...",15042,Encaminhado ao Requisitante
14220,14221,Consultor Sênior Oracle EPM FCCS - 1434,,"[{'codigo': '49190', 'comentario': '', 'data_c...",49190,Prospect


In [20]:
relevant_keywords = ['Encaminhado ao Requisitante', 'Contratado pela Decision',
       'Documentação PJ', 'Aprovado', 'Entrevista Técnica',
       'Em avaliação pelo RH', 'Contratado como Hunting',
       'Entrevista com Cliente','Documentação CLT', 'Documentação Cooperado',
       'Encaminhar Proposta',
       'Proposta Aceita']
df_exploded['relevant'] = df_exploded['situacao_candidato'].apply(lambda x: 1 if x in relevant_keywords else 0)

In [21]:
hired_keywords = [ 'Contratado pela Decision', 'Aprovado',
       'Contratado como Hunting', 'Encaminhar Proposta',
       'Proposta Aceita']
df_exploded['hired'] = df_exploded['situacao_candidato'].apply(lambda x: 1 if x in hired_keywords else 0)
df_exploded

Unnamed: 0,prospects_id,titulo,modalidade,prospects,codigo_list,situacao_candidato,relevant,hired
0,4530,CONSULTOR CONTROL M,,"[{'codigo': '25632', 'comentario': 'Encaminhad...",25632,Encaminhado ao Requisitante,1,0
0,4530,CONSULTOR CONTROL M,,"[{'codigo': '25632', 'comentario': 'Encaminhad...",25529,Encaminhado ao Requisitante,1,0
1,4531,2021-2607395-PeopleSoft Application Engine-Dom...,,"[{'codigo': '25364', 'comentario': 'Data de In...",25364,Contratado pela Decision,1,1
1,4531,2021-2607395-PeopleSoft Application Engine-Dom...,,"[{'codigo': '25364', 'comentario': 'Data de In...",25360,Encaminhado ao Requisitante,1,0
2,4532,,,[],,,0,0
...,...,...,...,...,...,...,...,...
14218,14219,,,[],,,0,0
14219,14220,Consultor Sênior Especialista SAP LES-TRA - 1433,,"[{'codigo': '16828', 'comentario': 'Recebeu a ...",16828,Desistiu,0,0
14219,14220,Consultor Sênior Especialista SAP LES-TRA - 1433,,"[{'codigo': '16828', 'comentario': 'Recebeu a ...",15042,Encaminhado ao Requisitante,1,0
14220,14221,Consultor Sênior Oracle EPM FCCS - 1434,,"[{'codigo': '49190', 'comentario': '', 'data_c...",49190,Prospect,0,0


In [22]:
df_exploded = df_exploded.rename(columns={'codigo_list':'applicant_id'
                            })

In [23]:
df_applicants = pd.read_parquet('data/processed/applicants.parquet')
applicants_columns_to_combine = [
    'titulo_profissional', 'objetivo_profissional', 'area_atuacao',
    'conhecimentos_tecnicos', 'certificacoes', 'nivel_profissional',
    'nivel_academico', 'cursos', 'cv_pt',
    'nivel_ingles', 'nivel_espanhol'
]
df_applicants = combine_columns(df_applicants, applicants_columns_to_combine)
df_applicants = df_applicants.rename(columns={'applicants_id':'applicant_id'})
df_exploded = df_exploded.merge(df_applicants[['applicant_id', 'text']], on='applicant_id', how='left')

In [24]:
df_exploded = df_exploded[['prospects_id','applicant_id','relevant','hired', 'text']].dropna(subset=['applicant_id']).rename(columns={'text':'applicants_text'})

In [25]:
df_jobs = pd.read_parquet('data/processed/vagas.parquet')

In [26]:
vagas_columns_to_combine = [
    'titulo_vaga', 'nivel profissional', 'nivel_academico',
    'nivel_ingles', 'nivel_espanhol', 'areas_atuacao',
    'principais_atividades', 'competencia_tecnicas_e_comportamentais',
    'habilidades_comportamentais_necessarias'
]

In [27]:
df_jobs = combine_columns(df_jobs, vagas_columns_to_combine)

In [28]:
df_jobs = df_jobs[['jobs_id','text']]

In [29]:
df_test = df_exploded.merge(df_jobs, left_on='prospects_id', right_on='jobs_id', how='left')

In [30]:
from src.embedding_manager import calculate_similarity
# Replace NaN values in the 'applicants_text' column with an empty string
df_test['applicants_text'] = df_test['applicants_text'].fillna('')
df_test['text'] = df_test['text'].fillna('')


In [32]:
df_test = df_test.sample(frac=0.1, random_state=42)


In [33]:
df_test.size

37632

In [37]:
# Calculate similarity and save progress
df_temp = pd.DataFrame()
batch_size = 100  # Define the batch size for saving progress
for i in range(0, len(df_test), batch_size):
    df_test_batch = df_test.iloc[i:i+batch_size]
    df_test_batch['similarity'] = df_test_batch.apply(
        lambda row: calculate_similarity(row['text'], row['applicants_text']), axis=1
    )
    # Save the processed batch back to the main dataframe
    df_temp = pd.concat([df_temp, df_test_batch], ignore_index=True)
        # Save the intermediate results to a file


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_batch['similarity'] = df_test_batch.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_batch['similarity'] = df_test_batch.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test_batch['similarity'] = df_test_batch.apply(
A value is trying to be set on a copy of a slice fro

KeyboardInterrupt: 

In [38]:
df_temp

Unnamed: 0,prospects_id,applicant_id,relevant,hired,applicants_text,jobs_id,text,similarity
0,1938,18301,1,0,titulo_profissional: objetivo_profissional: ...,1938,titulo_vaga: SAP ABAP - 20207041623 nivel prof...,0.726363
1,9754,37578,1,0,titulo_profissional: Android/IOS objetivo_prof...,9754,titulo_vaga: Software Developer (iOS or Androi...,0.591533
2,1127,14978,1,0,titulo_profissional: objetivo_profissional: ...,1127,titulo_vaga: Desenvolvedor .Net (TypeScript) -...,0.533866
3,8799,34486,0,0,titulo_profissional: Desenvolvedor IOS objetiv...,8799,titulo_vaga: Android Dev - Senior nivel profis...,0.641164
4,12392,26568,0,0,titulo_profissional: Consultor SAP PP QM objet...,12392,titulo_vaga: SAP PP/QM - SV-124 nivel profissi...,0.781615
...,...,...,...,...,...,...,...,...
695,11670,17912,1,0,titulo_profissional: objetivo_profissional: ...,11670,titulo_vaga: SAP ABAP/PI nivel profissional: A...,0.803015
696,4209,22192,0,0,titulo_profissional: objetivo_profissional: ...,4209,titulo_vaga: Senior DevOps Engineer nivel prof...,0.521762
697,3740,22905,1,0,,3740,titulo_vaga: Desenvolvedor Web .Net/C# - Junio...,0.015633
698,5547,27311,0,0,"titulo_profissional: Consultor Oracle, Analist...",5547,titulo_vaga: Analista Sistema Oracle Sênior. n...,0.724671


In [39]:
df_grouped = df_temp.groupby('prospects_id').agg(
    applicant_id = ("applicant_id", list),
    num_applicants = ('applicant_id', 'count'),
    relevant = ('relevant', list),
    hired = ('hired', list),
    text = ('text', 'first'),
    mean_similarity = ('similarity', list),
).reset_index()



In [40]:
from src.recruiter import RecruiterBot
import os, sys, yaml
from src.embedding_manager import EmbeddingManager
from src.indexer import FAISSIndexer

with open(os.path.join( "src", "config", "index_config.yaml")) as f:
    index_cfg = yaml.safe_load(f)
emb_mgr = EmbeddingManager(config_path=os.path.join( "models_config.yaml"))
indexer = FAISSIndexer(index_cfg)
# load your applicants_df here (parquet/csv)
bot = RecruiterBot( emb_mgr, indexer)

In [41]:
df_grouped['top_applicants'] = df_grouped.apply(lambda row:find_top_applicants_with_filters(
                job_description=row['text'],
                faiss_indexer=indexer,
                emb_mgr=emb_mgr,
                filters={},
                top_n=row['num_applicants']), axis=1)



In [42]:
df_grouped['top_applicants_ids'] = df_grouped['top_applicants'].apply(lambda x: [item['applicant_id'] for item in x if 'metadata' in item and 'jobs_id' in item['metadata']])
df_grouped['top_applicants_scores'] = df_grouped['top_applicants'].apply(lambda x: [item['score'] for item in x if 'score' in item])

In [43]:
df_grouped.head(1)

Unnamed: 0,prospects_id,applicant_id,num_applicants,relevant,hired,text,mean_similarity,top_applicants,top_applicants_ids,top_applicants_scores
0,10002,[38343],1,[1],[0],titulo_vaga: Salesforce / Desenvolvedor 132712...,[0.6670516729354858],"[{'applicant_idx': 4127, 'applicant_id': '1117...",[11178],[1.0000001192092896]


In [44]:
df_validation = df_grouped[['prospects_id','applicant_id','top_applicants_ids','relevant','hired', 'mean_similarity', 'top_applicants_scores']]

In [45]:
df_validation['hired_similarity'] = df_validation.apply(
	lambda row: [applicant_id for hired, applicant_id in zip(row['hired'], row['mean_similarity']) if hired == 1],
	axis=1
)
df_validation['relevant_similarity'] = df_validation.apply(
	lambda row: [applicant_id for relevant, applicant_id in zip(row['relevant'], row['mean_similarity']) if relevant == 1],
	axis=1
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validation['hired_similarity'] = df_validation.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_validation['relevant_similarity'] = df_validation.apply(


In [46]:
df_validation.head(1)

Unnamed: 0,prospects_id,applicant_id,top_applicants_ids,relevant,hired,mean_similarity,top_applicants_scores,hired_similarity,relevant_similarity
0,10002,[38343],[11178],[1],[0],[0.6670516729354858],[1.0000001192092896],[],[0.6670516729354858]


## calculate score relevant and hired

In [47]:
df_validation['mean_prospects_similarity'] = df_validation['mean_similarity'].apply(lambda x: sum(x) / len(x) if len(x) > 0 else 0)
df_validation['mean_top_applicants_score'] = df_validation['top_applicants_scores'].apply(lambda x: sum(x) / len(x) if len(x) > 0 else 0)
df_validation['mean_hired_similarity'] = df_validation['hired_similarity'].apply(lambda x: sum(x) / len(x) if len(x) > 0 else 0)
df_validation['mean_relevant_similarity'] = df_validation['relevant_similarity'].apply(lambda x: sum(x) / len(x) if len(x) > 0 else 0)

In [48]:
df_validation = df_validation[['prospects_id', 'applicant_id', 'top_applicants_ids',  'mean_prospects_similarity',
       'mean_top_applicants_score', 'mean_hired_similarity',
       'mean_relevant_similarity']]

In [49]:
df_validation.to_parquet('data/processed/validation.parquet', index=False)

In [50]:
print(f"Mean hired similarity: {df_validation['mean_hired_similarity'].mean():.4f}")
print(f"Mean relevant similarity: {df_validation['mean_relevant_similarity'].mean():.4f}")
print(f"Mean prospects similarity: {df_validation['mean_prospects_similarity'].mean():.4f}")
print(f"Mean top applicants score: {df_validation['mean_top_applicants_score'].mean():.4f}")

Mean hired similarity: 0.0404
Mean relevant similarity: 0.2210
Mean prospects similarity: 0.5337
Mean top applicants score: 1.0000
