In [1]:
!pip install --quiet openai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m71.7/73.6 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
import numpy as np
import openai
from openai.embeddings_utils import cosine_similarity
from sklearn.metrics import accuracy_score

EMBEDDING_MODEL = "text-embedding-ada-002"
OPENAI_API_KEY = "API_KEY"

openai.api_key = OPENAI_API_KEY

reference_matrix_df = pd.read_csv("ENEM_Reference_Matrix.csv")
questions_df = pd.read_csv("ENEM_tagged_questions.csv")

reference_matrix_df = reference_matrix_df.dropna()
questions_df = questions_df.dropna()

In [4]:
reference_matrix_df.head()

Unnamed: 0,Discipline,Skill_Number,Skill_Description,Skill_Description_Improved
0,"Linguagens, Códigos e suas Tecnologias",1,Identificar as diferentes linguagens e seus re...,Identificar as diferentes linguagens e seus re...
1,"Linguagens, Códigos e suas Tecnologias",2,Recorrer aos conhecimentos sobre as linguagens...,Recorrer aos conhecimentos sobre as linguagens...
2,"Linguagens, Códigos e suas Tecnologias",3,Relacionar informações geradas nos sistemas de...,Relacionar informações geradas nos sistemas de...
3,"Linguagens, Códigos e suas Tecnologias",4,Reconhecer posições críticas aos usos sociais ...,Reconhecer posições críticas aos usos sociais ...
4,"Linguagens, Códigos e suas Tecnologias",5,Associar vocábulos e expressões de um texto em...,Associar vocábulos e expressões de um texto em...


In [5]:
questions_df.head()

Unnamed: 0,Year,Discipline,SG_AREA,TX_COR,CO_PROVA,Question_Number,Question_Text,Skill_Number
0,2022,Matemática e suas Tecnologias,MT,AZUL,1075,Questão 136 - Matemática e suas Tecnologias,Uma máquina em operação tem sua temperat...,25.0
1,2022,Matemática e suas Tecnologias,MT,AZUL,1075,Questão 137 - Matemática e suas Tecnologias,A World Series é a decisão do campeonat...,28.0
2,2022,Matemática e suas Tecnologias,MT,AZUL,1075,Questão 138 - Matemática e suas Tecnologias,O gráfico apresenta os totais de receitas e de...,27.0
3,2022,Matemática e suas Tecnologias,MT,AZUL,1075,Questão 139 - Matemática e suas Tecnologias,Um casal está reformando a cozinha de c...,11.0
4,2022,Matemática e suas Tecnologias,MT,AZUL,1075,Questão 140 - Matemática e suas Tecnologias,Foram convidadas 32 equipes para um torn...,3.0


## Zero-Shot Classification

In [16]:
def get_embedding(texts, model=EMBEDDING_MODEL):
   texts = [text.replace("\n", " ") for text in texts]

   embeddings = openai.Embedding.create(input = texts, model=model)['data']
   embeddings = [embedding['embedding'] for embedding in embeddings]

   return embeddings

In [39]:
import numpy as np

def skill_prediction_pipeline(reference_matrix_df, questions_df, get_embedding):
    # Get embeddings for skills
    reference_matrix_df['Skill_Embedding'] = get_embedding((reference_matrix_df['Discipline'] + " - " + reference_matrix_df['Skill_Description_Improved']).tolist())

    # Stack skill embeddings into a matrix
    skill_matrix = np.stack(reference_matrix_df['Skill_Embedding'].values)

    # Get embeddings for all questions
    questions_df['Question_Embedding'] = get_embedding((questions_df['Discipline'] + " - " + questions_df['Question_Text']).tolist())

    # Stack question embeddings into a matrix
    question_matrix = np.stack(questions_df['Question_Embedding'].values)

    # Calculate cosine similarities between all questions and all skills
    similarities_matrix = np.dot(question_matrix, skill_matrix.T)

    # Find the skill (column) with the highest similarity for each question (row)
    top_skill_indices = np.argmax(similarities_matrix, axis=1)

    # Map these indices to skill numbers
    questions_df['Predicted_Skill'] = reference_matrix_df.iloc[top_skill_indices]['Skill_Number'].values

    # Calculate accuracy
    accuracy = np.mean(questions_df['Predicted_Skill'] == questions_df['Skill_Number'])
    print(f'Top-1 Accuracy: {accuracy}')
    return accuracy

accuracy_result = skill_prediction_pipeline(reference_matrix_df, questions_df, get_embedding)

Top-1 Accuracy: 0.20149253731343283


## Distinct distance metrics

In [23]:
from sklearn.metrics import pairwise_distances
pd.set_option('mode.chained_assignment', None)

def predict_skill_with_distance(question_embedding, reference_matrix, metric='cosine'):
    """
    Predict skill using specified distance metric.

    Parameters:
    - question_embedding: The embedding of the question.
    - reference_matrix: The DataFrame containing skill embeddings.
    - metric: The distance metric to use ('cosine', 'euclidean', 'manhattan', etc.)

    Returns:
    - Predicted skill number.
    """

    # Calculate distances between the question and all skills
    distances = pairwise_distances([question_embedding], reference_matrix['Skill_Embedding'].tolist(), metric=metric)

    # Convert distance to similarity for cosine (1 - distance)
    if metric == 'cosine':
        similarities = 1 - distances
    else:
        # For other metrics, lower distance is better
        similarities = -distances

    # Find the skill with the highest similarity (lowest distance)
    top_skill_index = np.argmax(similarities)

    # Return the predicted skill number
    return reference_matrix.iloc[top_skill_index]['Skill_Number']

questions_df.loc[:, 'Predicted_Skill_Cosine'] = questions_df['Question_Embedding'].apply(predict_skill_with_distance, reference_matrix=reference_matrix_df, metric='cosine')
questions_df.loc[:, 'Predicted_Skill_Euclidean'] = questions_df['Question_Embedding'].apply(predict_skill_with_distance, reference_matrix=reference_matrix_df, metric='euclidean')
questions_df.loc[:, 'Predicted_Skill_Manhattan'] = questions_df['Question_Embedding'].apply(predict_skill_with_distance, reference_matrix=reference_matrix_df, metric='manhattan')

# Calculate accuracy for each metric
accuracy_cosine = accuracy_score(questions_df['Skill_Number'], questions_df['Predicted_Skill_Cosine'])
accuracy_euclidean = accuracy_score(questions_df['Skill_Number'], questions_df['Predicted_Skill_Euclidean'])
accuracy_manhattan = accuracy_score(questions_df['Skill_Number'], questions_df['Predicted_Skill_Manhattan'])

print(f'Top-1 Accuracy with Cosine: {accuracy_cosine}')
print(f'Top-1 Accuracy with Euclidean: {accuracy_euclidean}')
print(f'Top-1 Accuracy with Manhattan: {accuracy_manhattan}')

Top-1 Accuracy with Cosine: 0.20149253731343283
Top-1 Accuracy with Euclidean: 0.20149253731343283
Top-1 Accuracy with Manhattan: 0.19402985074626866


TF-IDF

In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import nltk
from nltk.corpus import stopwords

# Download the stopwords from NLTK
nltk.download('stopwords')
stop_words_pt = stopwords.words('portuguese')

# 1. Prepare the data
# Concatenate the question text and discipline
questions_df['Combined_Text'] = questions_df['Discipline'] + ' ' + questions_df['Question_Text']
# Concatenate the skill description and discipline
reference_matrix_df['Combined_Description'] = reference_matrix_df['Discipline'] + ' ' + reference_matrix_df['Skill_Description_Improved']
# Combine the concatenated texts of the questions and the skills into one list
combined_texts_with_discipline = questions_df['Combined_Text'].tolist() + reference_matrix_df['Combined_Description'].tolist()

# 2. Calculate the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_pt)
tfidf_matrix_with_discipline = tfidf_vectorizer.fit_transform(combined_texts_with_discipline)

# 3. Split the TF-IDF matrix into questions and skills
questions_matrix_with_discipline = tfidf_matrix_with_discipline[:len(questions_df)]
skills_matrix_with_discipline = tfidf_matrix_with_discipline[len(questions_df):]

# 4. Calculate the similarity scores between questions and skills
similarity_scores_with_discipline = linear_kernel(questions_matrix_with_discipline, skills_matrix_with_discipline)

# 5. Predict the skill number based on the highest similarity score
predicted_skill_indices_with_discipline = similarity_scores_with_discipline.argmax(axis=1)
predicted_skills_with_discipline = [reference_matrix_df.iloc[index]['Skill_Number'] for index in predicted_skill_indices_with_discipline]

accuracy_with_discipline = (predicted_skills_with_discipline == questions_df['Skill_Number']).mean()

print(f'Top-1 Accuracy with TF-IDF: {accuracy_with_discipline}')

Top-1 Accuracy with TF-IDF: 0.05970149253731343


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Top-k accuracy

In [56]:
def predict_top_k_skills(question_embedding, reference_matrix, k=5, metric='cosine'):
    """
    Predict top k skills using specified distance metric.

    Parameters:
    - question_embedding: The embedding of the question.
    - reference_matrix: The DataFrame containing skill embeddings.
    - k: Number of top skills to retrieve.
    - metric: The distance metric to use ('cosine', 'euclidean', 'manhattan', etc.)

    Returns:
    - List of skill numbers of top k skills.
    """

    # Calculate distances between the question and all skills
    distances = pairwise_distances([question_embedding], reference_matrix['Skill_Embedding'].tolist(), metric=metric)

    # Convert distance to similarity for cosine (1 - distance)
    if metric == 'cosine':
        similarities = 1 - distances
    else:
        # For other metrics, lower distance is better
        similarities = -distances

    # Find the indices of the top k skills with the highest similarity (lowest distance)
    top_k_indices = np.argsort(similarities[0])[-k:][::-1]

    # Get the skill numbers of the top k skills using the indices
    top_k_skill_numbers = reference_matrix.iloc[top_k_indices]['Skill_Number'].values

    return list(top_k_skill_numbers)

def compute_top_k_accuracy(true_labels, predicted_indices):
    """
    Compute the top-k accuracy.

    Parameters:
    - true_labels: Actual labels.
    - predicted_indices: Predicted top k indices for each item.

    Returns:
    - Top-k accuracy.
    """
    hits = 0
    for true, predicted in zip(true_labels, predicted_indices):
        if true in predicted:
            hits += 1

    return hits / len(true_labels)

k = 5
questions_df['Top_K_Skill_Numbers'] = questions_df['Question_Embedding'].apply(predict_top_k_skills, reference_matrix=reference_matrix_df, k=k, metric='cosine')

accuracy_top_k = compute_top_k_accuracy(questions_df['Skill_Number'].tolist(), questions_df['Top_K_Skill_Numbers'].tolist())
print(f'Top-{k} Accuracy: {accuracy_top_k}')

Top-5 Accuracy: 0.6044776119402985


## Translating to English

The files were translated with [Google Translate](https://translate.google.com.br/?sl=pt&tl=en&op=docs).

In [25]:
reference_matrix_df = pd.read_csv("ENEM_Reference_Matrix_English.csv")
questions_df = pd.read_csv("ENEM_tagged_questions_English.csv")

reference_matrix_english_df = reference_matrix_df.dropna()
questions_english_df = questions_df.dropna()

In [26]:
reference_matrix_english_df.head()

Unnamed: 0,Discipline,Skill_Number,Skill_Description,Skill_Description_Improved
0,"Languages, Codes and their Technologies",1,Identify the different languages and their exp...,Identify different languages and their express...
1,"Languages, Codes and their Technologies",2,Use knowledge about the languages of communica...,Use knowledge about the languages of communica...
2,"Languages, Codes and their Technologies",3,Relate information generated in communication ...,Relate information generated in communication ...
3,"Languages, Codes and their Technologies",4,Recognize critical positions to the social us...,Recognize critical positions on the social use...
4,"Languages, Codes and their Technologies",5,Associate words and expressions from a text in...,Associate words and expressions from a text in...


In [27]:
questions_english_df.head()

Unnamed: 0,Year,Discipline,SG_AREA,TX_COR,CO_PROOF,Question_Number,Question_Text,Skill_Number
0,2022,Mathematics and its Technologies,MT,BLUE,1075,Question 136 - Mathematics and its Technologies,A machine in operation has its temperature T m...,25.0
1,2022,Mathematics and its Technologies,MT,BLUE,1075,Question 137 - Mathematics and its Technologies,The World Series is the championship game of N...,28.0
2,2022,Mathematics and its Technologies,MT,BLUE,1075,Question 138 - Mathematics and its Technologies,The graph shows the total income and expenses ...,27.0
3,2022,Mathematics and its Technologies,MT,BLUE,1075,Question 139 - Mathematics and its Technologies,couple is renovating their kitchen at home and...,11.0
4,2022,Mathematics and its Technologies,MT,BLUE,1075,Question 140 - Mathematics and its Technologies,32 teams were invited to a football tournament...,3.0


In [28]:
accuracy_result = skill_prediction_pipeline(reference_matrix_english_df, questions_english_df, get_embedding)

Top-1 Accuracy: 0.26119402985074625


### Top-k accuracy

In [29]:
k = 5
questions_english_df['Top_K_Skill_Numbers'] = questions_english_df['Question_Embedding'].apply(predict_top_k_skills, reference_matrix=reference_matrix_english_df, k=k, metric='cosine')

accuracy_top_k = compute_top_k_accuracy(questions_english_df['Skill_Number'].tolist(), questions_english_df['Top_K_Skill_Numbers'].tolist())
print(f'Top-{k} Accuracy: {accuracy_top_k}')

Top-5 Accuracy: 0.5746268656716418


## Microconcepts

In [6]:
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

### Generate microconcepts

In [57]:
instruction = """
Você é um especialista em criação de currículos e Design Instrucional.

Você consegue, a partir de um currículo original, com habilidades mais amplas, quebrar e sugerir conceitos mais granulares relacionados à habilidade.

Devolva sua resposta já com os conceitos menores sugeridos.

---
Alguns exemplos:
User: ["Reconhecer no contexto social diferentes significados e representações dos números e operações", "Identificar padrões numéricos ou princípios de contagem","Resolver situação-problema envolvendo conhecimentos numéricos","Avaliar a razoabilidade de um resultado numérico na construção de argumentos sobre afirmações quantitativas","Avaliar propostas de intervenção na realidade utilizando conhecimentos numéricos"]
Assistant: {
    "Reconhecer no contexto social diferentes significados e representações dos números e operações": {
        "História dos números": "Entendimento de como os números foram desenvolvidos ao longo da história em diferentes culturas.",
        "Números na cultura": "Como diferentes culturas representam e dão valor aos números.",
        "Números na religião": "Significado e uso dos números em diferentes práticas religiosas.",
        "Comércio e números": "Utilização e representação de números em sistemas de troca, preços e negociações."
    },
    "Identificar padrões numéricos ou princípios de contagem": {
        "Sequências aritméticas": "Reconhecimento e elaboração de sequências com diferença constante.",
        "Sequências geométricas": "Reconhecimento e elaboração de sequências com proporção constante.",
        "Princípios de contagem": "Fundamentos básicos como combinação, permutação e princípio fundamental da contagem.",
        "Reconhecimento de padrões": "Habilidade de visualizar e identificar padrões em conjuntos numéricos."
    },
    "Resolver situação-problema envolvendo conhecimentos numéricos": {
        "Modelagem matemática": "Traduzir problemas do mundo real para linguagem matemática.",
        "Estratégias de resolução": "Identificar e aplicar métodos adequados para resolver um problema.",
        "Aplicação prática": "Uso de conceitos numéricos em cenários cotidianos como orçamento, receitas culinárias, entre outros."
    },
    "Avaliar a razoabilidade de um resultado numérico na construção de argumentos sobre afirmações quantitativas": {
        "Estimativa": "Habilidade de fazer suposições aproximadas para verificar se uma resposta está no domínio correto.",
        "Análise crítica": "Julgamento de resultados para determinar sua validade.",
        "Construção de argumentos": "Formar e defender um argumento baseado em evidências quantitativas."
    },
    "Avaliar propostas de intervenção na realidade utilizando conhecimentos numéricos": {
        "Interpretação de dados": "Capacidade de ler e compreender informações numéricas em diversos formatos, como gráficos e tabelas.",
        "Tomada de decisão baseada em dados": "Uso de informações numéricas para escolher a melhor solução ou ação.",
        "Impacto social": "Entender como os números e dados influenciam e são influenciados pela sociedade.",
        "Planejamento e projeção": "Uso de números para planejar e prever futuros cenários ou intervenções."
    }
}
"""

# Função para obter microconceitos
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def get_microconcepts(skill_list):
    skill_string = json.dumps(skill_list)  # Convert the list to a string representation
    response = openai.ChatCompletion.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": instruction},
            {"role": "user", "content": skill_string}
        ]
    )
    # Parse the response string as JSON
    return json.loads(response.choices[0].message['content'])

# Criar dataframe para salvar os resultados
result_df = pd.DataFrame(columns=['Skill', 'Microconcept_Title', 'Microconcept_Description'])

# Separar habilidades em batches
batch_size = 5  # ou qualquer outro tamanho de batch desejado
batches = [reference_matrix_df['Skill_Description'][i:i+batch_size].tolist() for i in range(0, len(reference_matrix_df), batch_size)]

# Paralelizar as chamadas da API
with ThreadPoolExecutor(max_workers=10) as executor:  # max_workers é o número de threads
    results = list(tqdm(executor.map(get_microconcepts, batches), total=len(batches), desc="Processing batches"))

# Processar os resultados e salvar em um dataframe
result_df = pd.DataFrame(columns=['Skill', 'Microconcept_Title', 'Microconcept_Description'])

for response in results:
    for skill, microconcepts in response.items():
        for title, description in microconcepts.items():
            result_df = pd.concat([result_df, pd.DataFrame([{
                'Skill': skill,
                'Microconcept_Title': title,
                'Microconcept_Description': description
            }])], ignore_index=True)

# Salvar resultados no CSV
result_df.to_csv('ENEM_microconcepts.csv', index=False)


Processing batches: 100%|██████████| 5/5 [01:31<00:00, 18.27s/it]


### Measure accuracy

In [7]:
from sklearn.metrics import jaccard_score

In [14]:
microconcepts_df = pd.read_csv('ENEM_microconcepts.csv')
skills_with_microconcepts_df = pd.merge(reference_matrix_df, microconcepts_df, left_on='Skill_Description', right_on='Skill', how='inner')

skills_with_microconcepts_df.drop('Skill', axis=1, inplace=True)
skills_with_microconcepts_df.head()

Unnamed: 0,Discipline,Skill_Number,Skill_Description,Skill_Description_Improved,Microconcept_Title,Microconcept_Description
0,"Linguagens, Códigos e suas Tecnologias",1,Identificar as diferentes linguagens e seus re...,Identificar as diferentes linguagens e seus re...,Análise de linguagem verbal,Entendimento e identificação dos elementos e r...
1,"Linguagens, Códigos e suas Tecnologias",1,Identificar as diferentes linguagens e seus re...,Identificar as diferentes linguagens e seus re...,Análise de linguagem não verbal,Entendimento e identificação dos elementos e r...
2,"Linguagens, Códigos e suas Tecnologias",1,Identificar as diferentes linguagens e seus re...,Identificar as diferentes linguagens e seus re...,Consciência do papel da linguagem na comunicação,Reconhecimento de como a linguagem é usada par...
3,"Linguagens, Códigos e suas Tecnologias",2,Recorrer aos conhecimentos sobre as linguagens...,Recorrer aos conhecimentos sobre as linguagens...,Aplicação de linguagem e comunicação para a re...,Utilização do entendimento e habilidades de li...
4,"Linguagens, Códigos e suas Tecnologias",2,Recorrer aos conhecimentos sobre as linguagens...,Recorrer aos conhecimentos sobre as linguagens...,Pensamento crítico em mídia,Uso de habilidades de análise para interpretar...


In [53]:
skills_with_microconcepts_df['Microconcept_Embedding'] = get_embedding(
    (skills_with_microconcepts_df['Discipline'] + " - " + skills_with_microconcepts_df['Microconcept_Title'] + " - " + skills_with_microconcepts_df['Microconcept_Description']).tolist()
)

skills_with_microconcepts_df['Skill_Embedding'] = get_embedding(
    (skills_with_microconcepts_df['Discipline'] + " - " + skills_with_microconcepts_df['Skill_Description_Improved']).tolist()
)

questions_df['Question_Embedding'] = get_embedding(
    (questions_df['Discipline'] + " - " + questions_df['Question_Text']).tolist()
)

In [54]:
import numpy as np

def skill_prediction_pipeline(skills_with_microconcepts_df, questions_df, n_microconcepts=5):

    def top_n_microconcepts(target_embeddings, source_embeddings, n):
        # Calculate cosine similarities in a vectorized manner
        similarities = np.dot(target_embeddings, source_embeddings.T)

        # Get the indices of the top n similar items
        top_n_indices = np.argsort(similarities, axis=1)[:, -n:]

        return top_n_indices

    # Extract embeddings as matrix
    skill_embeddings = np.stack(skills_with_microconcepts_df['Skill_Embedding'].values)
    microconcept_embeddings = np.stack(skills_with_microconcepts_df['Microconcept_Embedding'].values)
    question_embeddings = np.stack(questions_df['Question_Embedding'].values)

    # Get top n microconcepts for skills and questions
    skills_top_n_indices = top_n_microconcepts(skill_embeddings, microconcept_embeddings, n_microconcepts)
    questions_top_n_indices = top_n_microconcepts(question_embeddings, microconcept_embeddings, n_microconcepts)

    skills_with_microconcepts_df['Top_N_Microconcepts_for_Skill'] = [skills_with_microconcepts_df.iloc[indices]['Microconcept_Title'].tolist() for indices in skills_top_n_indices]
    questions_df['Top_N_Microconcepts'] = [skills_with_microconcepts_df.iloc[indices]['Microconcept_Title'].tolist() for indices in questions_top_n_indices]

    skill_to_microconcepts = skills_with_microconcepts_df.set_index('Skill_Description')['Top_N_Microconcepts_for_Skill'].to_dict()

    def predict_skill(top_n_microconcepts):
        jaccard_scores = {}

        for skill, microconcepts in skill_to_microconcepts.items():
            intersect = set(top_n_microconcepts) & set(microconcepts)
            union = set(top_n_microconcepts) | set(microconcepts)
            jaccard_scores[skill] = len(intersect) / len(union)

        return max(jaccard_scores, key=jaccard_scores.get)

    questions_df['Predicted_Skill_Description'] = questions_df['Top_N_Microconcepts'].apply(predict_skill)

    # Map skill description to skill number
    skill_description_to_number = skills_with_microconcepts_df.set_index('Skill_Description')['Skill_Number'].to_dict()
    questions_df['Predicted_Skill_Number'] = questions_df['Predicted_Skill_Description'].map(skill_description_to_number)

    accuracy = np.mean(questions_df['Predicted_Skill_Number'] == questions_df['Skill_Number'])
    print(f'Accuracy: {accuracy}')

    return accuracy

accuracy_result = skill_prediction_pipeline(skills_with_microconcepts_df, questions_df, n_microconcepts=5)

Accuracy: 0.1791044776119403


## Using other models

Pode demorar consideravelmente para completar.

In [None]:
!pip install --quiet transformers[sentencepiece]

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m913.8 kB/s[0m eta [36m0:00:02[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.3 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m0.8/1.3 MB[0m [31m6.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m1.2/1.3 MB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# import numpy as np
# import pandas as pd
# from sklearn.metrics import accuracy_score
# from transformers import XLMRobertaModel, XLMRobertaTokenizer
# import torch

# # Initialize tokenizer and model
# tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')
# model = XLMRobertaModel.from_pretrained('xlm-roberta-base').eval()

# def get_embedding_xlmr(texts):
#     """
#     Generate embeddings using XLM-Roberta model.

#     Parameters:
#     - texts: List of text strings to be embedded.

#     Returns:
#     - List of embeddings.
#     """
#     # Tokenize the texts
#     inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     # Use the mean of the last hidden state as embeddings
#     embeddings = outputs.last_hidden_state.mean(dim=1)
#     return embeddings.numpy()  # Convert torch tensor to numpy array

# # Calculate the embeddings
# reference_matrix_df['Skill_Embedding'] = list(get_embedding_xlmr((reference_matrix_df['Discipline'] + " - " + reference_matrix_df['Skill_Description']).tolist()))

# def cosine_similarity(a, b):
#     """
#     Compute the cosine similarity between two vectors.

#     Parameters:
#     - a, b: numpy arrays

#     Returns:
#     - Cosine similarity value.
#     """
#     dot_product = np.dot(a, b)
#     norm_a = np.linalg.norm(a)
#     norm_b = np.linalg.norm(b)
#     return dot_product / (norm_a * norm_b)

# def predict_skill(question_embedding, reference_matrix):
#     """
#     Predict the skill based on cosine similarities between the question embedding and reference matrix embeddings.

#     Parameters:
#     - question_embedding: The embedding of the question.
#     - reference_matrix: DataFrame containing skill embeddings.

#     Returns:
#     - Predicted skill number.
#     """
#     # Calculate cosine similarities between the question and all skills
#     similarities = [cosine_similarity(question_embedding, skill_embedding) for skill_embedding in reference_matrix['Skill_Embedding']]

#     # Find the skill with the highest similarity
#     top_skill_index = np.argmax(similarities)

#     # Return the predicted skill number
#     return reference_matrix.iloc[top_skill_index]['Skill_Number']

# # Get embeddings for all questions
# questions_df['Question_Embedding'] = list(get_embedding_xlmr((questions_df['Discipline'] + " - " + questions_df['Question_Text']).tolist()))

# # Predict the skill for each question
# questions_df['Predicted_Skill'] = questions_df['Question_Embedding'].apply(predict_skill, reference_matrix=reference_matrix_df)

# # Calculate accuracy
# accuracy = accuracy_score(questions_df['Skill_Number'], questions_df['Predicted_Skill'])
# print(f'Top-1 Accuracy: {accuracy}')

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]