In [2]:
import pandas as pd
import numpy as np

In [3]:
def preprocess_text(text):
    """Tokenise ``text`` into lower-cased, whitespace-separated words.

    Args:
        text: String to tokenise.

    Returns:
        list[str]: One entry per whitespace-delimited word, lower-cased and
        with any leading/trailing "°" characters removed. A word made only of
        "°" becomes the empty string (it is kept, not dropped).
    """
    cleaned = []
    for raw_word in text.lower().split():
        cleaned.append(raw_word.strip("°"))
    return cleaned


def create_transition_df(tokens_t, tokens):
    """Build a row-normalised word-transition matrix for ``tokens``.

    Args:
        tokens_t: Iterable of words that defines the vocabulary. Its sorted
            unique values become both the row and the column labels.
        tokens: Sequence of words whose consecutive pairs are counted. Every
            word taking part in a pair must also appear in ``tokens_t``.

    Returns:
        pandas.DataFrame: Entry ``[a, b]`` is the number of times ``b``
        directly follows ``a`` in ``tokens`` divided by the total number of
        transitions leaving ``a``. Rows of words with no outgoing transition
        are NaN (0/0); callers that need zeros should ``fillna(0)``.

    Raises:
        KeyError: If a word of a counted pair is missing from ``tokens_t``.
    """
    unique_words = sorted(set(tokens_t))
    position = {word: idx for idx, word in enumerate(unique_words)}
    size = len(unique_words)
    counts = np.zeros((size, size), dtype=np.int64)

    # With fewer than two tokens there is no pair to count (and no lookup).
    if len(tokens) > 1:
        ids = np.array([position[word] for word in tokens], dtype=np.intp)
        # np.add.at accumulates repeated (row, col) pairs; a plain
        # ``counts[rows, cols] += 1`` would count each distinct pair only once.
        np.add.at(counts, (ids[:-1], ids[1:]), 1)

    transition_df = pd.DataFrame(counts, index=unique_words, columns=unique_words)

    # Normalise the transition frequencies so every non-empty row sums to 1.
    return transition_df.div(transition_df.sum(axis=1), axis=0)


In [4]:
def cosine_ang(m1, m2):
    """Cosine of the angle between two matrices under the Frobenius inner product.

    Computes ``<m1, m2> / (||m1|| * ||m2||)`` where ``<A, B> = trace(B^T A)``,
    i.e. the sum of the element-wise products, and ``||A|| = sqrt(<A, A>)``.

    Args:
        m1: pandas.DataFrame of numeric values.
        m2: pandas.DataFrame of numeric values with the same shape as ``m1``.

    Returns:
        float: The cosine similarity. ``0.0`` is returned when either matrix
        has zero norm (the angle is undefined there), instead of NaN.

    Raises:
        ValueError: If the two matrices do not have the same shape.
    """
    a = m1.to_numpy(dtype=float)
    b = m2.to_numpy(dtype=float)

    # Element-wise multiplication would otherwise broadcast silently.
    if a.shape != b.shape:
        raise ValueError(
            "cosine_ang requires matrices of equal shape, got %s and %s"
            % (a.shape, b.shape)
        )

    # trace(B^T A) equals the sum of element-wise products; computing it
    # directly avoids building the full matrix product just to read its diagonal.
    inner = np.sum(a * b)
    norm_a = np.sqrt(np.sum(a * a))
    norm_b = np.sqrt(np.sum(b * b))

    denominator = norm_a * norm_b
    if denominator == 0:
        # An all-zero matrix has no direction; report "no similarity".
        return 0.0

    return float(inner / denominator)


In [5]:
def markov(text1,text2):
    """Similarity of two texts based on their word-transition matrices.

    Both texts are tokenised with ``preprocess_text``; a transition matrix is
    built for each one over the vocabulary of the two texts combined (so both
    matrices share the same labels), NaN entries are replaced by 0, and the
    result of ``cosine_ang`` on the two matrices is returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The value returned by ``cosine_ang`` for the two transition matrices.
    """
    # Tokenise each text
    first_tokens = preprocess_text(text1)
    second_tokens = preprocess_text(text2)

    # Vocabulary shared by both matrices
    shared_vocabulary = first_tokens + second_tokens

    # One transition matrix per text, built in the same order as the inputs
    first_matrix, second_matrix = [
        create_transition_df(shared_vocabulary, sequence).fillna(0)
        for sequence in (first_tokens, second_tokens)
    ]

    return cosine_ang(first_matrix, second_matrix)

In [22]:
# Load the labelled submission pairs; later cells read its
# 'sub1', 'sub2' and 'verdict' columns.
data = pd.read_csv(filepath_or_buffer='nuevo_labels.csv')


In [23]:
def _similarity_for_row(row):
    """Return the markov() similarity of one row's 'sub1' and 'sub2' texts."""
    return markov(row['sub1'], row['sub2'])


# Row-wise similarity score for every pair
data['ResulComp'] = data.apply(_similarity_for_row, axis=1)


In [24]:
def asignar_verdict(level, result):
    """Turn a score into a binary verdict using a strict threshold.

    Args:
        level: Threshold value.
        result: Score to compare against the threshold.

    Returns:
        int: 1 when ``result`` is strictly greater than ``level``, otherwise 0
        (this includes equality and NaN scores).
    """
    return 1 if result > level else 0


In [25]:
# One binary column per threshold 0.0, 0.1, ..., 0.9: 1 where the similarity
# score is strictly above the threshold, 0 otherwise.
for i in range(10):
    level = i/10
    name = f"range>0.{i}"
    data[name] = data['ResulComp'].map(
        lambda score: asignar_verdict(level, score))


In [26]:
# For each threshold column, report the fraction of rows whose thresholded
# prediction equals the labelled verdict.
total_registros = len(data)
for i in range(10):
    name = f"range>0.{i}"
    # Count of rows where label and prediction agree
    coincidencias = data['verdict'].eq(data[name]).sum()
    precision = coincidencias / total_registros
    print("Precisión de", name, "es:", precision)


Precisión de range>0.0 es: 0.27552140504939626
Precisión de range>0.1 es: 0.5642151481888035
Precisión de range>0.2 es: 0.7552140504939627
Precisión de range>0.3 es: 0.8057080131723381
Precisión de range>0.4 es: 0.8254665203073546
Precisión de range>0.5 es: 0.8111964873765093
Precisión de range>0.6 es: 0.8079034028540066
Precisión de range>0.7 es: 0.8024149286498353
Precisión de range>0.8 es: 0.8035126234906695
Precisión de range>0.9 es: 0.8024149286498353


In [27]:
# Display the frame, now including 'ResulComp' and the ten 'range>0.N' columns.
data

Unnamed: 0,sub1,sub2,problem,verdict,ResulComp,range>0.0,range>0.1,range>0.2,range>0.3,range>0.4,range>0.5,range>0.6,range>0.7,range>0.8,range>0.9
0,//package codeforces;\nimport java.io.PrintWri...,import java.util.*;\n// import java.lang.*;\ni...,19,0,0.030789,1,0,0,0,0,0,0,0,0,0
1,import java.util.*;\n\npublic class Soltion{\n...,import java.util.*;\n\npublic class mentor1 {\...,20,0,0.116149,1,1,0,0,0,0,0,0,0,0
2,import java.io.*;\nimport java.util.*;\n\npubl...,import java.io.*;\nimport java.util.*;\n\npubl...,14,1,1.000000,1,1,1,1,1,1,1,1,1,1
3,\nimport java.util.*;\nimport java.lang.*;\nim...,import java.io.OutputStream;\nimport java.io.I...,6,0,0.246946,1,1,1,0,0,0,0,0,0,0
4,import java.math.BigDecimal;\nimport java.math...,import java.util.*;\npublic class Sol\n{\n ...,15,0,0.207027,1,1,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
906,import java.io.BufferedReader;\nimport java.io...,\nimport java.io.BufferedReader;\nimport java....,8,1,0.452296,1,1,1,1,1,0,0,0,0,0
907,import javax.print.attribute.standard.MediaSiz...,\nimport java.util.Scanner;\n\npublic class Me...,4,0,0.134388,1,1,0,0,0,0,0,0,0,0
908,import java.io.*;\nimport java.util.*;\npublic...,import java.io.BufferedReader;\nimport java.io...,17,0,0.139686,1,1,0,0,0,0,0,0,0,0
909,import java.util.*;\nimport java.io.*;\n\n\npu...,import java.io.*;\nimport java.util.*;\n\npubl...,15,1,0.112135,1,1,0,0,0,0,0,0,0,0


In [28]:
# Plain assignment: `datap` is a second name for the same DataFrame object
# as `data`, not a copy.
datap = data


In [29]:
# Agreement between the labelled verdict and the "range>0.9" prediction on
# progressively smaller subsets: for each cut 0.00, 0.01, ..., 0.99 only the
# rows whose similarity score is strictly above the cut are kept.
# The evaluated column is named explicitly instead of relying on whatever
# value `name` was left with by an earlier loop.
name = "range>0.9"
for i in range(100):
    corte = i/100
    # Cuts only increase, so filtering `data` directly selects the same rows
    # as repeatedly narrowing the previous subset, and the cell can be re-run
    # without first resetting `datap`.
    datap = data[data["ResulComp"] > corte]
    total_registros = len(datap)
    coincidencias = (datap['verdict'] == datap[name]).sum()
    # An empty subset has no defined ratio: report NaN rather than divide by 0.
    precision = coincidencias / total_registros if total_registros else float("nan")
    print("coincidencias", coincidencias, "total_registros", total_registros)
    print("Precisión de", name, "es:", precision, "con corte en", corte)


coincidencias 731 total_registros 911
Precisión de range>0.9 es: 0.8024149286498353 con corte en 0.0
coincidencias 727 total_registros 907
Precisión de range>0.9 es: 0.8015435501653804 con corte en 0.01
coincidencias 714 total_registros 891
Precisión de range>0.9 es: 0.8013468013468014 con corte en 0.02
coincidencias 699 total_registros 872
Precisión de range>0.9 es: 0.801605504587156 con corte en 0.03
coincidencias 649 total_registros 816
Precisión de range>0.9 es: 0.7953431372549019 con corte en 0.04
coincidencias 619 total_registros 781
Precisión de range>0.9 es: 0.7925736235595391 con corte en 0.05
coincidencias 582 total_registros 735
Precisión de range>0.9 es: 0.7918367346938775 con corte en 0.06
coincidencias 537 total_registros 681
Precisión de range>0.9 es: 0.788546255506608 con corte en 0.07
coincidencias 492 total_registros 624
Precisión de range>0.9 es: 0.7884615384615384 con corte en 0.08
coincidencias 455 total_registros 582
Precisión de range>0.9 es: 0.781786941580756 co

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Load the dataset ('|'-separated file)
df = pd.read_csv('tu_dataset.csv', sep='|')

# Split the data into features (X: the two texts) and labels (y)
X = df[['sub1', 'sub2']].values.astype(str)
# XGBoost needs numeric labels, and the multi:softmax objective expects integer
# class ids in [0, num_class). factorize maps every distinct verdict (compared
# as text) to such an id; classes[id] gives back the original value.
y, classes = pd.factorize(df['verdict'].astype(str), sort=True)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Bag-of-words features over "sub1 sub2" joined with a space. The join is done
# in plain Python because `+` on NumPy unicode arrays is not supported by
# NumPy versions before 2.0.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform([' '.join(pair) for pair in X_train])
X_test = vectorizer.transform([' '.join(pair) for pair in X_test])

# Wrap the matrices in DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test)

# Model parameters
params = {
    'objective': 'multi:softmax',  # Multiclass classification, predicts the class id
    'num_class': len(classes),  # Number of distinct labels
    'max_depth': 6,  # Maximum tree depth
    'eta': 0.3,  # Learning rate
    'verbosity': 0,  # Silent; replaces the deprecated 'silent' parameter
    'eval_metric': 'merror'  # Evaluation metric
}

# Train the model
num_rounds = 100  # Number of boosting rounds
model = xgb.train(params, dtrain, num_rounds)

# Predict on the test set; multi:softmax returns class ids as floats, so cast
# them to integers to make them comparable with y_test.
predictions = model.predict(dtest).astype(int)

# Fraction of correctly classified test rows
accuracy = accuracy_score(y_test, predictions)
print("Precisión del modelo: %.2f%%" % (accuracy * 100))
