# Conjuntos de datos

In [30]:
import pandas as pd

df = pd.read_csv('corpus/authorship_heavy.csv')

# Clasificador sin ngramas

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk


# Suponiendo que 'frases' es tu lista de frases y 'etiquetas' es una lista de 0s y 1s
frases = df['Phrase']
etiquetas = df['Author'].copy()

# Preprocesamiento de texto
vectorizador = TfidfVectorizer(max_features=1000)
X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.20, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))


              precision    recall  f1-score   support

           0       0.81      0.62      0.70        21
           1       0.67      0.84      0.74        19

    accuracy                           0.73        40
   macro avg       0.74      0.73      0.72        40
weighted avg       0.74      0.72      0.72        40



# Clasificador ngramas tradicionales

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

frases = df['Phrase']
etiquetas = df['Author'].copy()


# Vectorización con n-gramas tradicionales (bigramas)
vectorizador = TfidfVectorizer(ngram_range=(1, 2))  # Utiliza bigramas
X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.25, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))


              precision    recall  f1-score   support

           0       0.90      0.33      0.49        27
           1       0.55      0.96      0.70        23

    accuracy                           0.62        50
   macro avg       0.73      0.64      0.59        50
weighted avg       0.74      0.62      0.58        50



# Clasificador ngramas sintacticos

In [33]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Función para extraer n-gramas sintácticos
def extract_sintactic_ngrams(text):
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    tag_ngrams = ngrams([tag for word, tag in tags], 2)  # bigramas sintácticos
    return ' '.join(['_'.join(pair) for pair in tag_ngrams])

# Vectorización con n-gramas sintácticos
vectorizador = TfidfVectorizer(preprocessor=extract_sintactic_ngrams)

frases = df['Phrase']
etiquetas = df['Author'].copy()

X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.25, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))



              precision    recall  f1-score   support

           0       0.64      0.59      0.62        27
           1       0.56      0.61      0.58        23

    accuracy                           0.60        50
   macro avg       0.60      0.60      0.60        50
weighted avg       0.60      0.60      0.60        50

