# Conjuntos de datos

In [50]:
import pandas as pd

df = pd.read_csv('corpus/authorship_light.csv')

# Clasificador sin ngramas

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import nltk


# Frase y etiqueta
frases = df['Phrase']
etiquetas = df['Author'].copy()

# Preprocesamiento de texto
vectorizador = TfidfVectorizer(max_features=1000)
X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.20, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))


              precision    recall  f1-score   support

           0       0.87      0.62      0.72        21
           1       0.68      0.89      0.77        19

    accuracy                           0.75        40
   macro avg       0.77      0.76      0.75        40
weighted avg       0.78      0.75      0.75        40



# Clasificador ngramas tradicionales

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

frases = df['Phrase']
etiquetas = df['Author'].copy()


# Vectorización con n-gramas tradicionales (bigramas)
vectorizador = TfidfVectorizer(ngram_range=(1, 2))  # Utiliza bigramas
X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.25, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))


              precision    recall  f1-score   support

           0       0.86      0.44      0.59        27
           1       0.58      0.91      0.71        23

    accuracy                           0.66        50
   macro avg       0.72      0.68      0.65        50
weighted avg       0.73      0.66      0.64        50



# Clasificador ngramas sintacticos

In [54]:
import nltk
from nltk import word_tokenize, pos_tag
from nltk.util import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Función para extraer n-gramas sintácticos
def extract_sintactic_ngrams(text):
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    tag_ngrams = ngrams([tag for word, tag in tags], 2)  # bigramas sintácticos
    return ' '.join(['_'.join(pair) for pair in tag_ngrams])

# Vectorización con n-gramas sintácticos
vectorizador = TfidfVectorizer(preprocessor=extract_sintactic_ngrams)

frases = df['Phrase']
etiquetas = df['Author'].copy()

X = vectorizador.fit_transform(frases)

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X, etiquetas, test_size=0.25, random_state=42)

# Entrenar el modelo
modelo = LogisticRegression()
modelo.fit(X_train, y_train)

# Evaluar el modelo
predicciones = modelo.predict(X_test)
print(classification_report(y_test, predicciones))



              precision    recall  f1-score   support

           0       0.53      0.59      0.56        27
           1       0.45      0.39      0.42        23

    accuracy                           0.50        50
   macro avg       0.49      0.49      0.49        50
weighted avg       0.49      0.50      0.50        50

