In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.metrics import accuracy_score
import gensim
from gensim.parsing.preprocessing import STOPWORDS
import nltk
nltk.download('stopwords') #language package for english
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Carga y preprocesamiento de datos

In [2]:
df_complete= pd.read_csv('dataset.csv',index_col=0)

In [3]:
# Crear un diccionario de mapeo
mapping = {'Human-Generated-Text': 0, 'AI-Generated-Text': 1}

In [4]:
#Reemplazar las clases en la columna 'class' con el mapeo

df_complete['class'] = df_complete['class'].map(mapping)

In [5]:
df_complete['len_text'] = df_complete['text'].str.len()

In [19]:
df_sample, _ = train_test_split(df_complete, train_size=50000, stratify=df_complete['class'], random_state=42)

### Limpieza

In [20]:
stop_words = list(stopwords.words("english"))

In [21]:
# Inicializamos PorterStemmer
ps = PorterStemmer()

# Cargamos las stopwords en inglés
stop_words = set(stopwords.words('english'))

# Función para limpiar y procesar el texto
def limpiar_texto(text):
    # Verificar si el texto no es nulo o vacío
    if isinstance(text, str):
        # Eliminar saltos de línea y múltiples espacios
        text = re.sub(r'\s+', ' ', text)  # Reemplaza saltos de línea y tabs por un espacio
        text = text.strip()  # Elimina espacios en blanco iniciales y finales

        # Convertir a palabras en minúsculas y filtrar stopwords
        words = [
            ps.stem(word) for word in gensim.utils.simple_preprocess(text)
            if word not in gensim.parsing.preprocessing.STOPWORDS and word not in stop_words
        ]
        return ' '.join(words)
    else:
        return None  # Devuelve None si el texto es inválido

# Aplicar la función de limpieza al DataFrame
df_sample['text_cleaned'] = df_sample['text'].apply(limpiar_texto)

# Eliminar filas donde el texto limpio es None o vacío
completed = df_sample[df_sample['text_cleaned'].notnull() & (df_sample['text_cleaned'] != '')]

# Reiniciar el índice del DataFrame después de eliminar las filas
completed.reset_index(drop=True, inplace=True) 



### Train test split

In [22]:
# Separate features and target
X = completed['text_cleaned']
y = completed['class']
X

0        king island roxystar record ab known roxi reco...
1        kati carr british singer songwrit musician bor...
2        peter alexand greenlaw quaif born kinn decemb ...
3        mercuri poison type metal poison exposur mercu...
4        parablenniu divers genu combtooth blenni atlan...
                               ...                        
49995    fender super reverb guitar amplifi manufactur ...
49996    gen livia illustri plebeian famili roman repub...
49997    fritz haber decemb januari german chemist rece...
49998    gujarat titan franchis cricket team base ahmed...
49999    aspen knoll estat privat commun staten island ...
Name: text_cleaned, Length: 50000, dtype: object

### Pre cross validation

In [24]:

# Initialize the vectorizer
vect = CountVectorizer()

# Define models
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Prepare results DataFrame
results = []

# Vectorize the text data
X_text_vectorized = vect.fit_transform(X)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_text_vectorized, y, test_size=0.2, random_state=42)

# Iterate through different scaling scenarios
for scaling in ['No Scaling', 'Standardization', 'Normalization']:

    # Scaling
    if scaling == 'Standardization':
        scaler = StandardScaler(with_mean=False)  # StandardScaler works with sparse input but set with_mean=False
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    elif scaling == 'Normalization':
        scaler = MaxAbsScaler()  # MaxAbsScaler is compatible with sparse data
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test

    # Train and evaluate models
    for model_name, model in models.items():
        # Skip Naive Bayes if Standardization is applied
        if model_name == 'Naive Bayes' and scaling == 'Standardization':
            continue

        # Train model
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        # Store results
        results.append({
            'Model': model_name,
            'Scaling': scaling,
            'Accuracy': accuracy
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print summary of results
print(results_df)

# Find best performing model
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest performing model:")
print(best_model)

# Optional: Save results to CSV
results_df.to_csv('model_comparison_results.csv', index=False)


                  Model          Scaling  Accuracy
0           Naive Bayes       No Scaling    0.7095
1   Logistic Regression       No Scaling    0.8926
2         Decision Tree       No Scaling    0.7607
3         Random Forest       No Scaling    0.8499
4                   SVM       No Scaling    0.9045
5               XGBoost       No Scaling    0.8881
6                   KNN       No Scaling    0.5177
7   Logistic Regression  Standardization    0.7651
8         Decision Tree  Standardization    0.7606
9         Random Forest  Standardization    0.8496
10                  SVM  Standardization    0.7278
11              XGBoost  Standardization    0.8881
12                  KNN  Standardization    0.5074
13          Naive Bayes    Normalization    0.6640
14  Logistic Regression    Normalization    0.8979
15        Decision Tree    Normalization    0.7604
16        Random Forest    Normalization    0.8495
17                  SVM    Normalization    0.7476
18              XGBoost    Norm

Finalmente me quedo con el modelo X-Gboost, dado que los resutlados son bastantes similares a SVM