In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn import svm
import gensim # take text for clean and tokenize list of words
from gensim.parsing.preprocessing import STOPWORDS
import nltk
nltk.download('stopwords') #language package for english
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the dataset from CSV; the file's first column becomes the row index.
df_complete = pd.read_csv(
    'dataset.csv',
    index_col=0,
)

In [3]:
# Lookup table from the dataset's string class labels to integer codes.
mapping = {
    'Human-Generated-Text': 0,
    'AI-Generated-Text': 1,
}

In [4]:
# Replace the string labels in the 'class' column with their integer codes.
# Values that are not keys of `mapping` (including the 0/1 codes left behind by
# a previous run of this cell) are passed through unchanged, so re-running the
# cell is a no-op instead of turning every label into NaN as a plain
# `.map(mapping)` would.
df_complete['class'] = df_complete['class'].map(lambda label: mapping.get(label, label))

In [5]:
# Store the length of each entry in 'text' (as reported by `.str.len()`) in a
# new 'len_text' column.
text_lengths = df_complete['text'].str.len()
df_complete['len_text'] = text_lengths

In [6]:
# NLTK's English stop words, materialised as a plain Python list.
# NOTE(review): the text-cleaning cell rebinds `stop_words` to a set before
# using it, so this list is superseded there.
stop_words = [*stopwords.words("english")]

In [7]:
import gensim
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Porter stemmer instance used by the cleaning function
ps = PorterStemmer()

# NLTK's English stop words, held as a set
stop_words = {*stopwords.words('english')}

# Clean and normalise a single text
def limpiar_texto(text):
    """Return a cleaned, stemmed, space-joined version of `text`.

    Whitespace runs are collapsed to single spaces and the ends are stripped,
    the result is tokenized with `gensim.utils.simple_preprocess`, tokens found
    in either the gensim STOPWORDS set or the module-level `stop_words` are
    dropped, and each remaining token is stemmed with the module-level `ps`.

    Returns None when `text` is not a `str`; a string with no surviving
    tokens yields an empty string.
    """
    # Anything that is not a string is treated as invalid
    if not isinstance(text, str):
        return None

    # Collapse newlines, tabs and repeated spaces, then trim both ends
    normalized = re.sub(r'\s+', ' ', text).strip()

    gensim_stopwords = gensim.parsing.preprocessing.STOPWORDS
    stems = []
    for token in gensim.utils.simple_preprocess(normalized):
        # Drop the token if either stop-word collection contains it
        if token in gensim_stopwords or token in stop_words:
            continue
        stems.append(ps.stem(token))
    return ' '.join(stems)

# Run the cleaning function over every entry of the 'text' column.
df_complete['text_cleaned'] = df_complete['text'].apply(limpiar_texto)

# Keep only the rows whose cleaned text is neither missing nor empty and
# renumber them from 0. The filter and the index reset are chained so that
# `completed` is a fresh frame, rather than a slice of `df_complete` that is
# subsequently modified in place.
has_clean_text = df_complete['text_cleaned'].notnull() & (df_complete['text_cleaned'] != '')
completed = df_complete.loc[has_clean_text].reset_index(drop=True)



### Train test split

In [26]:
# Model input (cleaned text) and target (encoded class) taken from `completed`
X, y = completed['text_cleaned'], completed['class']
X

0         sekhukhun matseb circa septemb known sekhukhun...
1         mount washington peak white mountain new hamps...
2         acer hillsi extinct mapl speci endem central a...
3         derrick georg sherwin april octob english tele...
4         window shell graphic user interfac microsoft w...
                                ...                        
299995    outserv magazin bi monthli digit print magazin...
299996    eastern armenia arevelyan hayastan eastern par...
299997    infin group privat equiti fund manag compani c...
299998    kattinaker ಕಟ ನಕ call sagadd villag belgaum di...
299999    wei yan die courtesi jiaji imperi offici serv ...
Name: text_cleaned, Length: 300000, dtype: object

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
from scipy import stats

In [15]:
# Display the cleaned-text Series `X` again (same output as the cell that defines it).
X

0         sekhukhun matseb circa septemb known sekhukhun...
1         mount washington peak white mountain new hamps...
2         acer hillsi extinct mapl speci endem central a...
3         derrick georg sherwin april octob english tele...
4         window shell graphic user interfac microsoft w...
                                ...                        
299995    outserv magazin bi monthli digit print magazin...
299996    eastern armenia arevelyan hayastan eastern par...
299997    infin group privat equiti fund manag compani c...
299998    kattinaker ಕಟ ನಕ call sagadd villag belgaum di...
299999    wei yan die courtesi jiaji imperi offici serv ...
Name: text_cleaned, Length: 300000, dtype: object

In [None]:
# MaxAbsScaler is the sparse-compatible stand-in for MinMaxScaler used below.
from sklearn.preprocessing import MaxAbsScaler

# Initialize the vectorizer
vect = CountVectorizer()

# Define models
# NOTE(review): kernel SVC and KNN are known to scale poorly with the number of
# samples; with the 300000-row corpus shown above these two entries may
# dominate the run time. Consider sub-sampling or a linear model if this cell
# does not finish.
models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'XGBoost': XGBClassifier(random_state=42),
    'KNN': KNeighborsClassifier()
}

# Prepare results container (one dict per model/scaling combination)
results = []

# Split the raw documents first and fit the vocabulary on the training part
# only, so that no information from the held-out documents leaks into the
# features. The split depends only on the number of rows and `random_state`,
# so the same rows land in train/test as when splitting the vectorized matrix.
X_train_text, X_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = vect.fit_transform(X_train_text)
X_test = vect.transform(X_test_text)

# Whole corpus expressed in the train-fitted vocabulary. Not used in this
# cell; the name is kept defined in case later cells reference it.
X_text_vectorized = vect.transform(X)


# Iterate through different scenarios
for scaling in ['No Scaling', 'Standardization', 'Normalization']:

    # Scaling (the count matrices are sparse, so only sparse-safe scalers work)
    if scaling == 'Standardization':
        # Centering would densify the matrix and StandardScaler refuses to do
        # it on sparse input, so only scale to unit variance.
        scaler = StandardScaler(with_mean=False)
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    elif scaling == 'Normalization':
        # MinMaxScaler rejects sparse input. For non-negative counts,
        # MaxAbsScaler also maps each feature into [0, 1] and keeps sparsity.
        scaler = MaxAbsScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
    else:
        X_train_scaled = X_train
        X_test_scaled = X_test

    # Train and evaluate models
    for model_name, model in models.items():
        # Skip Naive Bayes if standardization is applied
        if model_name == 'Naive Bayes' and scaling == 'Standardization':
            continue

        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        results.append({
            'Model': model_name,
            'Scaling': scaling,
            'Accuracy': accuracy
        })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print summary
print(results_df)

# Find best performing model (row with the highest accuracy)
best_model = results_df.loc[results_df['Accuracy'].idxmax()]
print("\nBest performing model:")
print(best_model)

# Optional: Save results to CSV
results_df.to_csv('model_comparison_results.csv', index=False)
