In [None]:
# using python3
import regex as re
import numpy as np
import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import model_selection
from sklearn import svm
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import TruncatedSVD
from sklearn import cluster
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import ensemble

In [None]:
# Training essays come from the DAIGT v2 train CSV; the essays to be scored
# come from the competition's test_essays.csv.
data = pd.read_csv(
    '/kaggle/input/daigt-v2-train-dataset/train_v2_drcat_02.csv'
)
test = pd.read_csv(
    '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'
)

In [None]:
# "RDizzl3 Seven" is a moniker for the (probable) seven prompts used by the
# hosts of the competition; those rows have RDizzl3_seven == True.
#
# The training set is every essay written for the seven prompts, plus up to
# 8000 label == 1 (LLM-written) essays from the *other* prompts. The cap
# widens prompt coverage without letting off-prompt LLM text dominate.
other_prompts = data[data.RDizzl3_seven == False].reset_index(drop=True)

# Sample from the off-prompt subset only. Sampling from the full frame would
# also draw seven-prompt rows, which the concat below would then duplicate.
other_llm = other_prompts[other_prompts["label"] == 1]
# min() avoids a ValueError from sample() when fewer than 8000 rows qualify.
data_ = other_llm.sample(min(8000, len(other_llm)), random_state=8)
print(data_.shape)

data = data[data.RDizzl3_seven == True].reset_index(drop=True)
data = pd.concat([data, data_])
print(data.shape)

# check if the data is imbalanced
data['label'].value_counts()

In [None]:
# Pull the essay texts and labels out of the frames as plain Python lists
# (Series.tolist() already returns a new list).
X = data['text'].tolist()
Y = data['label'].tolist()
test1 = test['text'].tolist()

In [None]:
# trainX is a second name for the same list object as X (no copy is made),
# so any in-place change to trainX's elements is also visible through X.
trainX = X

In [None]:
# Preprocessing: remove line breaks, punctuation marks, and extra spaces.
def normalize(text):
    r"""Return ``text`` with punctuation removed and whitespace collapsed.

    Steps, in order:
      1. The two-character sequences ``\n`` and ``\r`` (a literal backslash
         followed by a letter -- the patterns are raw strings) become a space.
      2. Every character in the Unicode punctuation class ``\p{P}`` becomes a
         space. This class needs the ``regex`` package, which this notebook
         imports under the name ``re``.
      3. Each run of whitespace -- including real newline and carriage-return
         characters -- is collapsed to a single space.
      4. Leading and trailing whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw essay text.

    Returns
    -------
    str
        The cleaned text.
    """
    # Literal backslash sequences -> space
    for escape_seq in (r"\n", r"\r"):
        text = text.replace(escape_seq, r" ")
    # Punctuation -> space
    no_punct = re.sub(r"\p{P}", " ", text)
    # Any whitespace run -> one space
    single_spaced = re.sub(r"\s+", r" ", no_punct)
    return single_spaced.strip()

# Normalize every training essay. Slice assignment updates the existing list
# object in place, so other names bound to the same list see the cleaned text.
trainX[:] = [normalize(doc) for doc in trainX]

# Normalize every test essay and store the result back in test1, the list
# that is vectorized later. Writing to `test[i]` instead would only add a
# column named i to the `test` DataFrame and leave test1 un-normalized.
test1[:] = [normalize(doc) for doc in test1]

In [None]:
# Feature extraction, step 1: bag-of-words term counts.
# English stop words are excluded from the vocabulary.
cntvect = CountVectorizer(stop_words='english')

# fit() returns the vectorizer itself, so learning the vocabulary from the
# training texts and encoding those same texts can be chained.
trainX_bow = cntvect.fit(trainX).transform(trainX)

# The test texts are encoded with the vocabulary learned from training.
testX_bow = cntvect.transform(test1)

In [None]:
# Feature extraction, step 2: re-weight the raw counts with TF-IDF.
# Each row is L1-normalized (norm='l1').
tf_trans = TfidfTransformer(norm='l1', use_idf=True)

# Learn the IDF weights from the training counts only ...
tf_trans.fit(trainX_bow)
# ... then apply the same weights to both the training and the test counts.
trainX_idf = tf_trans.transform(trainX_bow)
test_idf = tf_trans.transform(testX_bow)

In [None]:
# Build a 50-entry codebook with k-means to reduce the feature dimension.
# NOTE(review): the model is fit on the rows of trainX_idf, and each row was
# produced from one training document, so this groups documents rather than
# vocabulary words -- confirm that this is the intended codebook.
km = cluster.KMeans(
    n_clusters=50,
    init="k-means++",
    n_init=5,
    random_state=4487,
)
km.fit(trainX_idf)

In [None]:
def bow_transform(model, X):
    """Encode each item of ``X`` as a histogram over the model's clusters.

    Parameters
    ----------
    model : fitted clustering model
        Must expose ``cluster_centers_`` (one row per cluster) and
        ``predict`` (returns integer cluster labels).
    X : array-like with a ``shape`` attribute
        ``X.shape[0]`` is the number of items. When ``X`` is 2-D (dense or
        sparse) every row is one sample; otherwise ``X[i]`` is passed to
        ``model.predict`` as a group of samples belonging to item ``i``.

    Returns
    -------
    numpy.ndarray of shape (X.shape[0], n_clusters), dtype float
        Row ``i`` counts how many samples of item ``i`` fall into each
        cluster. For 2-D input this is a one-hot row.
    """
    numwords = model.cluster_centers_.shape[0]
    n_items = X.shape[0]
    bows = np.zeros((n_items, numwords))

    # Nothing to encode; also avoids calling predict() on zero samples.
    if n_items == 0:
        return bows

    if getattr(X, "ndim", None) == 2:
        # One sample per row: a single batched predict() replaces one call
        # per row, and each row's histogram is just a one-hot vector.
        labels = np.asarray(model.predict(X)).ravel()
        bows[np.arange(n_items), labels] = 1
        return bows

    # General case: item i is itself a collection of samples.
    for i in range(n_items):
        bows[i, :] = np.bincount(model.predict(X[i]), minlength=numwords)
    return bows

In [None]:
# Re-encode both TF-IDF matrices against the k-means codebook: one row per
# document, one column per cluster.
train_b = bow_transform(model=km, X=trainX_idf)
test_b = bow_transform(model=km, X=test_idf)

In [None]:
# Base models for the VotingClassifier
#
# Model 1: logistic regression.
# LogisticRegressionCV chooses the regularization strength C from 20
# log-spaced candidates between 1e-4 and 1e4 using 5-fold cross-validation.
lr = linear_model.LogisticRegressionCV(
    solver='liblinear',
    class_weight='balanced',
    cv=5,
    Cs=np.logspace(-4, 4, 20),
)
# fit() returns the estimator, so model1 and lr name the same fitted object.
model1 = lr.fit(train_b, Y)

In [None]:
# Model 2: linear classifier trained with SGD on the modified Huber loss.
sgd = SGDClassifier(
    loss="modified_huber",
    max_iter=5000,
    random_state=2,
)
# fit() returns the estimator, so model2 and sgd name the same fitted object.
model2 = sgd.fit(train_b, Y)

In [None]:
# Model 3: Bernoulli naive Bayes.
# The smoothing parameter alpha is chosen by a 5-fold cross-validated grid
# search over the candidates below.
paramgrid = {'alpha': (1e-4, 1e-3, 1e-2, 1e-1, 1)}
nb = model_selection.GridSearchCV(
    estimator=naive_bayes.BernoulliNB(),
    param_grid=paramgrid,
    cv=5,
)
# fit() returns the search object, so model3 and nb name the same object.
model3 = nb.fit(train_b, Y)

In [None]:
# Combine the three base models by soft voting: their predicted class
# probabilities are averaged with weights 0.2 (lr), 0.4 (sgd) and 0.4 (nb).
ensemble_model = VotingClassifier(
    voting='soft',
    estimators=[('lr', model1), ('sgd', model2), ('nb', model3)],
    weights=[0.2, 0.4, 0.4],
)
ensemble_model.fit(train_b, Y)

In [None]:
# Class probabilities for every test row; the second column is kept as the
# score (with 0/1 labels that column belongs to label 1).
test_proba = ensemble_model.predict_proba(test_b)
preds_valid = test_proba[:, 1]

In [None]:
 # Write submission.csv with the test ids and the predicted scores in the
 # 'generated' column; index=False keeps the DataFrame index out of the file.
 pd.DataFrame({'id':test['id'], 'generated':preds_valid}).to_csv('submission.csv', index=False)