In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datasets import load_dataset
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
import tiktoken

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score, ConfusionMatrixDisplay, confusion_matrix
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier

from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from spacy.lang.en.stop_words import STOP_WORDS as en_stop

import re

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data packages used by this notebook (stop-word lists and
# WordNet). The download order matches the log printed below this cell.
# NOTE(review): word_tokenize is used in later cells and needs NLTK's
# 'punkt' tokenizer data, which is not downloaded here — confirm it is
# already installed in the target environment.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/clovinux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/clovinux/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def lemma_tokenize(doc):
    """Split ``doc`` into tokens and lemmatize each one.

    Args:
        doc: Text handed to NLTK's ``word_tokenize``.

    Returns:
        list: One ``WordNetLemmatizer.lemmatize`` result per token, in the
        order the tokens were produced. ``lemmatize`` is called with its
        default arguments.
    """
    # A fresh lemmatizer is built on every call; bind its method once.
    lemmatize = WordNetLemmatizer().lemmatize
    return list(map(lemmatize, word_tokenize(doc)))

def byte_pair_tokenize(doc):
    """Encode ``doc`` with the tiktoken encoding registered for "gpt-4".

    Args:
        doc: Text to encode.

    Returns:
        list[str]: The token ids produced by the encoder, each converted
        to its decimal string form, in encoding order.
    """
    enc = tiktoken.encoding_for_model("gpt-4")
    # `enc.encode(doc)` raises ValueError by default when the text happens to
    # contain a special-token marker (e.g. "<|endoftext|>"). Arbitrary corpus
    # text should be encoded literally, which is what `encode_ordinary` does
    # (equivalent to `encode(doc, disallowed_special=())`).
    return [str(token_id) for token_id in enc.encode_ordinary(doc)]

In [4]:
# Read the training set from the working directory and display it.
# NOTE(review): the displayed "Unnamed: 0" column looks like an index that was
# written into the CSV — consider index_col=0 once it is confirmed that no
# later cell relies on that column.
df = pd.read_csv(filepath_or_buffer="train.csv")
df

Unnamed: 0,text,label,src
0,White girls very rarely date Asian men. Even i...,1,cmv_human
1,I am a 23 year old male Indian American male. ...,1,cmv_human
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human
3,(A) Work part-time in high school; Then go to ...,1,cmv_human
4,When police introduce a new form of speed prev...,1,cmv_human
...,...,...,...
319066,Noisy Intermediate-Scale Quantum (NISQ) machin...,1,sci_gen_human
319067,Recent years have seen rising needs for locati...,1,sci_gen_human
319068,The ongoing neural revolution in machine trans...,1,sci_gen_human
319069,Let D be a set of n pairwise disjoint unit dis...,1,sci_gen_human


In [5]:
# Tokenize every document once and keep the token lists next to the raw text.
# Applying word_tokenize to the 'text' column directly gives the same Series
# of token lists as a row-wise DataFrame.apply(axis=1), without building a
# Series object for every row of the frame.
df['tokenized_sents'] = df['text'].apply(word_tokenize)

In [6]:
# Display the frame again to check the newly added 'tokenized_sents' column.
df

Unnamed: 0,text,label,src,tokenized_sents
0,White girls very rarely date Asian men. Even i...,1,cmv_human,"[White, girls, very, rarely, date, Asian, men,..."
1,I am a 23 year old male Indian American male. ...,1,cmv_human,"[I, am, a, 23, year, old, male, Indian, Americ..."
2,"Take three people, Persons A, B, and C. They l...",1,cmv_human,"[Take, three, people, ,, Persons, A, ,, B, ,, ..."
3,(A) Work part-time in high school; Then go to ...,1,cmv_human,"[(, A, ), Work, part-time, in, high, school, ;..."
4,When police introduce a new form of speed prev...,1,cmv_human,"[When, police, introduce, a, new, form, of, sp..."
...,...,...,...,...
319066,Noisy Intermediate-Scale Quantum (NISQ) machin...,1,sci_gen_human,"[Noisy, Intermediate-Scale, Quantum, (, NISQ, ..."
319067,Recent years have seen rising needs for locati...,1,sci_gen_human,"[Recent, years, have, seen, rising, needs, for..."
319068,The ongoing neural revolution in machine trans...,1,sci_gen_human,"[The, ongoing, neural, revolution, in, machine..."
319069,Let D be a set of n pairwise disjoint unit dis...,1,sci_gen_human,"[Let, D, be, a, set, of, n, pairwise, disjoint..."


In [8]:
# Collapse each document's token list back into one space-separated string.
all_tokens = df['tokenized_sents'].map(' '.join)

In [9]:
# Flatten the per-document token strings into one corpus-wide token list.
all_tokens_list = [
    token
    for doc_tokens in all_tokens
    for token in doc_tokens.split()
]

# Vocabulary size: the number of distinct token strings in the corpus.
num_unique_tokens = len(set(all_tokens_list))

print("Number of unique tokens:", num_unique_tokens)

Number of unique tokens: 561673
