In [3]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

# Every input CSV lives in the same directory; keeping that directory in one
# constant lets the notebook be pointed at another location with a single edit.
DATA_DIR = '/root/03-S_NLP/DetectAI/00-data'

# Essays to be scored and the submission template that receives the predictions.
test = pd.read_csv(f'{DATA_DIR}/test_essays.csv')
sub = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# Loaded but not referenced by the cells visible in this notebook.
org_train = pd.read_csv(f'{DATA_DIR}/train_essays.csv')
# Training frame used by the cells below.
train = pd.read_csv(f'{DATA_DIR}/train_v2_drcat_02.csv', sep=',')

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Drop training rows whose 'text' repeats an earlier row, then renumber the
# index from 0 so it is contiguous again.
train = train.drop_duplicates(subset=['text']).reset_index(drop=True)

In [6]:
from readability import Readability
import seaborn as sns
import matplotlib.pyplot as plt
# Readability scores
def extract_scores(text):
    """Compute eight readability scores for ``text``.

    The scores are returned in this order:

    - flesch_kincaid: US school grade-level readability (lower is easier).
    - flesch: readability score based on average sentence length and average
      syllables per word (higher is easier).
    - gunning_fog: estimated years of formal education needed to understand
      the text (lower is easier).
    - coleman_liau: approximate US school grade level needed to understand the
      text, taking average character counts per word and sentence into account.
    - dale_chall: readability judged against a list of words familiar to most
      4th-grade students (lower is easier).
    - ari: approximate US school grade level based on characters per word and
      words per sentence, similar to the Flesch-Kincaid grade level.
    - linsear_write: readability based on the number of simple and complex
      words in the text.
    - spache: readability based on a familiar-word list for early readers,
      similar to Dale-Chall.

    Each score is rounded to three decimals and shifted by +1000
    (presumably so every feature stays non-negative for the downstream
    models -- TODO confirm).

    Args:
        text: The essay text to score.

    Returns:
        A list of eight numbers. If scoring fails for any reason, a list of
        eight zeros is returned instead.
    """
    try:
        r = Readability(text)
        metrics = (
            r.flesch_kincaid,
            r.flesch,
            r.gunning_fog,
            r.coleman_liau,
            r.dale_chall,
            r.ari,
            r.linsear_write,
            r.spache,
        )
        return [round(metric().score, 3) + 1000 for metric in metrics]
    except Exception:
        # Best-effort fallback so one unscorable essay does not abort the run.
        # Catching Exception instead of using a bare ``except`` lets
        # KeyboardInterrupt / SystemExit propagate, so a long ``.map`` over
        # many essays can still be interrupted.
        return [0] * 8
# Work on a reproducible random subset of at most 1,000 training rows
# (presumably to keep scoring and fitting fast). Capping the size at len(train)
# avoids the ValueError DataFrame.sample raises when asked for more rows than
# the frame holds.
train = train.sample(min(1_000, len(train)), random_state=1)


In [7]:
# Score every training essay, then spread each per-essay list of scores into
# its own columns, labelled scores_0, scores_1, ...
train['scores'] = train['text'].map(extract_scores)
scores_train = train['scores'].apply(pd.Series).add_prefix('scores_')
gc.collect()

80

In [8]:
# Display the training frame to inspect the newly added 'scores' column.
train

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven,scores
16750,My position on driverless cars is I think they...,0,Driverless cars,persuade_corpus,True,"[1008.197, 1072.834, 1011.825, 1007.246, 1007...."
43217,Cars have been part of our daily lives for dec...,0,Car-free cities,train_essays,True,"[1009.875, 1057.344, 1011.564, 1010.262, 1007...."
25569,"Getting advice from multiple people is great, ...",0,Seeking multiple opinions,persuade_corpus,False,"[1010.775, 1058.194, 1013.894, 1010.076, 1007...."
18847,Getting rid of the electoral college would be ...,0,Does the electoral college work?,persuade_corpus,True,"[1010.597, 1054.07, 1013.142, 1009.896, 1008.8..."
22981,"Throughout my life, I have wondered what it wo...",0,Distance learning,persuade_corpus,False,"[1009.184, 1059.156, 1011.265, 1011.384, 1008...."
...,...,...,...,...,...,...
11363,I don't believe a technology can identify huma...,0,Facial action coding system,persuade_corpus,True,"[1013.344, 1057.511, 1016.793, 1006.182, 1008...."
31992,"Sure, here's my attempt at writing an essay as...",1,Seeking multiple opinions,llama2_chat,False,"[1006.422, 1079.22, 1008.468, 1007.122, 1006.5..."
39935,"Sure, here's my attempt at writing an essay as...",1,Summer projects,llama2_chat,False,"[1006.564, 1078.934, 1009.449, 1006.75, 1006.4..."
3102,The debate on the pros and cons of car usage h...,0,Car-free cities,persuade_corpus,True,"[1007.188, 1064.93, 1009.736, 1008.981, 1009.1..."


In [9]:
# Score every test essay the same way, producing one column per readability
# score, labelled scores_0, scores_1, ...
test['scores'] = test['text'].map(extract_scores)
scores_test = test['scores'].apply(pd.Series).add_prefix('scores_')
gc.collect()

21

In [10]:
# Whether the tokenizer's normalizer should lowercase text.
LOWERCASE = False
# Vocabulary size passed to the BPE trainer.
VOCAB_SIZE = 30522

In [11]:
# Build a byte-level Byte-Pair Encoding (BPE) tokenizer.
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
# NFC normalization is always applied; lowercasing is appended only when
# LOWERCASE is set. The parentheses are required: a conditional expression
# binds more loosely than `+`, so without them the whole list (NFC included)
# collapses to [] whenever LOWERCASE is False.
raw_tokenizer.normalizer = normalizers.Sequence(
    [normalizers.NFC()] + ([normalizers.Lowercase()] if LOWERCASE else [])
)
raw_tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)
# The tokenizer's training corpus is the 'text' column of the test essays.
dataset = Dataset.from_pandas(test[['text']])

def train_corp_iter(batch_size=1000):
    """Yield the ``"text"`` column of ``dataset`` in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch. Defaults to
            1000, the value that was previously hard-coded.

    Yields:
        The ``"text"`` entry of each ``dataset[start:start + batch_size]``
        slice, in order, until the dataset is exhausted.
    """
    for start in range(0, len(dataset), batch_size):
        yield dataset[start : start + batch_size]["text"]

    
# Train the BPE tokenizer on the batches produced by the corpus iterator.
raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)

# Wrap the trained tokenizer in the transformers fast-tokenizer interface and
# tell it which strings act as the special tokens.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)

# Tokenize every essay of each split; tqdm renders one progress bar per split.
tokenized_texts_test = [
    tokenizer.tokenize(essay) for essay in tqdm(test['text'].tolist())
]
tokenized_texts_train = [
    tokenizer.tokenize(essay) for essay in tqdm(train['text'].tolist())
]






100%|██████████| 3/3 [00:00<00:00, 2673.23it/s]
100%|██████████| 1000/1000 [00:01<00:00, 558.38it/s]


In [17]:
# Show the token lists produced for the test essays.
tokenized_texts_test

[['ĠAaa', 'Ġbbb', 'Ġccc', '.'],
 ['ĠBbb', 'Ġccc', 'Ġddd', '.'],
 ['ĠCCC', 'Ġddd', 'Ġeee', '.']]

In [12]:
def dummy(text):
    """Return ``text`` unchanged.

    Passed to TfidfVectorizer as both ``tokenizer`` and ``preprocessor`` so
    that the already-tokenized essays are used as they are.
    """
    return text

# TF-IDF vectorization with scikit-learn.
# The essays are already token lists, so `dummy` is supplied as both tokenizer
# and preprocessor; the same settings are shared by both vectorizers below.
tfidf_kwargs = dict(
    ngram_range=(3, 5),
    lowercase=False,
    sublinear_tf=True,
    analyzer='word',
    tokenizer=dummy,
    preprocessor=dummy,
    token_pattern=None,
    strip_accents='unicode',
)

# First pass: fit on the test tokens only, to collect the n-grams that occur there.
vectorizer = TfidfVectorizer(**tfidf_kwargs)
vectorizer.fit(tokenized_texts_test)

# Getting vocab
vocab = vectorizer.vocabulary_

print(vocab)

# Second pass: a vectorizer restricted to that vocabulary is fitted on the
# training tokens and then applied to both splits.
vectorizer = TfidfVectorizer(vocabulary=vocab, **tfidf_kwargs)

tf_train = vectorizer.fit_transform(tokenized_texts_train)
tf_test = vectorizer.transform(tokenized_texts_test)

del vectorizer
gc.collect()

{'ĠAaa Ġbbb Ġccc': 0, 'Ġbbb Ġccc .': 6, 'ĠAaa Ġbbb Ġccc .': 1, 'ĠBbb Ġccc Ġddd': 2, 'Ġccc Ġddd .': 7, 'ĠBbb Ġccc Ġddd .': 3, 'ĠCCC Ġddd Ġeee': 4, 'Ġddd Ġeee .': 8, 'ĠCCC Ġddd Ġeee .': 5}


23

In [13]:
from scipy.sparse import csr_matrix, hstack

# Place the readability-score columns in front of the TF-IDF columns for both
# splits, so every row carries the two feature groups side by side.
tf_train = hstack([csr_matrix(scores_train.values), tf_train])
tf_test = hstack([csr_matrix(scores_test.values), tf_test])
gc.collect()

21

In [14]:
# Target vector for fitting: the 'label' column of the training frame.
y_train = train['label'].values

In [15]:
# With five or fewer test rows no model is trained and the sample submission is
# written unchanged (presumably this is the small placeholder test file --
# TODO confirm). Otherwise a weighted soft-voting ensemble is fitted and its
# predictions fill the 'generated' column.
if len(test) > 5:
    clf = MultinomialNB(alpha=0.02)
    # random_state pins SGD's data shuffling so repeated runs give the same
    # predictions; 1 matches the seed used when sampling the training rows.
    sgd_model = SGDClassifier(max_iter=8000, tol=1e-4, loss="modified_huber",
                              random_state=1)
    p6 = {'n_iter': 1500, 'verbose': -1, 'objective': 'cross_entropy', 'metric': 'auc',
          'learning_rate': 0.05073909898961407, 'colsample_bytree': 0.726023996436955,
          'colsample_bynode': 0.5803681307354022, 'lambda_l1': 8.562963348932286,
          'lambda_l2': 4.893256185259296, 'min_data_in_leaf': 115, 'max_depth': 23,
          'max_bin': 898}
    lgb = LGBMClassifier(**p6)
    cat = CatBoostClassifier(iterations=1000,
                             verbose=0,
                             l2_leaf_reg=6.6591278779517808,
                             learning_rate=0.005689066836106983,
                             allow_const_label=True, loss_function='CrossEntropy')
    # One weight per estimator, in the order they are listed below.
    weights = [0.07, 0.31, 0.31, 0.31]

    ensemble = VotingClassifier(estimators=[('mnb', clf),
                                            ('sgd', sgd_model),
                                            ('lgb', lgb),
                                            ('cat', cat)],
                                weights=weights, voting='soft', n_jobs=-1)
    ensemble.fit(tf_train, y_train)
    gc.collect()
    # Keep only the second probability column, i.e. the score for label 1.
    final_preds = ensemble.predict_proba(tf_test)[:, 1]
    sub['generated'] = final_preds

# Written in both cases, so the output file always exists.
sub.to_csv('submission.csv', index=False)

In [16]:
# Display the submission frame that was written to submission.csv.
sub

Unnamed: 0,id,generated
0,0000aaaa,0.1
1,1111bbbb,0.9
2,2222cccc,0.4
