In [8]:
from keybert import KeyBERT

/kaggle/input/inputs-folder/KeyBERT/KeyBERT




In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from torch.nn.parallel import DataParallel
import pickle
from tqdm.notebook import tqdm
from sentence_transformers import SentenceTransformer
from transformers import DistilBertTokenizerFast


In [31]:
# Instantiate KeyBERT with its default arguments. extract_keywords() reads this
# module-level object, so this cell must run before that function is called.
Kmodel = KeyBERT()

In [14]:
# with open('/kaggle/working/Kmodel.pkl','wb') as f:
#     pickle.dump(Kmodel,f)

In [15]:
def extract_keywords(texts):
    """Reduce every text to a single space-joined string of its KeyBERT keywords.

    Each text is passed to the module-level ``Kmodel`` with stop-word filtering
    disabled and ``top_n`` set to ``int(0.7 * word_count)``, where ``word_count``
    is the number of whitespace-separated tokens in that text.

    Args:
        texts: Iterable of strings. It is wrapped in ``tqdm`` to show progress.

    Returns:
        np.ndarray holding one joined keyword string per input text.
    """
    joined_keywords = []
    for text in tqdm(texts):
        n_keywords = int(0.7 * len(text.split()))
        scored = Kmodel.extract_keywords(text, stop_words=None, top_n=n_keywords)
        # Only the first element of each returned entry (the keyword itself) is kept.
        joined_keywords.append(' '.join(entry[0] for entry in scored))
    return np.array(joined_keywords)



In [16]:
def tokenize_and_prepare_dataloader(X, y, tokenizer, max_length=512, batch_size=32, shuffle=True):
    """Tokenize texts and wrap them, together with their labels, in a DataLoader.

    Args:
        X: Texts to tokenize. Lists, tuples and single strings are passed to the
            tokenizer unchanged; any other iterable (e.g. a NumPy array) is first
            converted to a list.
        y: Labels aligned with ``X``; converted with ``torch.tensor``.
        tokenizer: Callable accepting ``truncation``, ``padding``, ``max_length``
            and ``return_tensors`` and returning a mapping with ``'input_ids'``
            and ``'attention_mask'`` tensors.
        max_length: Maximum sequence length handed to the tokenizer.
        batch_size: Number of examples per batch.
        shuffle: Whether the DataLoader reshuffles the data on every pass.
            Defaults to True (the previous hard-coded behaviour); pass False for
            evaluation loaders where a stable order is wanted.

    Returns:
        DataLoader yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    # Normalise array-like inputs so callers do not have to convert them by hand.
    if not isinstance(X, (str, list, tuple)):
        X = list(X)

    encoded_data = tokenizer(X, truncation=True, padding=True, max_length=max_length, return_tensors='pt')
    dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], torch.tensor(y))
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [17]:
# Cross-entropy loss shared as a module-level global; train_model() applies it
# to the model's logits and the integer class labels.
criterion = torch.nn.CrossEntropyLoss()

In [18]:
def train_model(model, train_dataloader, optimizer, epochs=5):
    """Fine-tune ``model`` on ``train_dataloader`` and print metrics per epoch.

    The model is moved to CUDA when available (CPU otherwise), wrapped in
    ``DataParallel`` and put into training mode. The loss is computed by the
    module-level ``criterion`` on the model's logits.

    Args:
        model: Module whose forward pass takes ``input_ids`` and an
            ``attention_mask`` keyword and returns an object exposing ``logits``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        optimizer: Optimizer used to zero gradients and step after each batch.
        epochs: Number of full passes over ``train_dataloader``.

    Returns:
        None. The mean batch loss and the accuracy are printed after each epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model = DataParallel(model)
    model.train()

    for epoch_idx in range(epochs):
        running_loss = 0
        n_correct = 0

        for batch in tqdm(train_dataloader):
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()

            batch_logits = model(input_ids, attention_mask=attention_mask).logits
            loss = criterion(batch_logits.float(), labels.long())
            running_loss += loss.item()

            # Accuracy bookkeeping is done on detached CPU copies so it does not
            # interfere with the backward pass below.
            predicted = np.argmax(batch_logits.detach().cpu().numpy(), axis=1)
            n_correct += np.sum(predicted == labels.cpu().numpy())

            loss.backward()
            optimizer.step()

        # Loss is averaged over batches, accuracy over individual examples.
        mean_loss = running_loss / len(train_dataloader)
        epoch_accuracy = n_correct / len(train_dataloader.dataset)

        print(f"Epoch {epoch_idx + 1}/{epochs} - Loss: {mean_loss:.4f} - Accuracy: {epoch_accuracy:.4f}")


In [19]:
def evaluate_model(model, test_dataloader):
    """Run ``model`` over ``test_dataloader`` and print the classification accuracy.

    The model is moved to CUDA when available (CPU otherwise), wrapped in
    ``DataParallel`` and put into evaluation mode; gradients are disabled for
    the whole pass.

    Args:
        model: Module whose forward pass takes ``input_ids`` and an
            ``attention_mask`` keyword and returns an object exposing ``logits``.
        test_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        None. The accuracy over all batches is printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model = DataParallel(model)
    model.eval()

    predicted_all, labels_all = [], []

    with torch.no_grad():
        for input_ids, attention_mask, labels in tqdm(test_dataloader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            batch_logits = model(input_ids, attention_mask=attention_mask).logits

            # The predicted class is the index of the largest logit in each row.
            predicted_all.extend(np.argmax(batch_logits.detach().cpu().numpy(), axis=1))
            labels_all.extend(labels.cpu().numpy())

    accuracy = accuracy_score(labels_all, predicted_all)
    print(f"Test Accuracy: {accuracy:.4f}")



In [20]:
# Load the pre-split feature and label arrays.
# Note: the train files come from the `final-inputs` dataset while the test files
# come from `inputs-folder`.
# allow_pickle=True lets np.load unpickle object arrays, which can execute arbitrary
# code; only use it on files from a trusted source.
X_train=np.load('/kaggle/input/final-inputs/X_train.npy',allow_pickle=True)
X_test=np.load('/kaggle/input/inputs-folder/X_test.npy',allow_pickle=True)
y_train=np.load('/kaggle/input/final-inputs/y_train.npy')
y_test=np.load('/kaggle/input/inputs-folder/y_test.npy')

In [21]:
# extract_keywords() already returns a NumPy array of space-joined keyword strings,
# so no additional np.array(...) wrapper is needed (the original line also had an
# unbalanced parenthesis and did not parse).
keywords_train = extract_keywords(X_train)

In [22]:
# Load the fast DistilBERT tokenizer from files stored in a Kaggle input dataset
# (a local directory path is passed rather than a model name).
tokenizer = DistilBertTokenizerFast.from_pretrained('/kaggle/input/inputs-folder/tokenizer_/tokenizer_')

In [23]:
# Convert every training keyword entry to a plain Python str and collect them in a
# list; this list is what gets passed to the tokenizer further down.
p = [str(x) for x in keywords_train]


# p=np.load('/kaggle/input/final-inputs/train_d.npy')

In [25]:
# np.save('/kaggle/working/train_d.npy',np.array(p))

In [26]:
# `encoded_data` is not defined in any cell above, so on a fresh kernel this cell
# raised NameError. Build it here from the training keyword strings `p`.
# NOTE(review): the tokenizer arguments mirror the defaults used by
# tokenize_and_prepare_dataloader(); confirm they match the original run.
encoded_data = tokenizer(p, truncation=True, padding=True, max_length=512, return_tensors='pt')
dataset = TensorDataset(encoded_data['input_ids'], encoded_data['attention_mask'], torch.tensor(y_train))
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [27]:
# Build the training DataLoader through the shared helper, using its defaults
# (max_length=512, batch_size=32, shuffled batches).
train_dataloader = tokenize_and_prepare_dataloader(p, y_train, tokenizer)


In [28]:
# model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
# Load a previously saved model object from disk. torch.load unpickles the file,
# which can execute arbitrary code; only load checkpoints from a trusted source.
model=torch.load('/kaggle/input/last-model/distilbert_model.pt')
# AdamW here is the class imported from `transformers` at the top of the notebook.
# NOTE(review): that implementation is deprecated in favour of torch.optim.AdamW;
# confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=5e-5)

In [23]:
# Train on the loader produced by tokenize_and_prepare_dataloader(), the same helper
# that builds the evaluation loader. The previous argument, `dataloader`, came from a
# cell that depends on an undefined `encoded_data` and so fails on a fresh kernel.
train_model(model, train_dataloader, optimizer, epochs=3)

In [None]:
# Persist the whole model object (not just a state_dict) to the Kaggle working directory.
torch.save(model,'/kaggle/working/distilbert_model.pt')

In [24]:
# Extract keywords for the held-out test texts with the same function used for training.
keywords_test = extract_keywords(X_test)
# keywords_test=np.load('/kaggle/input/keywords-test/keywords_test.npy')

In [25]:
# Convert every test keyword entry to a plain Python str, collected in a list for the tokenizer.
test=[str(x) for x in keywords_test]

In [26]:
# Build the test DataLoader through the shared helper. The helper shuffles batches by
# default; that does not change the accuracy, which is computed over all examples.
test_dataloader = tokenize_and_prepare_dataloader(test, y_test, tokenizer)



In [27]:
# Print the accuracy of the model on the held-out test set.
evaluate_model(model, test_dataloader)

In [28]:
import pandas as pd
# Root folder of the competition data; file names are appended by string concatenation,
# so the trailing slash is required.
DATA_DIR='/kaggle/input/llm-detect-ai-generated-text/'
# Training essays table from the competition dataset.
df_train_essays = pd.read_csv(DATA_DIR + "train_essays.csv")

In [29]:
# Read the competition test essays and pull the 'text' column out as a NumPy array.
df_test_essays = pd.read_csv(DATA_DIR + "test_essays.csv")
final_test_data=np.array(df_test_essays['text'])


In [30]:
# Replace the raw essay texts with their space-joined keyword strings.
# NOTE: this overwrites `final_test_data` in place, so re-running only this cell would
# extract keywords from the previous keyword strings. Re-run the cell that builds
# `final_test_data` from the CSV first.
final_test_data=extract_keywords(final_test_data)

  0%|          | 0/3 [00:00<?, ?it/s]

In [31]:
# Tokenize all competition test texts at once into padded PyTorch tensors.
# Unlike tokenize_and_prepare_dataloader(), no max_length is passed here.
# NOTE(review): truncation therefore relies on the tokenizer's own configured
# maximum length; confirm it matches the 512 used for training.
tokenized_final=tokenizer(final_test_data.tolist(),padding=True,truncation=True,return_tensors='pt')

In [32]:
# Score each competition text one at a time; `out` collects the softmax probability
# of class index 1 for every row of `tokenized_final`.
out = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Wrap under a separate name so the global `model` is not rebound. Rebinding it
# would nest another DataParallel wrapper every time this cell is re-run.
inference_model = DataParallel(model)
inference_model.eval()
with torch.no_grad():
    for i in tqdm(range(tokenized_final['input_ids'].shape[0])):
        input_ids = tokenized_final['input_ids'][i].to(device)
        attention_mask = tokenized_final['attention_mask'][i].to(device)
        # unsqueeze(0) adds the batch dimension for a single example.
        output = inference_model(input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
        probabilities = torch.softmax(output.logits, dim=1)
        out.append(probabilities[:, 1].item())
        # The manual `i += 1` was removed: the for loop reassigns `i` on every
        # iteration, so the increment had no effect.

  0%|          | 0/3 [00:00<?, ?it/s]

In [33]:
# Build the submission table: strip the prompt id, then the essay text, and attach
# the predicted probabilities as the `generated` column.
sub = df_test_essays.drop(columns='prompt_id')
submission = sub.drop(columns='text').assign(generated=out)

In [34]:
# Write the submission without the DataFrame index; otherwise the CSV gains an
# extra unnamed leading column in addition to the submission columns.
submission.to_csv('/kaggle/working/submission.csv', index=False)
submission

Unnamed: 0,id,generated
0,0000aaaa,0.992865
1,1111bbbb,0.996645
2,2222cccc,0.996645
