In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold


In [2]:
train_dir='data/train'
train_csv='data/train.csv'

labels = pd.read_csv(train_csv)
examples= []
for idx, row in labels.iterrows():
    article_id = f"article_{row['id']:04d}"
    article_path =os.path.join(train_dir, article_id)
    file1_path =os.path.join(article_path,'file_1.txt')
    file2_path =os.path.join(article_path,'file_2.txt')
    with open(file1_path,'r', encoding='utf-8') as f1:
        text1=f1.read()
    with open(file2_path,'r', encoding='utf-8') as f2:
        text2=f2.read()
    examples.append({
        'id': row['id'],
        'text_1':text1,
        'text_2': text2,
        'real_text_id':row['real_text_id']
    })
train_df =pd.DataFrame(examples)
print("Train loaded", train_df.shape)

Train loaded (95, 4)


In [3]:
test_dir ='data/test'
test_ids = sorted([fn for fn in os.listdir(test_dir) if fn.startswith("article_")])
test_examples = []
for article_id in test_ids:
    article_path = os.path.join(test_dir, article_id)
    file1_path = os.path.join(article_path, 'file_1.txt')
    file2_path = os.path.join(article_path, 'file_2.txt')
    with open(file1_path, 'r',encoding='utf-8') as f1:
        text1 = f1.read()
    with open(file2_path, 'r',encoding='utf-8') as f2:
        text2 = f2.read()
    test_examples.append({
        'id': int(article_id.split('_')[1]),
        'text_1': text1,
        'text_2': text2,
    })
test_df =pd.DataFrame(test_examples)
print("Test loaded", test_df.shape)

Test loaded (1068, 3)


In [4]:
all_texts = train_df['text_1'].tolist() + train_df['text_2'].tolist() + test_df['text_1'].tolist() + test_df['text_2'].tolist()
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,3), stop_words='english')
tfidf.fit(all_texts)

def text_pair_feats(df):
    v1 = tfidf.transform(df['text_1'])
    v2 =tfidf.transform(df['text_2'])
    diff = v1 -v2
    concat = np.hstack([v1.toarray(), v2.toarray(), diff.toarray()])
    return concat

X = text_pair_feats(train_df)
y = (train_df['real_text_id'] ==1).astype(int).values
X_test = text_pair_feats(test_df)

In [5]:
oof_lr=np.zeros(len(X))
test_lr=np.zeros(len(X_test))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in skf.split(X, y):
    model =LogisticRegression(max_iter=500, C=2.0)
    model.fit(X[train_idx], y[train_idx])
    oof_lr[val_idx] = model.predict_proba(X[val_idx])[:, 1]
    test_lr += model.predict_proba(X_test)[:, 1] / skf.n_splits

print(f"TFIDF+LR OOF CV: {np.mean((oof_lr > 0.5) == y):.4f}")


TFIDF+LR OOF CV: 0.8842


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

In [10]:
Model_Name="roberta-base"
tokenizer= AutoTokenizer.from_pretrained(Model_Name)

class PairDataset(Dataset):
    def __init__(self,df, tokenizer, is_test=False):
        self.df= df.reset_index(drop=True)
        self.tokenizer=tokenizer
        self.is_test=is_test

    def __len__(self):
        return len(self.df)
    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        pair=self.tokenizer(row['text_1'], row['text_2'], truncation=True, 
                              padding='max_length', max_length=200, return_tensors='pt')
        item ={k:v.squeeze() for k, v in pair.items()}
        if not self.is_test:
            label = 1 if row['real_text_id'] ==1 else 0
            item['labels'] = torch.tensor(label)
        return item

N_SPLITS=5
oof_tf=np.zeros(len(train_df))
test_tf=np.zeros(len(test_df))

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df,y)):
    print(f"Fold {fold+1}")
    train_fold= train_df.iloc[train_idx].reset_index(drop=True)
    val_fold=train_df.iloc[val_idx].reset_index(drop=True)
    train_dataset=PairDataset(train_fold,tokenizer)
    val_dataset=PairDataset(val_fold, tokenizer)
    test_dataset=PairDataset(test_df, tokenizer,is_test=True)

    model =AutoModelForSequenceClassification.from_pretrained(Model_Name, num_labels=2)
    training_args=TrainingArguments(
        output_dir=f'./results_{fold}',
        num_train_epochs=3,
        per_device_train_batch_size=2,
        learning_rate=2e-5,
        save_strategy='no',
        fp16=torch.cuda.is_available(),
    )
    trainer =Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
    )
    trainer.train()
    oof_logits=trainer.predict(val_dataset).predictions
    oof_tf[val_idx] =torch.softmax(torch.tensor(oof_logits), dim=1).numpy()[:,1]

    test_logits =trainer.predict(test_dataset).predictions
    test_tf +=torch.softmax(torch.tensor(test_logits), dim=1).numpy()[:,1] / N_SPLITS

print(f"transformer oof cv: {np.mean((oof_tf>0.5)== y):.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Fold 1


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Fold 2


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Fold 3


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Fold 4


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


Fold 5


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


transformer oof cv: 0.5684


In [None]:
# this make our lr based model is good and we are gonna do submission 