In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from tqdm.notebook import tqdm

model_id = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_id)
tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
def get_log_likelihood(text):
    """
    Calculates the average log-likelihood of a text under a given model.
    Lower score is better (less surprising).
    """
    if not text:
        return 0.0
        
    inputs = tokenizer(text, return_tensors="pt")
    
    input_ids = inputs.input_ids[:, :1024].to(device)
    
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        neg_log_likelihood = outputs.loss
    
    return -neg_log_likelihood.item()


In [5]:
from pathlib import Path
import numpy as np
import pandas as pd

BASE_PATH = Path('../')
DATA_PATH = BASE_PATH / 'data'
TRAIN_DIR = DATA_PATH / 'train'
TEST_DIR = DATA_PATH / 'test'
TRAIN_CSV = DATA_PATH / 'train.csv'

# Load train data
print("Loading training data...")
train_df = pd.read_csv(TRAIN_CSV)

def get_text(file_path):
    """Reads text from a file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return ""

# Load text content into df
tqdm.pandas(desc="Loading Text 1")
train_df['text_1'] = train_df['id'].progress_apply(lambda x: get_text(TRAIN_DIR / f'article_{x:04d}' / 'file_1.txt'))

tqdm.pandas(desc="Loading Text 2")
train_df['text_2'] = train_df['id'].progress_apply(lambda x: get_text(TRAIN_DIR / f'article_{x:04d}' / 'file_2.txt'))

# Create cols for real and fake text
train_df['real_text'] = np.where(train_df['real_text_id'] == 1, train_df['text_1'], train_df['text_2'])
train_df['fake_text'] = np.where(train_df['real_text_id'] == 1, train_df['text_2'], train_df['text_1'])

print("Data Loading complete")
print(f"Train DataFrame shape: {train_df.shape}")
display(train_df.head())

Loading training data...


Loading Text 1:   0%|          | 0/95 [00:00<?, ?it/s]

Loading Text 2:   0%|          | 0/95 [00:00<?, ?it/s]

Data Loading complete
Train DataFrame shape: (95, 6)


Unnamed: 0,id,real_text_id,text_1,text_2,real_text,fake_text
0,0,1,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...,The VIRSA (Visible Infrared Survey Telescope A...,The China relay network has released a signifi...
1,1,2,China\nThe goal of this project involves achie...,The project aims to achieve an accuracy level ...,The project aims to achieve an accuracy level ...,China\nThe goal of this project involves achie...
2,2,1,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...,Scientists can learn about how galaxies form a...,Dinosaur eggshells offer clues about what dino...
3,3,2,China\nThe study suggests that multiple star s...,The importance for understanding how stars evo...,The importance for understanding how stars evo...,China\nThe study suggests that multiple star s...
4,4,2,Dinosaur Rex was excited about his new toy set...,Analyzing how fast stars rotate within a galax...,Analyzing how fast stars rotate within a galax...,Dinosaur Rex was excited about his new toy set...


In [6]:
import textstat
from collections import Counter
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

def create_features(text):
    """Extracts a dictionary of features from a single text"""
    if not isinstance(text, str) or not text:
        return {
            'char_len': 0, 'word_count': 0, 'sent_count': 0,
            'avg_word_len': 0, 'avg_sent_len': 0, 'stopword_ratio': 0,
            'punct_count': 0, 'flesch_score': 0, 'unique_word_ratio': 0
        }
    words = text.split()
    word_count = len(words)
    char_len = len(text)

    if word_count == 0:
        return {
            'char_len': char_len, 'word_count': 0, 'sent_count': 0,
            'avg_word_len': 0, 'avg_sent_len': 0, 'stopword_ratio': 0, 'punct_count': 0, 'flesch_score': 206.835, 'unique_word_ratio': 0
        }
    
    sent_count = textstat.sentence_count(text)

    avg_word_len = sum(len(word) for word in words) / word_count
    avg_sent_len = word_count / sent_count if sent_count > 0 else 0

    stopwords_in_text = [word for word in words if word.lower() in stop_words]
    stopword_ratio = len(stopwords_in_text) / word_count

    punct_count = len(re.findall(r'[!?,.;:\-\(\)\[\]"\']', text))

    unique_word_ratio = len(set(w.lower() for w in words)) / word_count

    # Readability Scores
    flesch_score = textstat.flesch_reading_ease(text)

    features = {
        'char_len': char_len,
        'word_count': word_count,
        'sent_count': sent_count,
        'avg_word_len': avg_word_len,
        'avg_sent_len': avg_sent_len,
        'stopword_ratio': stopword_ratio,
        'punct_count': punct_count,
        'flesch_score': flesch_score,
        'unique_word_ratio': unique_word_ratio,
    }
    return features

print("Creating features for text_1 and text_2...")

# Apply to text_1
feats_1 = train_df['text_1'].progress_apply(create_features)
feats_1_df = pd.json_normalize(feats_1)
feats_1_df.columns = [f'{col}_1' for col in feats_1_df.columns]

# Apply to text_2
feats_2 = train_df['text_2'].progress_apply(create_features)
feats_2_df = pd.json_normalize(feats_2)
feats_2_df.columns = [f'{col}_2' for col in feats_2_df.columns]

# Concatenate features
feature_df = pd.concat([feats_1_df, feats_2_df], axis=1)

# Create Pairwise (Difference and Ratio) Features
print("Creating pairwise features...")
base_features = [col.replace('_1', '') for col in feats_1_df.columns]
for col in base_features:
    epsilon = 1e-6
    feature_df[f'{col}_diff'] = feature_df[f'{col}_1'] - feature_df[f'{col}_2']
    feature_df[f'{col}_ratio'] = feature_df[f'{col}_1'] / (feature_df[f'{col}_2'] + epsilon)

print("Feature engineering complete.")
display(feature_df.head())


Creating features for text_1 and text_2...


Loading Text 2:   0%|          | 0/95 [00:00<?, ?it/s]

Loading Text 2:   0%|          | 0/95 [00:00<?, ?it/s]

Creating pairwise features...
Feature engineering complete.


Unnamed: 0,char_len_1,word_count_1,sent_count_1,avg_word_len_1,avg_sent_len_1,stopword_ratio_1,punct_count_1,flesch_score_1,unique_word_ratio_1,char_len_2,...,avg_sent_len_diff,avg_sent_len_ratio,stopword_ratio_diff,stopword_ratio_ratio,punct_count_diff,punct_count_ratio,flesch_score_diff,flesch_score_ratio,unique_word_ratio_diff,unique_word_ratio_ratio
0,2196,304,9,6.226974,33.777778,0.194079,32,-4.937217,0.802632,2018,...,4.177778,1.141141,-0.076191,0.718089,-1,0.969697,-21.858967,-0.291767,0.005334,1.006689
1,3124,454,9,5.88326,50.444444,0.321586,47,-3.233476,0.698238,936,...,27.611111,2.209246,0.058812,1.223808,24,2.043478,-23.044309,-0.163218,-0.155777,0.817594
2,1139,159,4,6.169811,39.75,0.289308,16,0.219231,0.823899,801,...,-1.916667,0.954,-0.030692,0.904085,7,1.777778,-17.976952,0.012048,-0.048101,0.944838
3,1774,263,8,5.749049,32.875,0.326996,52,23.320625,0.726236,1869,...,-4.553571,0.87834,0.056004,1.206658,35,3.058823,24.651213,-17.526566,0.031579,1.045459
4,195,34,3,4.764706,11.333333,0.5,6,68.431667,0.882353,871,...,-19.416667,0.368564,0.256098,2.049992,-13,0.315789,57.016667,5.994889,0.036824,1.043551


In [7]:
from sklearn.metrics.pairwise import paired_cosine_distances
from sentence_transformers import SentenceTransformer


# Configuration
ST_MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

print("Loading Sentence Transformer model...")
st_model = SentenceTransformer(ST_MODEL_NAME)

print("Encoding texts...")
embeddings1 = st_model.encode(train_df['text_1'].tolist(), show_progress_bar=True)
embeddings2 = st_model.encode(train_df['text_2'].tolist(), show_progress_bar=True)

print(f"Embeddings created. Shape: {embeddings1.shape}")

print("Creating semantic features")

# 1. Cosine Similarity
cosine_similarities = 1 - paired_cosine_distances(embeddings1, embeddings2)

# 2. Element wise Differences
embedding_diffs = embeddings1 - embeddings2

# Combine two a new feature DataFrame
semantic_features_df = pd.DataFrame(embedding_diffs)
semantic_features_df.columns = [f'sem_diff_{i}' for i in range(embedding_diffs.shape[1])]
semantic_features_df['cosine_sim'] = cosine_similarities

print("Semantic Features Created")
display(semantic_features_df.head())

# Create the super feature set
X_super = pd.concat([feature_df, semantic_features_df], axis=1)
y = train_df['real_text_id'] - 1

print(f"Super feature set created. Shape: {X_super.shape}")

Loading Sentence Transformer model...
Encoding texts...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Embeddings created. Shape: (95, 768)
Creating semantic features
Semantic Features Created


Unnamed: 0,sem_diff_0,sem_diff_1,sem_diff_2,sem_diff_3,sem_diff_4,sem_diff_5,sem_diff_6,sem_diff_7,sem_diff_8,sem_diff_9,...,sem_diff_759,sem_diff_760,sem_diff_761,sem_diff_762,sem_diff_763,sem_diff_764,sem_diff_765,sem_diff_766,sem_diff_767,cosine_sim
0,0.019083,0.006649,0.014612,-0.021872,0.013211,0.006887,0.01676,-0.006569,0.021447,0.004465,...,0.033263,0.005071,-0.003286,-0.022461,-0.016347,0.011188,0.02189,0.042394,0.01998,0.55133
1,-0.002131,0.100618,-0.00686,-0.015465,-0.002975,-0.005482,0.043374,0.023411,-0.042388,0.009743,...,-0.008845,-0.001277,-0.013604,0.014051,-0.015411,-0.013543,-0.029538,0.019512,-0.032364,0.610046
2,-0.073394,-0.059585,0.028701,-0.039264,0.008226,0.00281,-0.004064,0.037426,0.115684,0.022992,...,0.032491,-0.043163,0.059642,-0.021499,-0.070247,0.023989,-0.01484,-0.031483,-0.030843,0.247873
3,0.074293,0.040511,-0.004778,-0.016028,0.043253,-0.019031,0.010677,-0.001667,-0.018004,-0.009036,...,-0.05169,-0.010257,0.004975,0.017619,0.041261,-0.029366,-0.006718,-0.010197,-0.063887,0.570451
4,0.098058,0.052397,-0.012488,0.083774,0.019192,0.008235,-0.029036,-0.039514,-0.045138,0.004903,...,0.04376,-0.012135,0.006492,0.00271,0.046947,0.009172,0.004975,0.03546,-0.000592,0.048204


Super feature set created. Shape: (95, 805)


In [8]:
print("Calculating log likelihood for train set...")
tqdm.pandas(desc="Log-Likelihood Text 1")
train_df['ll_score_1'] = train_df['text_1'].progress_apply(get_log_likelihood)

tqdm.pandas(desc="Log Likelihood Text 2")
train_df['ll_score_2'] = train_df['text_2'].progress_apply(get_log_likelihood)

# Create new feature for XGBoost model
ll_features = pd.DataFrame()
ll_features['ll_score_diff'] = train_df['ll_score_1'] - train_df['ll_score_2']
ll_features['ll_score_ratio'] = train_df['ll_score_1'] / (train_df['ll_score_2'] + 1e-6)

X_super_v2 = pd.concat([X_super, ll_features], axis=1)

Calculating log likelihood for train set...


Log-Likelihood Text 1:   0%|          | 0/95 [00:00<?, ?it/s]

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
Token indices sequence length is longer than the specified maximum sequence length for this model (4338 > 1024). Running this sequence through the model will result in indexing errors


Log Likelihood Text 2:   0%|          | 0/95 [00:00<?, ?it/s]

In [9]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import xgboost as xgb

best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.04346,
    'max_depth': 15,
    'subsample': 0.65029,
    'colsample_bytree': 0.8286,
    'min_child_weight': 1,
    'random_state': 42,
    'n_jobs': -1,
    'early_stopping_rounds': 100  
}

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

oof_preds_super = np.zeros(len(X_super))
models_super = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_super, y)):
    print(f"--- Fold {fold+1}/{N_SPLITS} ---")
    
    X_train, y_train = X_super.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_super.iloc[val_idx], y.iloc[val_idx]
    
    model = xgb.XGBClassifier(
        n_estimators=1000,
        **best_xgb_params
    )
    
    model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              verbose=False)
    
    preds = model.predict(X_val)
    oof_preds_super[val_idx] = preds
    models_super.append(model)
    
    acc = accuracy_score(y_val, preds)
    print(f"Fold {fold+1} Pairwise Accuracy: {acc:.5f}")

overall_accuracy_super = accuracy_score(y, oof_preds_super)
print(f"\nOverall CV Pairwise Accuracy with SUPER Features: {overall_accuracy_super:.5f}")

--- Fold 1/5 ---
Fold 1 Pairwise Accuracy: 0.78947
--- Fold 2/5 ---
Fold 2 Pairwise Accuracy: 0.89474
--- Fold 3/5 ---
Fold 3 Pairwise Accuracy: 1.00000
--- Fold 4/5 ---
Fold 4 Pairwise Accuracy: 1.00000
--- Fold 5/5 ---
Fold 5 Pairwise Accuracy: 0.94737

Overall CV Pairwise Accuracy with SUPER Features: 0.92632


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np
import lightgbm as lgb
import xgboost as xgb

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

best_lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'random_state': 42,
    "learning_rate": 0.0442,
    "num_leaves": 35,
    "max_depth": 8,
    "subsample": 0.5160,
    "colsample_bytree": 0.5952,
    "min_child_samples": 28,
    'verbose': -1  
}

oof_preds_lgbm = np.zeros(len(X_super_v2))
for fold, (train_idx, val_idx) in enumerate(skf.split(X_super_v2, y)):
    X_train, y_train = X_super_v2.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_super_v2.iloc[val_idx], y.iloc[val_idx]

    lgb_model = lgb.LGBMClassifier(**best_lgb_params)
    lgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )
    
    oof_preds_lgbm[val_idx] = lgb_model.predict_proba(X_val)[:, 1]


best_xgb_params = {
    'random_state': 42,
    'verbosity': 0,  
    'eval_metric': 'logloss'
}

oof_preds_super_proba = np.zeros(len(X_super))
models_super = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_super, y)):
    X_train, y_train = X_super.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_super.iloc[val_idx], y.iloc[val_idx]

    xgb_model = xgb.XGBClassifier(n_estimators=1000, **best_xgb_params)
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    oof_preds_super_proba[val_idx] = xgb_model.predict_proba(X_val)[:, 1]
    models_super.append(xgb_model)

#  Blending 
best_acc = 0
best_weight = 0
for w in np.arange(0, 1.01, 0.01):
    blended_preds = (w * oof_preds_super_proba + (1-w) * oof_preds_lgbm) > 0.5
    acc = accuracy_score(y, blended_preds)
    if acc > best_acc:
        best_acc = acc
        best_weight = w

print(f"Best blending weight: {best_weight:.2f}")
print(f"Ensemble CV Accuracy: {best_acc:.5f}")

Best blending weight: 0.48
Ensemble CV Accuracy: 0.92632


In [14]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm


In [None]:
MODEL_NAME = "microsoft/deberta-v3-small"
MAX_LENGTH = 512 
BATCH_SIZE = 8
EPOCHS = 10 
LEARNING_RATE = 1e-5 
HEAD_LR = 1e-4 

In [None]:
# Custom PyTorch Dataset
class ImpostorDataset(Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.texts1 = df['text_1'].values
        self.texts2 = df['text_2'].values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        inputs1 = self.tokenizer(
            self.texts1[idx],
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_len
        )
        inputs2 = self.tokenizer(
            self.texts2[idx],
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=self.max_len
        )
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        
        return {
            'input_ids1': inputs1['input_ids'].squeeze(0),
            'attention_mask1': inputs1['attention_mask'].squeeze(0),
            'input_ids2': inputs2['input_ids'].squeeze(0),
            'attention_mask2': inputs2['attention_mask'].squeeze(0),
            'labels': label
        }



In [None]:
# The Siamese Model Architecture
class SiameseDifferenceModel(nn.Module):
    def __init__(self, model_name):
        super(SiameseDifferenceModel, self).__init__()
        # The shared transformer encoder
        self.transformer = AutoModel.from_pretrained(model_name)
        
        # We need the hidden size for the MLP input
        hidden_size = self.transformer.config.hidden_size
        
        # Custom MLP head
        self.head = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size), 
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, 2) 
        )

    def forward(self, input_ids1, attention_mask1, input_ids2, attention_mask2):
        # Get embeddings for each text
        # We take the embedding of the [CLS] token
        out1 = self.transformer(input_ids=input_ids1, attention_mask=attention_mask1).last_hidden_state[:, 0, :]
        out2 = self.transformer(input_ids=input_ids2, attention_mask=attention_mask2).last_hidden_state[:, 0, :]
        
        # Interaction layer
        diff = torch.abs(out1 - out2)
        prod = out1 * out2
        
        combined = torch.cat([diff, prod], dim=1)
        
        logits = self.head(combined)
        return logits


In [None]:
# Main Training and CV Loop
# Reload train_df and add the label column
train_df['label'] = train_df['real_text_id'] - 1

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds_siamese = np.zeros(len(train_df))
y = train_df['label'].values

for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, y)):
    print(f"\n{'='*15} FOLD {fold+1} {'='*15}")
    
    train_data = train_df.iloc[train_idx]
    val_data = train_df.iloc[val_idx]
    
    train_dataset = ImpostorDataset(train_data, tokenizer, MAX_LENGTH)
    val_dataset = ImpostorDataset(val_data, tokenizer, MAX_LENGTH)
    
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SiameseDifferenceModel(MODEL_NAME).to(device)
    
    # Differential learning rates
    optimizer_grouped_parameters = [
        {"params": model.transformer.parameters(), "lr": LEARNING_RATE},
        {"params": model.head.parameters(), "lr": HEAD_LR},
    ]
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * EPOCHS
    )
    criterion = nn.CrossEntropyLoss()
    
    best_val_acc = 0
    
    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            optimizer.zero_grad()
            input_ids1 = batch['input_ids1'].to(device)
            attention_mask1 = batch['attention_mask1'].to(device)
            input_ids2 = batch['input_ids2'].to(device)
            attention_mask2 = batch['attention_mask2'].to(device)
            labels = batch['labels'].to(device)
            
            logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
            loss = criterion(logits, labels)
            total_loss += loss.item()
            
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        # Validation
        model.eval()
        val_preds = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids1 = batch['input_ids1'].to(device)
                attention_mask1 = batch['attention_mask1'].to(device)
                input_ids2 = batch['input_ids2'].to(device)
                attention_mask2 = batch['attention_mask2'].to(device)
                
                logits = model(input_ids1, attention_mask1, input_ids2, attention_mask2)
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                val_preds.extend(preds)
        
        val_acc = accuracy_score(val_data['label'], val_preds)
        print(f"Epoch {epoch+1} - Val Acc: {val_acc:.4f}")
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            oof_preds_siamese[val_idx] = val_preds






Epoch 1:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 - Val Acc: 0.5263


Epoch 2:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2 - Val Acc: 0.4737


Epoch 3:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3 - Val Acc: 0.5263


Epoch 4:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4 - Val Acc: 0.4737


Epoch 5:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5 - Val Acc: 0.4737


Epoch 6:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6 - Val Acc: 0.4737


Epoch 7:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7 - Val Acc: 0.4737


Epoch 8:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8 - Val Acc: 0.4737


Epoch 9:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9 - Val Acc: 0.4737


Epoch 10:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10 - Val Acc: 0.4737



Epoch 1:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 - Val Acc: 0.4737


Epoch 2:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2 - Val Acc: 0.4737


Epoch 3:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3 - Val Acc: 0.4737


Epoch 4:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4 - Val Acc: 0.4211


Epoch 5:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5 - Val Acc: 0.4737


Epoch 6:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6 - Val Acc: 0.4737


Epoch 7:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7 - Val Acc: 0.4737


Epoch 8:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8 - Val Acc: 0.4737


Epoch 9:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9 - Val Acc: 0.4737


Epoch 10:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10 - Val Acc: 0.4737



Epoch 1:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 - Val Acc: 0.5263


Epoch 2:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2 - Val Acc: 0.4737


Epoch 3:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3 - Val Acc: 0.5263


Epoch 4:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4 - Val Acc: 0.5263


Epoch 5:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5 - Val Acc: 0.5263


Epoch 6:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6 - Val Acc: 0.5263


Epoch 7:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7 - Val Acc: 0.5263


Epoch 8:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8 - Val Acc: 0.5263


Epoch 9:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9 - Val Acc: 0.5263


Epoch 10:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10 - Val Acc: 0.5263



Epoch 1:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 - Val Acc: 0.5263


Epoch 2:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2 - Val Acc: 0.5263


Epoch 3:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3 - Val Acc: 0.5263


Epoch 4:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4 - Val Acc: 0.4737


Epoch 5:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5 - Val Acc: 0.6842


Epoch 6:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6 - Val Acc: 0.5263


Epoch 7:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7 - Val Acc: 0.5263


Epoch 8:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8 - Val Acc: 0.4737


Epoch 9:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9 - Val Acc: 0.4737


Epoch 10:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10 - Val Acc: 0.4737



Epoch 1:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1 - Val Acc: 0.5263


Epoch 2:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 2 - Val Acc: 0.4737


Epoch 3:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 3 - Val Acc: 0.5263


Epoch 4:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 4 - Val Acc: 0.4737


Epoch 5:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 5 - Val Acc: 0.4737


Epoch 6:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 6 - Val Acc: 0.5263


Epoch 7:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 7 - Val Acc: 0.6842


Epoch 8:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 8 - Val Acc: 0.4737


Epoch 9:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 9 - Val Acc: 0.4737


Epoch 10:   0%|          | 0/10 [00:00<?, ?it/s]

Epoch 10 - Val Acc: 0.4737


In [None]:
final_cv_acc = accuracy_score(y, oof_preds_siamese)
print(f"\nOverall Siamese Network CV Accuracy: {final_cv_acc:.5f}")


Overall Siamese Network CV Accuracy: 0.57895


In [24]:
# --- 1. LOAD DATA (with added check) ---
print("Step 1: Loading Data...")
# ... (your existing data loading code) ...

# Load Train Data
train_df = pd.read_csv(TRAIN_CSV)
train_df['text_1'] = train_df['id'].apply(lambda x: get_text(TRAIN_DIR / str(x) / 'file_1.txt'))
train_df['text_2'] = train_df['id'].apply(lambda x: get_text(TRAIN_DIR / str(x) / 'file_2.txt'))
train_df = train_df[(train_df['text_1'].str.len() > 0) & (train_df['text_2'].str.len() > 0)].reset_index(drop=True)
y = train_df['real_text_id'] - 1

def generate_super_features(df, st_model):
    # Structural features
    feats_1_df = pd.json_normalize(df['text_1'].apply(create_features))
    feats_2_df = pd.json_normalize(df['text_2'].apply(create_features))
    feats_1_df.columns = [f'{c}_1' for c in feats_1_df.columns]
    feats_2_df.columns = [f'{c}_2' for c in feats_2_df.columns]
    feature_df = pd.concat([feats_1_df, feats_2_df], axis=1)
    base_features = [c.replace('_1', '') for c in feats_1_df.columns]
    for col in base_features:
        feature_df[f'{col}_diff'] = feature_df[f'{col}_1'] - feature_df[f'{col}_2']
        feature_df[f'{col}_ratio'] = feature_df[f'{col}_1'] / (feature_df[f'{col}_2'] + 1e-6)
        
# --- ADD THIS CHECK ---
if train_df.empty:
    raise ValueError("Training DataFrame is empty after filtering. Please check if file paths are correct and text files are being read.")

# ... (rest of your code) ...

print("Step 2: Engineering Features...")
st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
X_super_train = generate_super_features(train_df, st_model) # This will no longer crash

Step 1: Loading Data...


ValueError: Training DataFrame is empty after filtering. Please check if file paths are correct and text files are being read.

# Final Submission

In [None]:
# FINAL SUBMISSION SCRIPT

import pandas as pd
import numpy as np
import xgboost as xgb
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import paired_cosine_distances
from pathlib import Path
from tqdm.notebook import tqdm
import textstat
import re

print("Step 1: Loading Data...")
BASE_PATH = Path('../') 
DATA_PATH = BASE_PATH / 'data'
TRAIN_DIR = DATA_PATH / 'train'
TEST_DIR = DATA_PATH / 'test'
TRAIN_CSV = DATA_PATH / 'train.csv'

def get_text(file_path):
    try:
        with open(file_path, 'r', encoding='utf-8') as f: return f.read().strip()
    except: return ""

train_df = pd.read_csv(TRAIN_CSV)
train_df['text_1'] = train_df['id'].apply(lambda x: get_text(TRAIN_DIR / f'article_{x:04d}' / 'file_1.txt'))
train_df['text_2'] = train_df['id'].apply(lambda x: get_text(TRAIN_DIR / f'article_{x:04d}' / 'file_2.txt'))
train_df = train_df[(train_df['text_1'].str.len() > 0) & (train_df['text_2'].str.len() > 0)].reset_index(drop=True)
y = train_df['real_text_id'] - 1

test_dirs = [d for d in TEST_DIR.iterdir() if d.is_dir()]
test_ids = sorted([int(d.name.split('_')[1]) for d in test_dirs])

test_df = pd.DataFrame({'id': test_ids})
test_df['text_1'] = test_df['id'].apply(lambda x: get_text(TEST_DIR / f'article_{x:04d}' / 'file_1.txt'))
test_df['text_2'] = test_df['id'].apply(lambda x: get_text(TEST_DIR / f'article_{x:04d}' / 'file_2.txt'))

print("Step 2: Engineering Features...")

def create_features(text):
    if not isinstance(text, str) or not text:
        return {'char_len': 0, 'word_count': 0, 'sent_count': 0, 'avg_word_len': 0, 'avg_sent_len': 0, 'flesch_score': 0}
    words = text.split()
    word_count = len(words)
    if word_count == 0:
        return {'char_len': len(text), 'word_count': 0, 'sent_count': 0, 'avg_word_len': 0, 'avg_sent_len': 0, 'flesch_score': 206.835}
    sent_count = textstat.sentence_count(text)
    return {
        'char_len': len(text), 'word_count': word_count, 'sent_count': sent_count,
        'avg_word_len': sum(len(w) for w in words) / word_count,
        'avg_sent_len': word_count / sent_count if sent_count > 0 else 0,
        'flesch_score': textstat.flesch_reading_ease(text),
    }

def generate_super_features(df, st_model):
    # Structural features
    feats_1_df = pd.json_normalize(df['text_1'].apply(create_features))
    feats_2_df = pd.json_normalize(df['text_2'].apply(create_features))
    feats_1_df.columns = [f'{c}_1' for c in feats_1_df.columns]
    feats_2_df.columns = [f'{c}_2' for c in feats_2_df.columns]
    feature_df = pd.concat([feats_1_df, feats_2_df], axis=1)
    base_features = [c.replace('_1', '') for c in feats_1_df.columns]
    for col in base_features:
        feature_df[f'{col}_diff'] = feature_df[f'{col}_1'] - feature_df[f'{col}_2']
        feature_df[f'{col}_ratio'] = feature_df[f'{col}_1'] / (feature_df[f'{col}_2'] + 1e-6)
    
    # Semantic features
    embeddings1 = st_model.encode(df['text_1'].tolist(), show_progress_bar=True)
    embeddings2 = st_model.encode(df['text_2'].tolist(), show_progress_bar=True)
    cosine_similarities = 1 - paired_cosine_distances(embeddings1, embeddings2)
    embedding_diffs = embeddings1 - embeddings2
    semantic_features_df = pd.DataFrame(embedding_diffs, columns=[f'sem_diff_{i}' for i in range(embedding_diffs.shape[1])])
    semantic_features_df['cosine_sim'] = cosine_similarities
    
    return pd.concat([feature_df, semantic_features_df], axis=1)

st_model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
X_super_train = generate_super_features(train_df, st_model)
X_super_test = generate_super_features(test_df, st_model)
X_super_test = X_super_test[X_super_train.columns] # Ensure column order matches

print("Step 3: Training Final Model and Predicting...")
best_xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'learning_rate': 0.04346,
    'max_depth': 15,
    'subsample': 0.65029,
    'colsample_bytree': 0.8286,
    'min_child_weight': 1,
    'random_state': 42,
    'n_jobs': -1,
}
final_model = xgb.XGBClassifier(**best_xgb_params)
final_model.fit(X_super_train, y)
test_predictions = final_model.predict(X_super_test)

# CREATE SUBMISSION FILE
print("Step 4: Creating Submission File...")
submission_df = pd.DataFrame({'id': test_df['id']})
submission_df['real_text_id'] = test_predictions + 1
submission_df.to_csv('submission_v2.csv', index=False)

print("\nProject Complete. 'submission_v2.csv' has been generated.")
display(submission_df.head())

Step 1: Loading Data...
Step 2: Engineering Features...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Batches:   0%|          | 0/34 [00:00<?, ?it/s]

Step 3: Training Final Model and Predicting...
Step 4: Creating Submission File...

Project Complete. 'submission_v2.csv' has been generated.


Unnamed: 0,id,real_text_id
0,0,1
1,1,2
2,2,1
3,3,1
4,4,2
