In [1]:
!pip install transformers
!pip install torch




In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Bidirectional, GRU, Dropout, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import backend as K
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import transformers
import warnings
warnings.filterwarnings('ignore')

# CUSTOM ATTENTION LAYER FOR SEQUENCE MODELING
class AttentionLayer(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.W = self.add_weight(name='attention_weight', 
                                shape=(input_shape[-1], 1),
                                initializer='random_normal',
                                trainable=True)
        self.b = self.add_weight(name='attention_bias',
                                shape=(input_shape[1], 1),
                                initializer='zeros',
                                trainable=True)
        super(AttentionLayer, self).build(input_shape)

    def call(self, x):
        e = K.tanh(K.dot(x, self.W) + self.b)
        a = K.softmax(e, axis=1)
        output = x * a
        return K.sum(output, axis=1)

# TEXT ENCODING FUNCTION FOR BERT TOKENIZATION
def bert_encode(texts, tokenizer, max_len=160):
    all_tokens = []
    all_masks = []
    
    for text in texts:
        text = str(text)
        tokens = tokenizer.tokenize(text)[:max_len-2]
        tokens = ['[CLS]'] + tokens + ['[SEP]']
        
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        attention_mask = [1] * len(input_ids)
        
        pad_len = max_len - len(input_ids)
        input_ids += [0] * pad_len
        attention_mask += [0] * pad_len
        
        all_tokens.append(input_ids[:max_len])
        all_masks.append(attention_mask[:max_len])
    
    return np.array(all_tokens), np.array(all_masks)

# BUILD ADVANCED NEURAL NETWORK MODEL
def build_advanced_model(transformer, max_len=160):
    input_ids = Input(shape=(max_len,), dtype=tf.int32, name='input_ids')
    sequence_output = transformer(input_ids)[0]
    
    gru_output = Bidirectional(GRU(64, return_sequences=True, dropout=0.2, 
                                 recurrent_dropout=0.2))(sequence_output)
    attention_output = AttentionLayer()(gru_output)
    normalized = LayerNormalization()(attention_output)
    
    x = Dense(128, activation='relu')(normalized)
    x = Dropout(0.3)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    x = Dense(32, activation='relu')(x)
    x = Dropout(0.2)(x)
    
    out = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=input_ids, outputs=out)
    optimizer = Adam(learning_rate=2e-5, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
    
    model.compile(optimizer=optimizer, 
                 loss='binary_crossentropy', 
                 metrics=['accuracy', 
                         tf.keras.metrics.Precision(name='precision'),
                         tf.keras.metrics.Recall(name='recall')])
    return model

# LOAD AND PREPARE DATA
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")
submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

# INITIALIZE TOKENIZER AND TRANSFORMER
tokenizer = transformers.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
transformer_layer = transformers.TFDistilBertModel.from_pretrained('distilbert-base-uncased')

# ENCODE TEXT DATA
train_input, train_masks = bert_encode(train.text.values, tokenizer)
test_input, test_masks = bert_encode(test.text.values, tokenizer)
train_labels = train.target.values

# HANDLE CLASS IMBALANCE
class_weights = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}
print(f"CLASS WEIGHTS: {class_weight_dict}")

# SETUP CROSS-VALIDATION
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
test_preds = np.zeros(len(test))
oof_preds = np.zeros(len(train))
oof_labels = np.zeros(len(train))

# DEFINE TRAINING CALLBACKS
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-7, verbose=1)
]

# K-FOLD MODEL TRAINING
for fold, (train_idx, val_idx) in enumerate(skf.split(train_input, train_labels)):
    print(f"\n{'='*50}")
    print(f"FOLD {fold + 1}/{n_splits}")
    print(f"{'='*50}")
    
    tf.keras.backend.clear_session()
    model = build_advanced_model(transformer_layer)
    
    X_train, X_val = train_input[train_idx], train_input[val_idx]
    y_train, y_val = train_labels[train_idx], train_labels[val_idx]
    
    history = model.fit(
        X_train, 
        y_train,
        validation_data=(X_val, y_val),
        epochs=4,
        batch_size=16,
        callbacks=callbacks,
        class_weight=class_weight_dict,
        verbose=1
    )
    
    val_pred = model.predict(X_val, verbose=0).flatten()
    test_pred = model.predict(test_input, verbose=0).flatten()
    
    oof_preds[val_idx] = val_pred
    oof_labels[val_idx] = y_val
    test_preds += test_pred / n_splits
    
    val_pred_binary = (val_pred > 0.5).astype(int)
    fold_f1 = f1_score(y_val, val_pred_binary)
    fold_accuracy = np.mean(y_val == val_pred_binary)
    
    print(f"FOLD {fold + 1} RESULTS:")
    print(f"F1 SCORE: {fold_f1:.4f}")
    print(f"ACCURACY: {fold_accuracy:.4f}")
    print(f"VALIDATION LOSS: {history.history['val_loss'][-1]:.4f}")

# FINAL MODEL EVALUATION
final_preds_binary = (oof_preds > 0.5).astype(int)
final_f1 = f1_score(train_labels, final_preds_binary)
final_accuracy = np.mean(train_labels == final_preds_binary)

print(f"\n{'='*60}")
print("FINAL CROSS-VALIDATION RESULTS")
print(f"{'='*60}")
print(f"OVERALL OOF F1 SCORE: {final_f1:.4f}")
print(f"OVERALL OOF ACCURACY: {final_accuracy:.4f}")
print("\nCLASSIFICATION REPORT:")
print(classification_report(train_labels, final_preds_binary))

# CREATE SUBMISSION FILE
optimal_threshold = 0.5
submission['target'] = (test_preds > optimal_threshold).astype(int)
submission.to_csv('submission.csv', index=False)

print(f"\nSUBMISSION CREATED WITH THRESHOLD {optimal_threshold}")
print(f"POSITIVE PREDICTIONS: {submission['target'].sum()}/{len(submission)}")
print("FILE SAVED AS 'submission.csv'")

Class weights: {0: 0.8766697374481806, 1: 1.1637114032405993}

Fold 1/5
Train on 6089 samples, validate on 1524 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Epoch 00004: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Fold 1 Results:
F1 Score: 0.7800
Accuracy: 0.8327
Validation Loss: 0.4188

Fold 2/5
Train on 6090 samples, validate on 1523 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 00003: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 4/4

Epoch 00004: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-06.
Epoch 00004: early stopping
Fold 2 Results:
F1 Score: 0.8839
Accuracy: 0.9015
Validation Loss: 0.3133

Fold 3/5
Train on 6091 samples, validate on 1522 samples
Epoch 1/4
Epoch 2/4
Epoch 00002: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.
Epoch 3/4

Epoch 00003: ReduceLROnPlateau reducing learning rate to 4.999999873689376e-06.
Epoch 00003: early stopping
Fold 3 Results:
F1 Score: 0.9179
Accuracy: 0.932