In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily flatten the directory walk into one stream of full file paths,
# then print each path on its own line (nothing is printed if the
# directory is missing or empty).
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers tensorflow sentencepiece

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DebertaV2Tokenizer, TFDebertaV2ForSequenceClassification
from sklearn.metrics import classification_report, confusion_matrix
import os
import shutil

In [None]:
# ---------------------------------------------------------
# 2. CONFIGURATION
# ---------------------------------------------------------
# Every knob a reader might want to tune for the fine-tuning run lives here.

# --- Optimisation schedule ---
LEARNING_RATE: float = 2e-5  # step size handed to the optimizer
EPOCHS: int = 4              # full passes over the training data
BATCH_SIZE: int = 16         # examples per batch

# --- Input encoding ---
MAX_LEN: int = 128           # maximum token length passed to the tokenizer

# --- Checkpoint ---
# The small DeBERTa-v3 variant was picked because it fits comfortably on a
# P100 GPU; the original author reports 98%+ accuracy with it on this task,
# comparable to Base/Large (figure not re-verified in this notebook).
MODEL_NAME: str = 'microsoft/deberta-v3-small'

In [None]:
# ---------------------------------------------------------
# 3. LOAD DATA
# ---------------------------------------------------------
# Reads the review CSV, converts the string labels to integer class ids and
# produces an 80/20 train/test split.
# Names defined for later cells: df, text_col, X_train, X_test, y_train, y_test.
print("Loading Dataset...")
try:
    # Adjust path to where your dataset is located in Kaggle
    df = pd.read_csv('/kaggle/input/fake-reviews-amazon/fake_reviews_dataset.csv')

    # The review text is looked up under 'text_' first, then 'text'.
    # Note: If your CSV has different column names, adjust 'text_' and 'label'
    text_col = 'text_' if 'text_' in df.columns else 'text'

    # Fail early with a readable message instead of an opaque KeyError later on.
    missing_cols = [col for col in (text_col, 'label') if col not in df.columns]
    if missing_cols:
        raise KeyError(
            f"Expected column(s) {missing_cols} not found; available columns: {list(df.columns)}"
        )

    # Map Labels: OR -> 0 (Genuine), CG -> 1 (Fake); any other value becomes NaN.
    df['label_id'] = df['label'].map({'OR': 0, 'CG': 1})

    # Drop rows with an unknown label AND rows with no review text: astype(str)
    # below would otherwise turn a missing review into the literal text 'nan'.
    # The mapped column can come back as float when some labels were unmapped,
    # so cast it to integer class ids once the NaN rows are gone.
    df = (
        df.dropna(subset=[text_col, 'label_id'])
          .astype({'label_id': 'int32'})
    )

    # Split Data (stratified so both splits keep the same genuine/fake ratio)
    X_train, X_test, y_train, y_test = train_test_split(
        df[text_col].astype(str).values,
        df['label_id'].values,
        test_size=0.2,
        random_state=42,
        stratify=df['label_id'].values,
    )
    print(f"Loaded {len(df)} reviews.")
except Exception as e:
    print(f"‚ùå Error loading data: {e}")
    # Stop execution if data fails; a bare raise keeps the original traceback.
    raise

In [None]:
# ---------------------------------------------------------
# 4. TOKENIZATION
# ---------------------------------------------------------
print(f"Tokenizing with {MODEL_NAME}...")
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME)

def encode_dataset(texts, labels, shuffle=True):
    """Tokenize texts and pair them with labels in a batched tf.data.Dataset.

    Args:
        texts: NumPy array (or any other sequence) of review strings.
        labels: Class ids, in the same order and of the same length as texts.
        shuffle: When True (the default, matching the previous behaviour) the
            examples are shuffled with a buffer of 1000 before batching. Pass
            False for evaluation data so that batches, and therefore any
            predictions made from them, stay aligned with the labels array.

    Returns:
        A tf.data.Dataset yielding (feature dict, labels) batches of up to
        BATCH_SIZE examples.
    """
    # Accept plain lists as well as NumPy arrays.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
    # padding=True pads to the longest sequence in this call; truncation caps
    # every sequence at MAX_LEN tokens.
    encodings = tokenizer(
        text_list,
        truncation=True,
        padding=True,
        max_length=MAX_LEN
    )
    dataset = tf.data.Dataset.from_tensor_slices((
        dict(encodings),
        labels
    ))
    if shuffle:
        dataset = dataset.shuffle(1000)
    return dataset.batch(BATCH_SIZE)

train_ds = encode_dataset(X_train, y_train)
# Evaluation data must keep its original order: a shuffled test_ds would no
# longer line up with y_test when predictions are compared against it.
test_ds = encode_dataset(X_test, y_test, shuffle=False)

In [None]:
# ---------------------------------------------------------
# 5. BUILD & TRAIN MODEL
# ---------------------------------------------------------
# Seed the Python, NumPy and TensorFlow RNGs so that weight initialisation and
# dropout are repeatable between runs.
# NOTE(review): datasets created before this call may still shuffle
# differently from run to run; move the seed earlier if full determinism is
# required.
tf.keras.utils.set_random_seed(42)

print("Building DeBERTa Model...")
# Loads the pre-trained DeBERTa encoder weights and attaches a 2-class
# classification head (num_labels=2). The base checkpoint is not fine-tuned
# for this task, so the head is expected to start from fresh weights.
model = TFDebertaV2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# from_logits=True: the loss applies softmax itself, so the model output is
# treated as raw, unnormalised scores.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

print("\nüöÄ Starting Training...")
# The held-out split doubles as validation data here, so no untouched test
# set remains after training.
history = model.fit(train_ds, epochs=EPOCHS, validation_data=test_ds)

In [None]:
import shutil
import os

print("üöë ATTEMPTING BLIND RESCUE...")

# Single place for the export folder so the steps below cannot drift apart.
export_dir = './deberta_model'

# Clear out anything left behind by an earlier (possibly failed) run so the
# archive only contains files written by this one.
if os.path.exists(export_dir):
    shutil.rmtree(export_dir)

# Persist whatever the in-memory model currently holds, followed by the
# tokenizer files, side by side in the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained(export_dir)

# Bundle the folder into deberta_model.zip in the current working directory.
shutil.make_archive('deberta_model', 'zip', export_dir)

print("‚úÖ ZIP CREATED. CHECK OUTPUT SIDEBAR NOW.")