In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/meme-dataset/val_with_reason.csv
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/9273.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/9292.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/4353.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/6262.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/2664.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/9110.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/1231.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/1017.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/7530.png
/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Image_dataset/A_train_img/No Hate/85

In [2]:
# Section 1: Install and Import Libraries
!pip install --upgrade datasets transformers
!pip install nltk emoji

import numpy as np
import pandas as pd
import torch
import random
import os
import json
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    pipeline
)
from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import warnings
import zipfile
import re
import emoji
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
warnings.filterwarnings("ignore")
print("Libraries imported successfully!")

# Section 2: Set Random Seed for Reproducibility
def set_seed(seed_value):
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

seed = 123
set_seed(seed)
print(f"Random seed set to {seed} for reproducibility.")

# Section 3: Importing the Datasets
df_train = pd.read_csv('/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Text_dataset/A_train.csv')
df_val = pd.read_csv('/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Text_dataset/A_val.csv')
df_test = pd.read_csv('/kaggle/input/meme-dataset/meme_dataset/meme_dataset/Text_dataset/A_test.csv')

print("Training Dataset:")
print(df_train.head())
print("\nVal Dataset:")
print(df_val.head())
print("\nTest Dataset:")
print(df_test.head())

# Section 4: Checking the Hate and Non-Hate Ratio
class_counts = df_train['label'].value_counts()
print("Class distribution in the training set:")
print(f"No Hate (0): {class_counts.get(0, 0)} samples")
print(f"Hate (1): {class_counts.get(1, 0)} samples")
print(f"Percentage No Hate (0): {(class_counts.get(0, 0) / len(df_train)) * 100:.2f}%")
print(f"Percentage Hate (1): {(class_counts.get(1, 0) / len(df_train)) * 100:.2f}%")

# Section 5: Text Cleaning (No Stopwords Removal, Less Aggressive)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"'re", " are", text)
    text = re.sub(r"'s", " is", text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = emoji.replace_emoji(text, replace='')
    text = re.sub(r'(.)\\1{3,}', r'\\1\\1\\1', text)
    text = ''.join(c for c in text if c.isalnum() or c.isspace())
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_val['text'] = df_val['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

print("\nTraining dataset after cleaning:")
print(df_train.head())
print("\nVal dataset after cleaning:")
print(df_val.head())
print("\nTest dataset after cleaning:")
print(df_test.head())

# Section 6: Tokenization and Fine-Tuning with DistilBERT
def initialize_model(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    model.config.hidden_dropout_prob = 0.2
    model.config.attention_probs_dropout_prob = 0.2
    return tokenizer, model

selected_model = 'distilbert-base-uncased'
tokenizer, model = initialize_model(selected_model)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

train_data, val_data = train_test_split(df_train, test_size=0.2, stratify=df_train['label'], random_state=seed)
dataset_train = Dataset.from_pandas(train_data[['text', 'label']])
dataset_val_internal = Dataset.from_pandas(val_data[['text', 'label']])
tokenized_train = dataset_train.map(tokenize_function, batched=True)
tokenized_val_internal = dataset_val_internal.map(tokenize_function, batched=True)
tokenized_train.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
tokenized_val_internal.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

dataset_test = Dataset.from_pandas(df_test[['text']])
tokenized_test = dataset_test.map(tokenize_function, batched=True)
tokenized_test.set_format('torch', columns=['input_ids', 'attention_mask'])

# Section 7: Train the Model
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds),
        'precision': precision_score(labels, preds),
        'recall': recall_score(labels, preds)
    }

batch_size = 16
total_steps = (len(dataset_train) // batch_size) * 10
warmup_steps = int(0.1 * total_steps)
print(f"Total steps: {total_steps}, Warmup steps: {warmup_steps}")

training_args = TrainingArguments(
    output_dir=f'./results/{selected_model}_seed{seed}',
    report_to="none",
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    learning_rate=1e-5,
    logging_dir=f'./logs/seed{seed}',
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
)

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val_internal,
    compute_metrics=compute_metrics,
)

trainer.train()
eval_results = trainer.evaluate()
print(f"\nEvaluation Results on Internal Validation Set (Seed {seed}, Model: {selected_model}):")
print(eval_results)

# Section 8: Prediction and JSON Format Conversion
predictions = trainer.predict(tokenized_test)
pred_labels = np.argmax(predictions.predictions, axis=1)

test_predictions = [{"index": str(idx), "prediction": int(pred)} for idx, pred in zip(df_test['index'], pred_labels)]
print("\nTest Predictions (First 5):")
for pred in test_predictions[:5]:
    print(pred)

with open('submission.json', 'w') as f:
    for pred in test_predictions:
        f.write(f'{{"index": "{pred["index"]}", "prediction": {pred["prediction"]}}}\n')

print("\nTest predictions saved to 'submission.json'")

with zipfile.ZipFile('ref.zip', 'w', zipfile.ZIP_DEFLATED) as zipf:
    zipf.write('submission.json')

print("\nZip file 'ref.zip' created with submission.json")

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.53.3-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-4.0.0-py3-none-any.whl (494 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m31.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading transformers-4.53.3-py3-none-any.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m116.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.8 MB/s[0m eta [36m0:00:0

2025-07-23 16:06:18.544796: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753286778.714682      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753286778.764061      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Libraries imported successfully!
Random seed set to 123 for reproducibility.
Training Dataset:
      index                                               text  label
0  1001.png  transgirls who grow boobs but keep their cock ...      0
1  1005.png  realistic and wholesome representation of tran...      0
2  1008.png  united lgbt united sponsored you can color you...      0
3  1009.png  neolib politician i'm going to fuck my cat and...      1
4  1010.png  dad i'm gay i love you no matter what i also s...      1

Val Dataset:
      index                                               text  label
0  1003.png  soon available on every women's bathroom say t...      0
1  1024.png  sides gay fries fries but they're g ya know 6 ...      0
2  1045.png  me doesn't want to talk about dating sex with ...      1
3  1046.png  ferb trans rights says aren't you a little you...      0
4  1080.png  bethesda tr ethesda tr modle east bethesda rus...      1

Test Dataset:
      index                         

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3240 [00:00<?, ? examples/s]

Map:   0%|          | 0/810 [00:00<?, ? examples/s]

Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Total steps: 2020, Warmup steps: 202


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6629,0.642939,0.648148,0.691224,0.606464,0.803526
2,0.5617,0.568315,0.701235,0.673854,0.724638,0.629723
3,0.4909,0.562385,0.714815,0.710163,0.7075,0.712846
4,0.3752,0.571751,0.744444,0.741573,0.735149,0.748111
5,0.3205,0.649728,0.735802,0.737101,0.719424,0.755668
6,0.2222,0.743447,0.720988,0.692098,0.753709,0.639798
7,0.2047,0.866572,0.717284,0.712673,0.71,0.715365
8,0.1109,0.934047,0.712346,0.703185,0.71134,0.695214
9,0.1296,1.010333,0.711111,0.710396,0.698297,0.722922
10,0.1006,1.034312,0.706173,0.695652,0.706494,0.685139



Evaluation Results on Internal Validation Set (Seed 123, Model: distilbert-base-uncased):
{'eval_loss': 0.562384843826294, 'eval_accuracy': 0.7148148148148148, 'eval_f1': 0.7101631116687579, 'eval_precision': 0.7075, 'eval_recall': 0.7128463476070529, 'eval_runtime': 1.4806, 'eval_samples_per_second': 547.093, 'eval_steps_per_second': 34.447, 'epoch': 10.0}

Test Predictions (First 5):
{'index': '1002.png', 'prediction': 1}
{'index': '1011.png', 'prediction': 1}
{'index': '1040.png', 'prediction': 0}
{'index': '1055.png', 'prediction': 1}
{'index': '1115.png', 'prediction': 0}

Test predictions saved to 'submission.json'

Zip file 'ref.zip' created with submission.json
