CNN Model

1. A CNN-based model for binary text classification that uses the train.csv, test.csv, and dev.csv datasets. The code preprocesses the data, tokenizes the text, builds a CNN model using TensorFlow/Keras, trains the model, and generates predictions on the test set.

This script:

Loads the datasets from CSV files.

Tokenizes and pads the text data.

Builds a CNN model for text classification.

Trains the model using the training data.

Predicts labels for the test dataset and saves them.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Binary text classification with a 1-D CNN.
# Reads the train/dev/test CSVs, trains on train while validating on dev,
# writes thresholded test predictions to CSV, and prints a dev-set report.
# ---------------------------------------------------------------------------

# Load datasets
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
dev_df = pd.read_csv('/content/dev.csv')

# Extract text and labels (text is cast to str so every entry is a string)
train_texts = train_df['text'].astype(str).tolist()
train_labels = train_df['label'].values

dev_texts = dev_df['text'].astype(str).tolist()
dev_labels = dev_df['label'].values

test_texts = test_df['text'].astype(str).tolist()
test_ids = test_df['id']  # Extract test IDs

# Tokenization: the vocabulary is fitted on the training texts only
max_words = 10000  # Vocabulary size
max_len = 100  # Max length of sequences
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert text to fixed-length integer sequences
X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev_texts), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

y_train = np.array(train_labels)
y_dev = np.array(dev_labels)

# Build CNN Model
embedding_dim = 100
model = Sequential([
    # `input_length` is intentionally omitted: it is deprecated in Keras 3,
    # and the sequence length is taken from the input data instead.
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])

# Compile model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
# NOTE(review): the recorded run shows val_loss rising after epoch 2 while
# training accuracy approaches 1.0 -- consider early stopping or fewer epochs.
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=5, batch_size=32)

# Predict on test data
test_predictions = model.predict(X_test)
test_predictions = (test_predictions > 0.5).astype(int)  # Convert probabilities to binary labels

# Save predictions (only 'id' and 'predicted_label')
output_df = pd.DataFrame({'id': test_ids, 'predicted_label': test_predictions.flatten()})
output_df.to_csv('/content/test_predictions_01.csv', index=False)
print("Predictions saved to test_predictions_01.csv")

# Classification report on dev set (same 0.5 threshold as the test predictions)
y_dev_pred = (model.predict(X_dev) > 0.5).astype(int)
print("Classification Report on Dev Set:")
print(classification_report(y_dev, y_dev_pred))


Epoch 1/5




[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 50ms/step - accuracy: 0.6024 - loss: 0.6685 - val_accuracy: 0.6760 - val_loss: 0.6036
Epoch 2/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 45ms/step - accuracy: 0.8179 - loss: 0.4380 - val_accuracy: 0.7776 - val_loss: 0.4762
Epoch 3/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 56ms/step - accuracy: 0.9570 - loss: 0.1358 - val_accuracy: 0.7980 - val_loss: 0.5991
Epoch 4/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 67ms/step - accuracy: 0.9890 - loss: 0.0407 - val_accuracy: 0.7954 - val_loss: 0.6713
Epoch 5/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 40ms/step - accuracy: 0.9928 - loss: 0.0242 - val_accuracy: 0.7865 - val_loss: 0.7091
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
Predictions saved to test_predictions_01.csv
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
Clas

2. Updated code that supports multi-class classification and adds a classification report for evaluation.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# ---------------------------------------------------------------------------
# Multi-class text classification with a 1-D CNN (integer-encoded labels,
# sparse categorical cross-entropy). Trains on train, validates on dev,
# writes test predictions to CSV, and prints a dev-set report.
# ---------------------------------------------------------------------------

# Load datasets
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
dev_df = pd.read_csv('/content/dev.csv')

# Assuming the dataset has 'text' and 'label' columns
train_texts = train_df['text'].astype(str).tolist()
train_labels = train_df['label'].astype(str).tolist()
test_texts = test_df['text'].astype(str).tolist()
dev_texts = dev_df['text'].astype(str).tolist()
dev_labels = dev_df['label'].astype(str).tolist()

# Taken from this cell's own test_df so the cell does not depend on a
# variable left behind by an earlier cell.
test_ids = test_df['id']

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_dev = label_encoder.transform(dev_labels)
num_classes = len(label_encoder.classes_)

# Tokenization: the vocabulary is fitted on the training texts only
max_words = 10000  # Vocabulary size
max_len = 100  # Max length of sequences
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len)
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev_texts), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len)

# Build CNN Model
embedding_dim = 100
model = Sequential([
    # `input_length` is intentionally omitted: it is deprecated in Keras 3,
    # and the sequence length is taken from the input data instead.
    Embedding(input_dim=max_words, output_dim=embedding_dim),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Multi-class classification
])

# Compile model (sparse loss because y_* hold integer class ids, not one-hot rows)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=5, batch_size=32)

# Predict on test data
test_predictions = model.predict(X_test)
test_pred_labels = np.argmax(test_predictions, axis=1)  # Convert probabilities to class labels

# Save predictions (class ids are mapped back to the original label strings)
output_df = pd.DataFrame({'id': test_ids, 'predicted_label': label_encoder.inverse_transform(test_pred_labels)})
output_df.to_csv('/content/test_predictions_02.csv', index=False)
print("Predictions saved to test_predictions_02.csv")

# Classification report
y_dev_pred = np.argmax(model.predict(X_dev), axis=1)
print("Classification Report:")
print(classification_report(y_dev, y_dev_pred, target_names=label_encoder.classes_))


Epoch 1/5




[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 52ms/step - accuracy: 0.6225 - loss: 0.6600 - val_accuracy: 0.7078 - val_loss: 0.5837
Epoch 2/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 46ms/step - accuracy: 0.8320 - loss: 0.4029 - val_accuracy: 0.7967 - val_loss: 0.4531
Epoch 3/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 45ms/step - accuracy: 0.9715 - loss: 0.1059 - val_accuracy: 0.7992 - val_loss: 0.5632
Epoch 4/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 51ms/step - accuracy: 0.9907 - loss: 0.0389 - val_accuracy: 0.8030 - val_loss: 0.6478
Epoch 5/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 50ms/step - accuracy: 0.9924 - loss: 0.0238 - val_accuracy: 0.7992 - val_loss: 0.7054
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
Predictions saved to test_predictions.csv
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
Classifi

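3. A variant of the CNN model for multi-class text classification that uses one-hot encoded labels with categorical cross-entropy and post-padding/truncation, followed by model training and a classification report. (Note: the cell below still uses the Keras CNN and tokenizer, not BERT; the BERT model appears in the last cell of this notebook.)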

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.utils import to_categorical

# ---------------------------------------------------------------------------
# Multi-class text classification with a 1-D CNN (one-hot labels,
# categorical cross-entropy, post-padding/truncation). Trains on train,
# validates on dev, writes test predictions to CSV, prints a dev-set report.
# ---------------------------------------------------------------------------

# Load datasets
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')
dev_df = pd.read_csv('/content/dev.csv')

# Assuming dataset has 'text' and 'label' columns
train_texts = train_df['text'].astype(str).tolist()
train_labels = train_df['label'].astype(str).tolist()
test_texts = test_df['text'].astype(str).tolist()
dev_texts = dev_df['text'].astype(str).tolist()
dev_labels = dev_df['label'].astype(str).tolist()

# Taken from this cell's own test_df so the cell does not depend on a
# variable left behind by an earlier cell.
test_ids = test_df['id']

# Encode labels (the encoder is fitted on the training labels only)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_dev = label_encoder.transform(dev_labels)
num_classes = len(label_encoder.classes_)

# One-hot encode the integer class ids to match the categorical loss below
y_train = to_categorical(y_train, num_classes)
y_dev = to_categorical(y_dev, num_classes)

# Tokenization and Padding
max_words = 10000  # Maximum vocabulary size
max_len = 100  # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

X_train = pad_sequences(tokenizer.texts_to_sequences(train_texts), maxlen=max_len, padding='post', truncating='post')
X_dev = pad_sequences(tokenizer.texts_to_sequences(dev_texts), maxlen=max_len, padding='post', truncating='post')
X_test = pad_sequences(tokenizer.texts_to_sequences(test_texts), maxlen=max_len, padding='post', truncating='post')

# CNN Model for Text Classification
model = Sequential([
    # `input_length` is intentionally omitted: it is deprecated in Keras 3,
    # and the sequence length is taken from the input data instead.
    Embedding(input_dim=max_words, output_dim=128),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train Model
model.fit(X_train, y_train, validation_data=(X_dev, y_dev), epochs=5, batch_size=32)

# Predict on test data
test_predictions = model.predict(X_test)
test_pred_labels = np.argmax(test_predictions, axis=1)

# Save Predictions (class ids are mapped back to the original label strings)
output_df = pd.DataFrame({'id': test_ids, 'predicted_label': label_encoder.inverse_transform(test_pred_labels)})
output_df.to_csv('/content/test_predictions_03.csv', index=False)
print("Predictions saved to test_predictions_03.csv")

# Classification Report (y_dev is one-hot here, so recover class ids with argmax)
y_dev_pred = np.argmax(model.predict(X_dev), axis=1)
y_dev_true = np.argmax(y_dev, axis=1)
print("Classification Report:")
print(classification_report(y_dev_true, y_dev_pred, target_names=label_encoder.classes_))


Epoch 1/5




[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 70ms/step - accuracy: 0.6157 - loss: 0.6683 - val_accuracy: 0.7116 - val_loss: 0.6214
Epoch 2/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 63ms/step - accuracy: 0.8088 - loss: 0.4448 - val_accuracy: 0.7814 - val_loss: 0.4763
Epoch 3/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 66ms/step - accuracy: 0.9720 - loss: 0.1192 - val_accuracy: 0.7738 - val_loss: 0.5826
Epoch 4/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 65ms/step - accuracy: 0.9900 - loss: 0.0348 - val_accuracy: 0.7954 - val_loss: 0.7101
Epoch 5/5
[1m173/173[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 67ms/step - accuracy: 0.9931 - loss: 0.0205 - val_accuracy: 0.7942 - val_loss: 0.8219
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step
Predictions saved to test_predictions.csv
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step
Class

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# ---------------------------------------------------------------------------
# TF-IDF + linear-kernel SVM baseline: fit on train, report on dev,
# write test predictions to CSV.
# ---------------------------------------------------------------------------

# Read the train, test and dev splits
train_df, test_df, dev_df = map(
    pd.read_csv,
    ('/content/train.csv', '/content/test.csv', '/content/dev.csv'),
)

# Pull the 'text' and 'label' columns out as lists of str
train_texts, test_texts, dev_texts = (
    frame['text'].astype(str).tolist() for frame in (train_df, test_df, dev_df)
)
train_labels, dev_labels = (
    frame['label'].astype(str).tolist() for frame in (train_df, dev_df)
)

# Integer-encode the labels; the mapping is learned from the training labels
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
y_train = label_encoder.transform(train_labels)
y_dev = label_encoder.transform(dev_labels)
num_classes = len(label_encoder.classes_)

# TF-IDF features; the vocabulary is learned from the training texts
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(train_texts)
X_dev_tfidf, X_test_tfidf = (
    vectorizer.transform(docs) for docs in (dev_texts, test_texts)
)

# Linear-kernel SVM
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Score both held-out splits
dev_predictions = svm_model.predict(X_dev_tfidf)
test_predictions = svm_model.predict(X_test_tfidf)

# Write test predictions with the original label strings restored
predicted_names = label_encoder.inverse_transform(test_predictions)
output_df = pd.DataFrame({'id': test_df['id'], 'predicted_label': predicted_names})
output_df.to_csv('/content/test_predictions_svm.csv', index=False)
print("Predictions saved to test_predictions_svm.csv")

# Dev-set evaluation
print("Classification Report on Dev Data:")
dev_report = classification_report(
    y_dev,
    dev_predictions,
    target_names=label_encoder.classes_,
)
print(dev_report)


Predictions saved to test_predictions_svm.csv
Classification Report on Dev Data:
              precision    recall  f1-score   support

           0       0.72      0.86      0.78       485
           1       0.67      0.48      0.56       302

    accuracy                           0.71       787
   macro avg       0.70      0.67      0.67       787
weighted avg       0.70      0.71      0.70       787



In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, create_optimizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping

# ---------------------------------------------------------------------------
# Data loading, label encoding and tokenizer setup for the multilingual BERT
# classifier.
# ---------------------------------------------------------------------------

# Read the train, test and dev splits
train_df, test_df, dev_df = map(
    pd.read_csv,
    ('/content/train.csv', '/content/test.csv', '/content/dev.csv'),
)

# Texts and labels as lists of str
train_texts, test_texts, dev_texts = (
    frame['text'].astype(str).tolist() for frame in (train_df, test_df, dev_df)
)
train_labels, dev_labels = (
    frame['label'].astype(str).tolist() for frame in (train_df, dev_df)
)

# Integer-encode the labels; the mapping is learned from the training labels
label_encoder = LabelEncoder()
label_encoder.fit(train_labels)
y_train = label_encoder.transform(train_labels)
y_dev = label_encoder.transform(dev_labels)
num_classes = len(label_encoder.classes_)

# Pretrained checkpoint name and its matching tokenizer
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Tokenize input texts
def tokenize_data(texts, tokenizer, max_len=128):
    """Run *tokenizer* over *texts* with fixed-length padding and truncation.

    Args:
        texts: Passed through unchanged as the tokenizer's first argument.
        tokenizer: Callable invoked as ``tokenizer(texts, **options)``.
        max_len: Value forwarded as ``max_length``.

    Returns:
        Whatever the tokenizer call returns, requested as TensorFlow tensors
        via ``return_tensors='tf'``.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'tf',
    }
    return tokenizer(texts, **encode_options)

# ---------------------------------------------------------------------------
# Fine-tune the pretrained BERT classifier, report on dev, and write test
# predictions to CSV.
# ---------------------------------------------------------------------------

# Training hyperparameters, defined once so that the dataset batching, the
# learning-rate schedule length and fit() cannot drift apart.
epochs = 3
batch_size = 32

train_encodings = tokenize_data(train_texts, tokenizer)
dev_encodings = tokenize_data(dev_texts, tokenizer)
test_encodings = tokenize_data(test_texts, tokenizer)

# Create TensorFlow datasets
# The shuffle buffer covers the whole training set so the shuffle is uniform
# rather than limited to a sliding window of examples.
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    y_train
)).shuffle(len(y_train)).batch(batch_size)

dev_dataset = tf.data.Dataset.from_tensor_slices((
    dict(dev_encodings),
    y_dev
)).batch(batch_size)

# Test set has no labels, and is left unshuffled so predictions line up with test_df rows
test_dataset = tf.data.Dataset.from_tensor_slices(dict(test_encodings)).batch(batch_size)

# Load model with a classification head sized to the number of label classes
model = TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)

# Optimizer setup: the schedule length is the total number of training batches
train_steps = len(train_dataset) * epochs
optimizer, _ = create_optimizer(init_lr=2e-5, num_train_steps=train_steps, num_warmup_steps=0)

# Compile model
# NOTE(review): no `loss` is passed, which presumably relies on the
# Transformers TF model falling back to its built-in task loss -- confirm
# for the installed transformers version.
model.compile(optimizer=optimizer, metrics=['accuracy'])

# Train model
model.fit(
    train_dataset,
    validation_data=dev_dataset,
    epochs=epochs  # Must match the value used for train_steps above
)


# Predict on dev set
dev_preds = model.predict(dev_dataset).logits
dev_pred_labels = np.argmax(dev_preds, axis=1)

# Evaluate
print("Classification Report (Dev):")
print(classification_report(y_dev, dev_pred_labels, target_names=label_encoder.classes_))

# Predict on test set and map class ids back to the original label strings
test_preds = model.predict(test_dataset).logits
test_pred_labels = np.argmax(test_preds, axis=1)
test_labels_str = label_encoder.inverse_transform(test_pred_labels)

# Save test predictions
output_df = pd.DataFrame({
    'id': test_df['id'],
    'predicted_label': test_labels_str
})
output_df.to_csv('/content/test_predictions_mbert.csv', index=False)
print("Predictions saved to test_predictions_mbert.csv")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3
Classification Report (Dev):
              precision    recall  f1-score   support

           0       0.72      0.89      0.79       485
           1       0.71      0.44      0.54       302

    accuracy                           0.72       787
   macro avg       0.71      0.66      0.67       787
weighted avg       0.71      0.72      0.70       787

Predictions saved to test_predictions_mbert.csv
