In [22]:
import pandas as pd

# Read the training split of SetFit/tweet_sentiment_extraction via an hf:// URL.
# `splits` maps each split name to its JSON-lines file inside the dataset repo.
splits = {'train': 'train.jsonl', 'test': 'test.jsonl'}
dataset_root = "hf://datasets/SetFit/tweet_sentiment_extraction/"
train_path = dataset_root + splits["train"]
train_df = pd.read_json(train_path, lines=True)

# Sanity check: show the first five rows and the columns that were loaded.
preview = train_df.head()
print(preview)


       textID                                               text  label  \
0  cb774db0d1                I`d have responded, if I were going      1   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!      0   
2  088c60f138                          my boss is bullying me...      0   
3  9642c003ef                     what interview! leave me alone      0   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...      0   

  label_text  
0    neutral  
1   negative  
2   negative  
3   negative  
4   negative  


In [23]:
# Count the missing entries in every column of the training frame.
# `isna` is the canonical spelling of `isnull`; both return the same boolean mask.
missing_mask = train_df.isna()
null_values = missing_mask.sum()
print("Null values in each column:\n", null_values)


Null values in each column:
 textID        0
text          0
label         0
label_text    0
dtype: int64


In [24]:
# Mark every row that exactly repeats an earlier row, then count the marks.
duplicate_mask = train_df.duplicated()
duplicates = duplicate_mask.sum()
print("Number of duplicate rows: ", duplicates)

# Drop the repeated rows, keeping the first occurrence of each (pandas' default).
train_df = train_df.drop_duplicates(keep='first')


Number of duplicate rows:  0


In [25]:
from sklearn.preprocessing import LabelEncoder

# Derive integer class ids from the sentiment names in 'label_text'.
# NOTE: this always overwrites the 'label' column that came with the dataset;
# there is no check for whether the column is already encoded.
label_encoder = LabelEncoder()
label_encoder.fit(train_df['label_text'])
train_df['label'] = label_encoder.transform(train_df['label_text'])

# Preview the name -> id pairing for the first few rows.
preview_columns = ['label_text', 'label']
print(train_df[preview_columns].head())


  label_text  label
0    neutral      1
1   negative      0
2   negative      0
3   negative      0
4   negative      0


In [31]:
from transformers import AutoTokenizer

# Tokenizer matching the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def _encode_text(text):
    """Return the token ids for one text, truncated or padded to 128 entries."""
    return tokenizer.encode(text, truncation=True, padding='max_length', max_length=128)


# Store one fixed-length list of token ids per row.
train_df['input_ids'] = train_df['text'].apply(_encode_text)

# Preview the raw text next to its token ids.
preview_columns = ['text', 'input_ids']
print(train_df[preview_columns].head())


                                                text  \
0                I`d have responded, if I were going   
1      Sooo SAD I will miss you here in San Diego!!!   
2                          my boss is bullying me...   
3                     what interview! leave me alone   
4   Sons of ****, why couldn`t they put them on t...   

                                           input_ids  
0  [101, 1045, 1036, 1040, 2031, 5838, 1010, 2065...  
1  [101, 17111, 2080, 6517, 1045, 2097, 3335, 201...  
2  [101, 2026, 5795, 2003, 18917, 2033, 1012, 101...  
3  [101, 2054, 4357, 999, 2681, 2033, 2894, 102, ...  
4  [101, 4124, 1997, 1008, 1008, 1008, 1008, 1010...  


In [32]:
import tensorflow as tf

# Build the tf.data training pipeline from the tokenized frame.
#
# Every row of 'input_ids' was padded to the same fixed length, so the nested
# list converts to a rectangular integer tensor.
input_ids = tf.constant(train_df['input_ids'].tolist())
labels = tf.constant(train_df['label'].tolist())

# Mark real tokens with 1 and padding with 0. Without an attention mask the
# model treats the padding positions as real input and attends to them.
# Assumes the pad id only occurs at padding positions - TODO confirm no tweet
# contains the literal pad token.
attention_mask = tf.cast(tf.not_equal(input_ids, tokenizer.pad_token_id), tf.int32)

# Features are passed as a dict keyed by the model's input names, so Keras
# `fit` routes both tensors to the model; labels stay the second tuple element.
features = {'input_ids': input_ids, 'attention_mask': attention_mask}
train_dataset = tf.data.Dataset.from_tensor_slices((features, labels))

# Shuffle across the whole dataset (buffer covers every row), then batch by 32.
train_dataset = train_dataset.shuffle(buffer_size=len(train_df)).batch(32)


In [33]:
from transformers import TFDistilBertForSequenceClassification

# Pretrained DistilBERT encoder with a 3-class sequence-classification head.
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# The loss is configured with from_logits=True, i.e. it expects raw scores.
adam = tf.keras.optimizers.Adam(learning_rate=3e-5)
sparse_ce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=adam, loss=sparse_ce, metrics=['accuracy'])

# Fine-tune for three passes over the training dataset.
model.fit(train_dataset, epochs=3)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/3
  7/859 [..............................] - ETA: 5:31:22 - loss: 1.0986 - accuracy: 0.3884

KeyboardInterrupt: 