In [2]:
import pandas as pd

# Load dataset
df = pd.read_csv("training.1600000.processed.noemoticon.csv", 
                 encoding="ISO-8859-1", 
                 names=["target", "id", "date", "flag", "user", "text"])

# Drop unnecessary columns
df = df[["target", "text"]]

# Convert sentiment labels (0: Negative, 4: Positive) -> (0: Negative, 1: Positive)
df["target"] = df["target"].map({0: 0, 4: 1})

print(df.head())


   target                                               text
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1       0  is upset that he can't update his Facebook by ...
2       0  @Kenichan I dived many times for the ball. Man...
3       0    my whole body feels itchy and like its on fire 
4       0  @nationwideclass no, it's not behaving at all....


In [3]:
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"@\w+|\#", "", text)  # Remove mentions and hashtags
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

df["clean_text"] = df["text"].apply(preprocess_text)
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gupta\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


   target                                               text  \
0       0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1       0  is upset that he can't update his Facebook by ...   
2       0  @Kenichan I dived many times for the ball. Man...   
3       0    my whole body feels itchy and like its on fire    
4       0  @nationwideclass no, it's not behaving at all....   

                                          clean_text  
0      thats bummer shoulda got david carr third day  
1  upset cant update facebook texting might cry r...  
2  dived many times ball managed save rest go bounds  
3                   whole body feels itchy like fire  
4                           behaving im mad cant see  


In [7]:
from transformers import AutoTokenizer

MODEL_NAME = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples["clean_text"], padding="max_length", truncation=True, max_length=128)

# Convert dataframe to Hugging Face Dataset
from datasets import Dataset

dataset = Dataset.from_pandas(df)
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Remove the old text column (optional)
tokenized_datasets = tokenized_datasets.remove_columns(["text", "clean_text"])

# Rename "target" to "labels" (required for Hugging Face Trainer)
tokenized_datasets = tokenized_datasets.rename_column("target", "labels")

# Split into train and validation sets
split_dataset = tokenized_datasets.train_test_split(test_size=0.1)


Map: 100%|██████████| 1600000/1600000 [01:17<00:00, 20646.88 examples/s]


In [8]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  # Reduce for CPU
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    fp16=False,  # No mixed precision for CPU
)

# Create Trainer with the correctly tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
)

# Train the model
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
model.save_pretrained("sentiment140_model")
tokenizer.save_pretrained("sentiment140_tokenizer")


In [None]:
results = trainer.evaluate()
print(results)
