## Import Library

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import torch

# Download necessary NLTK resources
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Read and understand data 

In [15]:
file_train_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/train_2kmZucJ.csv'
file_unseen_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/test_oJQbWVk.csv'
data = pd.read_csv(file_train_path)
unseen_df = pd.read_csv(file_unseen_path)

data.head(), data.shape


(   id  label                                              tweet
 0   1      0  #fingerprint #Pregnancy Test https://goo.gl/h1...
 1   2      0  Finally a transparant silicon case ^^ Thanks t...
 2   3      0  We love this! Would you go? #talk #makememorie...
 3   4      0  I'm wired I know I'm George I was made that wa...
 4   5      1  What amazing service! Apple won't even talk to...,
 (7920, 3))

## Preprocessing

In [16]:
def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['cleaned_text'] = data['tweet'].apply(preprocess_text)
unseen_df['cleaned_text'] = unseen_df['tweet'].apply(preprocess_text)
data

Unnamed: 0,id,label,tweet,cleaned_text
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...,fingerprint pregnancy test android apps beauti...
1,2,0,Finally a transparant silicon case ^^ Thanks t...,finally transparant silicon case thanks uncle ...
2,3,0,We love this! Would you go? #talk #makememorie...,love would go talk makememories unplug relax i...
3,4,0,I'm wired I know I'm George I was made that wa...,im wired know im george made way iphone cute d...
4,5,1,What amazing service! Apple won't even talk to...,amazing service apple wont even talk question ...
...,...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...,live loud lol liveoutloud selfie smile sony mu...
7916,7917,0,We would like to wish you an amazing day! Make...,would like wish amazing day make every minute ...
7917,7918,0,Helping my lovely 90 year old neighbor with he...,helping lovely 90 year old neighbor ipad morni...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...,finally got smart pocket wifi stay connected a...


## Train test split

In [17]:
# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], test_size=0.2, random_state=42)

## Tokenization and embedding

In [None]:
# 3. Tokenization and Encoding (Using DistilBERT - a cost-efficient transformer)
model_name = "distilbert-base-uncased" # or "roberta-base" if you have more compute
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
unseen_encodings = tokenizer(unseen_df['cleaned_text'].tolist(), truncation=True, padding=True)

{'input_ids': [[101, 4658, 2482, 9378, 2801, 1996, 2483, 3122, 2924, 14854, 8524, 24335, 29067, 2100, 10957, 10957, 11442, 4710, 25157, 4140, 27125, 26887, 2431, 22591, 13469, 5070, 11968, 3207, 2860, 4913, 5297, 2258, 18656, 19102, 18059, 28205, 2015, 2482, 13154, 1048, 2290, 18798, 16059, 4202, 26760, 10128, 2102, 2600, 2316, 13272, 5152, 2974, 6786, 24330, 2102, 9148, 12079, 9006, 4160, 4160, 2692, 2581, 3723, 15378, 2595, 2015, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 6302, 20198, 5798, 8412, 3328, 2386, 6343, 10299, 15536, 2480, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 25249, 2015, 5221, 8632, 8239, 4774, 2215, 5466, 8239, 2518, 8239, 2813, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

## Creating dataset

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.tolist()[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train.reset_index(drop=True))
test_dataset = SentimentDataset(test_encodings, y_test.reset_index(drop=True))
# unseen_dataset = SentimentDataset(unseen_encodings, y_test.reset_index(drop=True))

## Model Building

In [20]:
# 4. Model Training (DistilBERT)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: positive, negative, neutral

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Adjust epochs as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels,preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2118,0.256623,0.891414,0.895096,0.909981,0.891414
2,0.1417,0.307426,0.896465,0.896759,0.897107,0.896465
3,0.1008,0.394461,0.897727,0.899024,0.901496,0.897727


TrainOutput(global_step=2376, training_loss=0.251212659037926, metrics={'train_runtime': 137.5084, 'train_samples_per_second': 138.232, 'train_steps_per_second': 17.279, 'total_flos': 383599310485248.0, 'train_loss': 0.251212659037926, 'epoch': 3.0})

In [21]:
# 5. Model Evaluation
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print("Evaluation Metrics:")
print(compute_metrics(predictions))

Evaluation Metrics:
{'accuracy': 0.8977272727272727, 'f1': 0.8990240379816941, 'precision': 0.9014963416201048, 'recall': 0.8977272727272727}


## Inference

In [None]:
# 6. Inference (Example)
def predict_sentiment(text, model, tokenizer, device): #add device parameter
    cleaned_text = preprocess_text(text)
    inputs = tokenizer(cleaned_text, truncation=True, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()} #move input to device
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
    return predicted_class

# Get the device the model is on
device = next(model.parameters()).device

# example_text = "This is a great example."
# predicted_sentiment = predict_sentiment(example_text, model, tokenizer, device) #pass device to function
# print(f"Predicted sentiment for '{example_text}': {predicted_sentiment}")

# example_text2 = "This is the worst example."
# predicted_sentiment2 = predict_sentiment(example_text2, model, tokenizer, device)
# print(f"Predicted sentiment for '{example_text2}': {predicted_sentiment2}")


Predicted sentiment for 'This is a great example.': 0
Predicted sentiment for 'This is the worst example.': 1


In [None]:
unseen_df['predictions'] = unseen_df['tweet'].apply(lambda x: predict_sentiment(x, model, tokenizer, device))
unseen_df.head(), unseen_df['predictions'].sum(), unseen_df.shape


## Submission


In [33]:
submission_df = unseen_df[['id', 'predictions']]
submission_df = submission_df.rename(columns={'predictions': 'label'})
file_submission_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/submission_1.csv'
submission_df.to_csv(file_submission_path, index=False)

## RoBERTa (Robustly Optimized BERT Pretraining Approach)

In [35]:
file_train_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/train_2kmZucJ.csv'
file_unseen_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/test_oJQbWVk.csv'
data = pd.read_csv(file_train_path)
unseen_df = pd.read_csv(file_unseen_path)

def preprocess_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = text.lower()
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

data['cleaned_text'] = data['tweet'].apply(preprocess_text)
unseen_df['cleaned_text'] = unseen_df['tweet'].apply(preprocess_text)

# 2. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], test_size=0.2, random_state=42)


In [36]:
# 3. Tokenization and Encoding (Using DistilBERT - a cost-efficient transformer)
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [37]:
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True)
unseen_encodings = tokenizer(unseen_df['cleaned_text'].tolist(), truncation=True, padding=True)

In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.tolist()[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, y_train.reset_index(drop=True))
test_dataset = SentimentDataset(test_encodings, y_test.reset_index(drop=True))

# 4. Model Training (DistilBERT)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3) # 3 labels: positive, negative, neutral

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,  # Adjust epochs as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels,preds, average='weighted')
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2798,0.540416,0.855429,0.845317,0.854963,0.855429
2,0.3027,0.336981,0.89899,0.899348,0.899789,0.89899
3,0.2743,0.295646,0.902146,0.903049,0.904556,0.902146


TrainOutput(global_step=2376, training_loss=0.3564734248701571, metrics={'train_runtime': 272.2569, 'train_samples_per_second': 69.816, 'train_steps_per_second': 8.727, 'total_flos': 800983021918464.0, 'train_loss': 0.3564734248701571, 'epoch': 3.0})

In [40]:
# 5. Model Evaluation
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print("Evaluation Metrics:")
print(compute_metrics(predictions))

# 6. Inference (Example)
def predict_sentiment(text, model, tokenizer, device): #add device parameter
    cleaned_text = preprocess_text(text)
    inputs = tokenizer(cleaned_text, truncation=True, padding=True, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()} #move input to device
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()
    return predicted_class

# Get the device the model is on
device = next(model.parameters()).device

unseen_df['predictions'] = unseen_df['tweet'].apply(lambda x: predict_sentiment(x, model, tokenizer, device))

submission_df = unseen_df[['id', 'predictions']]
submission_df = submission_df.rename(columns={'predictions': 'label'})
file_submission_path = '/home/pc/Desktop_linux/chinu/vidya_analytics_hacathon/sentiment_analysis/submission_2.csv'
submission_df.to_csv(file_submission_path, index=False)

Evaluation Metrics:
{'accuracy': 0.9021464646464646, 'f1': 0.903048934669488, 'precision': 0.9045561497326203, 'recall': 0.9021464646464646}
