In [1]:
!pip install transformers datasets torch



In [2]:
import pandas as pd
import numpy as np

# Raw GitHub URL of the tweet sentiment CSV; the file is decoded as latin-1.
ds_url = "https://raw.githubusercontent.com/1ghofrane1/sentiment-analysis/main/test_emotion.csv"
df = pd.read_csv(filepath_or_buffer=ds_url, encoding="latin-1")
df.head(5)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [3]:
# Keep only the model input ("text") and the target ("sentiment").
# .copy() yields an independent frame, so later cleaning steps never operate
# on a slice of the originally loaded table (avoids SettingWithCopyWarning).
df = df[['text', 'sentiment']].copy()
df.head()

Unnamed: 0,text,sentiment
0,Last session of the day http://twitpic.com/67ezh,neutral
1,Shanghai is also really exciting (precisely -...,positive
2,"Recession hit Veronique Branquinho, she has to...",negative
3,happy bday!,positive
4,http://twitpic.com/4w75p - I like it!!,positive


In [4]:
print("Sentiment distribution:")
print(df['sentiment'].value_counts())

# Missing values per column, before and after dropping incomplete rows.
print(df.isna().sum())
# Reassign rather than dropna(inplace=True): df is derived from a column
# selection, and in-place mutation of such a frame can raise
# SettingWithCopyWarning.
df = df.dropna()
print(df.isna().sum())

# De-duplicate on the tweet text BEFORE splitting, so an identical text cannot
# end up in more than one of the train/validation/test splits.
df = df.drop_duplicates(subset=["text"])
print(f"\nClean dataset size: {len(df)}")


Sentiment distribution:
sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64
text         1281
sentiment    1281
dtype: int64
text         0
sentiment    0
dtype: int64

Clean dataset size: 3534


In [5]:
from sklearn.model_selection import train_test_split

# 80% train, then the remaining 20% is halved into validation and test.
# Stratifying on the label keeps the sentiment proportions the same in every
# split, so validation/test metrics are not skewed by a lucky or unlucky draw.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df["sentiment"]
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df["sentiment"]
)

print(f"\nTraining samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")
print(f"Test samples: {len(test_df)}")


Training samples: 2827
Test samples: 354


In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments

# Base checkpoint to fine-tune: DistilBERT (uncased), picked as a smaller model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
from datasets import Dataset, ClassLabel

# Wrap each pandas split in a Hugging Face Dataset.
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

# Label encoder: class ids follow the order in which sentiments first appear in df.
class_labels = ClassLabel(names=df['sentiment'].unique().tolist())

def preprocess_function(examples, max_length=128):
    """Tokenize a batch of texts and attach integer sentiment labels.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(..., batched=True)``;
            the "text" and "sentiment" columns are read.
        max_length: Number of tokens every sequence is padded or truncated to.
            Without an explicit value, ``padding="max_length"`` pads each text
            to the model's maximum length, which is far longer than a tweet
            and makes every training step needlessly expensive.

    Returns:
        The tokenizer output with an extra "label" entry holding one class id
        per example.
    """
    # Coerce to str without mutating the incoming batch.
    texts = [str(text) for text in examples["text"]]
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
    # Map sentiment names to the integer ids defined by class_labels.
    tokenized["label"] = [class_labels.str2int(s) for s in examples["sentiment"]]
    return tokenized

# Tokenize and label every split.
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Expose only the model inputs and the label, formatted as PyTorch tensors.
for tokenized_split in (tokenized_train, tokenized_test, tokenized_val):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/2827 [00:00<?, ? examples/s]

Map:   0%|          | 0/354 [00:00<?, ? examples/s]

Map:   0%|          | 0/353 [00:00<?, ? examples/s]

In [8]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# Distinct sentiment names, in order of first appearance in df.
sentiment_names = list(df['sentiment'].unique())
num_labels = len(sentiment_names)

# Two-way mapping between class index and sentiment name.
id2label = dict(enumerate(sentiment_names))
label2id = {name: idx for idx, name in id2label.items()}

# Use the GPU when one is available, then load the checkpoint with a
# sequence-classification head sized for our labels.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

# Define training arguments.
# fp16 mixed precision needs a CUDA device, and the fused AdamW implementation
# is not available on CPU in every PyTorch release; enable both only when a GPU
# is present so this cell also runs on a CPU-only runtime.
use_cuda = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./sentiment_results",
    # NOTE(review): newer transformers releases call this `eval_strategy` —
    # confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    fp16=use_cuda,
    warmup_steps=200,  # learning-rate warmup steps
    weight_decay=0.01,
    logging_steps=100,
    report_to="none",
    optim="adamw_torch_fused" if use_cuda else "adamw_torch",
)

# Define metrics computation
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Score one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each example is the arg-max over the last axis of the logits.

    Returns:
        dict with "accuracy" and the weighted-average "f1_score".
    """
    raw_scores, gold = eval_pred
    predicted = np.argmax(raw_scores, axis=-1)
    return {
        "accuracy": accuracy_score(gold, predicted),
        "f1_score": f1_score(gold, predicted, average='weighted'),
    }

# Create the Trainer: trains on the training split and evaluates on the
# validation split with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    compute_metrics=compute_metrics,
)

# Start fine-tuning
print(f"Starting training on {device}...")
try:
    trainer.train()
except torch.cuda.OutOfMemoryError:
    # Report the batch size actually in use; the previous hint suggested the
    # value that had just run out of memory.
    print(
        "CUDA OOM! Try reducing per_device_train_batch_size below "
        f"{training_args.per_device_train_batch_size}"
    )
    raise

# ============ SAVE THE MODEL ============
# Write the fine-tuned weights first, then the tokenizer files, into the same
# directory so the pair can be reloaded together.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model")

print("Model saved to './sentiment_model' directory")

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training on cpu...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
!pip install gradio

In [None]:
import gradio as gr
from transformers import pipeline
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the fine-tuned checkpoint (model + tokenizer) from the save directory.
model_path = "./sentiment_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Device argument for pipeline(): GPU index 0 when CUDA is available, -1 for CPU.
pipeline_device = 0 if torch.cuda.is_available() else -1

# Text-classification pipeline wrapping the reloaded model.
sentiment_analyzer = pipeline(
    task="text-classification",
    model=model,
    tokenizer=tokenizer,
    device=pipeline_device,
)

# Display style per sentiment label: an emoji plus a hex accent colour.
# Extend the tuple list if the model is trained with more sentiment classes.
SENTIMENT_STYLES = {
    label: {"emoji": emoji, "color": color}
    for label, emoji, color in (
        ("positive", "😊", "#4CAF50"),  # green
        ("negative", "😞", "#F44336"),  # red
        ("neutral", "😐", "#FFC107"),  # yellow
    )
}

def analyze_sentiment(text):
    """Classify ``text`` and render the result as a styled HTML snippet.

    Args:
        text: The user-supplied string from the input textbox.

    Returns:
        An HTML string. For empty, whitespace-only or ``None`` input a short
        prompt is returned and the model is not called. Otherwise the snippet
        shows the emoji, the lower-cased predicted label and the confidence,
        coloured according to SENTIMENT_STYLES (black, no emoji, for labels
        that have no entry there).
    """
    # Nothing to classify: do not present a model verdict for blank input.
    if text is None or not text.strip():
        return "<p>Please enter some text to analyze.</p>"

    # Get prediction from the model (first and only result for a single string)
    result = sentiment_analyzer(text)[0]
    sentiment = result['label'].lower()
    confidence = result['score']

    # Get the appropriate style for the detected sentiment
    style = SENTIMENT_STYLES.get(sentiment, {"emoji": "", "color": "#000000"})
    color = style['color']
    emoji = style['emoji']

    # Appending "20" to the 6-digit hex colour adds an alpha channel, giving a
    # translucent tint of the accent colour for the background.
    output = f"""
    <div style='background-color: {color}20;
                padding: 15px;
                border-radius: 5px;
                border-left: 5px solid {color};'>
        <span style='font-size: 24px;'>{emoji}</span>
        <h3 style='color: {color}; margin-top: 5px;'>This text is {sentiment}</h3>
        <p>Confidence: {confidence:.2%}</p>
    </div>
    """

    return output

def show_examples():
    """Return the sample sentences offered as clickable examples in the UI."""
    sample_sentences = (
        "I love this product! It's amazing!",
        "The service was terrible and slow.",
        "The package arrived on time, nothing special.",
        "This is the best day of my life!",
        "I'm really disappointed with the quality.",
    )
    return list(sample_sentences)

# Page-level CSS. It is handed to gr.Blocks through its `css` argument instead
# of being assigned to `demo.css` after the Blocks object has been built.
CUSTOM_CSS = """
    .gradio-container {
        max-width: 800px;
        margin: 0 auto;
    }
    .example-container {
        margin-top: 20px;
    }
    """

# Create the Gradio interface: input + examples on the left, result on the right.
with gr.Blocks(title="Sentiment Analysis", theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
    gr.Markdown("# Sentiment Analysis with Emojis 🎭")
    gr.Markdown("Enter some text to analyze its sentiment (positive, negative, or neutral)")

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(label="Input Text", placeholder="Type something here...")
            analyze_btn = gr.Button("Analyze Sentiment", variant="primary")

            # Example section: clicking an example fills the input textbox.
            gr.Markdown("### Try these examples:")
            gr.Examples(
                examples=show_examples(),
                inputs=[input_text],
                label="Click any example to analyze it"
            )

        with gr.Column():
            output_html = gr.HTML(label="Sentiment Analysis Result")

    # Run the classifier on the textbox content and render the HTML result.
    analyze_btn.click(
        fn=analyze_sentiment,
        inputs=[input_text],
        outputs=[output_html]
    )

# Launch the interface
demo.launch()