## 1) Setup & Requirements

In [1]:
!pip install -q numpy pandas scikit-learn matplotlib seaborn nltk tqdm tensorflow torch transformers datasets evaluate sentencepiece streamlit wandb

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m8.0/10.1 MB[0m [31m241.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.1/10.1 MB[0m [31m245.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m140.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.9 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.9/6.9 MB[0m [31m217.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━

## 2) Dataset Loading and Cleaning

In [2]:
import pandas as pd
from pathlib import Path

# Single place to change when the CSVs live somewhere other than the Colab default.
DATA_DIR = Path("/content")
train_fp = DATA_DIR / "twitter_training.csv"
valid_fp = DATA_DIR / "twitter_validation.csv"

EXPECTED_COLS = ['Tweet ID','entity','sentiment','Tweet content']

def load_with_header_detection(fp, expected_cols=EXPECTED_COLS):
    """Read a CSV, assigning ``expected_cols`` when the file has no header row.

    The first row is treated as a header only if at least one of its cells is
    *exactly* (case- and whitespace-insensitively) one of ``expected_cols``.
    A plain substring search is not enough: a headerless file whose first
    tweet merely contains the word "entity" or "sentiment" would be mistaken
    for a headered file and lose its first data row to the column names.

    Parameters
    ----------
    fp : str or path-like
        Location of the CSV file.
    expected_cols : sequence of str
        Column names to look for, and to assign when no header is present.

    Returns
    -------
    pandas.DataFrame
        The parsed file; columns are the file's own header if one was
        detected, otherwise ``expected_cols``.
    """
    # nrows=0 parses just the first line, so the header probe does not read
    # the whole file a second time.
    first_row = pd.read_csv(fp, nrows=0).columns
    seen = {str(c).strip().lower() for c in first_row}
    wanted = {str(c).strip().lower() for c in expected_cols}

    if seen & wanted:
        df = pd.read_csv(fp)
        print(f"{fp} appears to have a valid header. Columns: {df.columns.tolist()}")
        return df

    print(f"{fp} DOES NOT look like it has proper headers. Re-reading with header=None and assigning expected columns.")
    return pd.read_csv(fp, header=None, names=expected_cols)

# Read both splits through the header-sniffing loader (train first, then
# validation), then show the resulting schema and a few rows of each.
train_df, valid_df = (load_with_header_detection(path) for path in (train_fp, valid_fp))

print("Train columns:", list(train_df.columns))
print("Valid columns:", list(valid_df.columns))
display(train_df.head(), valid_df.head())


/content/twitter_training.csv DOES NOT look like it has proper headers. Re-reading with header=None and assigning expected columns.
/content/twitter_validation.csv DOES NOT look like it has proper headers. Re-reading with header=None and assigning expected columns.
Train columns: ['Tweet ID', 'entity', 'sentiment', 'Tweet content']
Valid columns: ['Tweet ID', 'entity', 'sentiment', 'Tweet content']


Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


Unnamed: 0,Tweet ID,entity,sentiment,Tweet content
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [3]:
# Number of rows that repeat an earlier row in every column.
train_df.duplicated(keep="first").sum()

np.int64(2700)

In [4]:
# Keep the first occurrence of each fully duplicated row.
train_df = train_df.drop_duplicates()

In [5]:
# Missing-value count per column of the training split.
train_df.isnull().sum()

Unnamed: 0,0
Tweet ID,0
entity,0
sentiment,0
Tweet content,326


In [6]:
# Discard every training row that has a missing value in any column.
train_df = train_df.dropna()

In [7]:
# Number of validation rows that repeat an earlier row in every column.
valid_df.duplicated(keep="first").sum()

np.int64(0)

In [8]:
# Missing-value count per column of the validation split.
valid_df.isnull().sum()

Unnamed: 0,0
Tweet ID,0
entity,0
sentiment,0
Tweet content,0


In [9]:
# 'Tweet ID' is not used by any later cell, so remove it from both splits.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second execution is a no-op rather than a KeyError.
train_df = train_df.drop(columns=['Tweet ID'], errors='ignore')
valid_df = valid_df.drop(columns=['Tweet ID'], errors='ignore')

In [10]:
# Preview the first rows of each split after the clean-up steps above.
display(train_df.head(), valid_df.head())

Unnamed: 0,entity,sentiment,Tweet content
0,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,Borderlands,Positive,I am coming to the borders and I will kill you...
2,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,Borderlands,Positive,im coming on borderlands and i will murder you...
4,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


Unnamed: 0,entity,sentiment,Tweet content
0,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,Google,Neutral,Now the President is slapping Americans in the...


## 3) Preprocessing the Text & Merging the entity and content together

In [11]:
import re
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def clean_text(text, stopwords=None):
    """Normalise one tweet into a space-separated string of lowercase tokens.

    Steps, in order: lowercase; remove URLs; remove @mentions; remove the '#'
    sign (the hashtag word is kept); replace everything outside ``[a-z0-9]``
    and whitespace with a space; drop stop words.

    Parameters
    ----------
    text : object
        Raw text. Missing values (``None`` / NaN) yield an empty string.
    stopwords : collection of str, optional
        Tokens to drop. Defaults to the module-level ``STOPWORDS`` set.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if pd.isna(text):
        return ""
    if stopwords is None:
        stopwords = STOPWORDS

    text = str(text).lower()
    # `www` must start a word and be followed by a dot; the unanchored
    # `www\S+` used to eat the tail of words such as "awwww".
    text = re.sub(r'http\S+|\bwww\.\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = text.replace('#', '')
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    # str.split() already collapses runs of whitespace and trims both ends.
    return " ".join(tok for tok in text.split() if tok not in stopwords)

label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}

def map_label(x):
    """Convert a raw sentiment value to its integer class id.

    Matching is done on the first three letters, case-insensitively, after
    trimming whitespace: 'neg…' -> 0, 'neu…' -> 1, 'pos…' -> 2. Missing
    values and any other string map to the sentinel -1.
    """
    if pd.isna(x):
        return -1
    s = str(x).strip().lower()
    for prefix, class_id in (('neg', 0), ('neu', 1), ('pos', 2)):
        if s.startswith(prefix):
            return class_id
    # The former `label_map.get(x, -1)` fallback could never hit: every key of
    # label_map starts with one of the prefixes tested above.
    return -1

if train_fp.exists() and valid_fp.exists():
    for split_df in (train_df, valid_df):
        # Prefix the tweet with its entity to form the model input text.
        split_df['text'] = split_df['entity'].astype(str).str.strip() + " " + split_df['Tweet content'].astype(str)
        split_df['text_clean'] = split_df['text'].apply(clean_text)
        split_df['label'] = split_df['sentiment'].apply(map_label)
    print("Label distribution (train):")
    display(train_df['label'].value_counts())
    display(train_df[['entity','Tweet content','text_clean','label']].head(5))
else:
    # Name the paths that were actually checked.
    print(f"Dataset files missing — expected {train_fp} and {valid_fp}.")


Label distribution (train):


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,21698
2,19713
1,17708
-1,12537


Unnamed: 0,entity,Tweet content,text_clean,label
0,Borderlands,im getting on borderlands and i will murder yo...,borderlands im getting borderlands murder,2
1,Borderlands,I am coming to the borders and I will kill you...,borderlands coming borders kill,2
2,Borderlands,im getting on borderlands and i will kill you ...,borderlands im getting borderlands kill,2
3,Borderlands,im coming on borderlands and i will murder you...,borderlands im coming borderlands murder,2
4,Borderlands,im getting on borderlands 2 and i will murder ...,borderlands im getting borderlands 2 murder,2


In [12]:
# Restrict both splits to rows labelled 0, 1 or 2 (this removes the -1 sentinel rows).
train_df, valid_df = (
    split_df[split_df['label'].isin([0, 1, 2])] for split_df in (train_df, valid_df)
)


## 4) Tokenizer + LSTM Model Training

In [13]:
# LSTM training
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout , Bidirectional ,BatchNormalization ,LayerNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

import tensorflow as tf
import pickle
# Vectoriser settings: vocabulary cap and fixed sequence length.
MAX_WORDS = 20000
MAX_LEN = 120

# Cleaned text and integer labels for each split.
X_train_text, X_val_text = (
    split_df['text_clean'].astype(str).tolist() for split_df in (train_df, valid_df)
)
y_train, y_val = train_df['label'].values, valid_df['label'].values

# The vocabulary is fitted on the training text only.
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(X_train_text)

# Words -> integer ids, then pad/truncate at the end to exactly MAX_LEN ids.
X_train_seq, X_val_seq = map(tokenizer.texts_to_sequences, (X_train_text, X_val_text))
X_train, X_val = (
    pad_sequences(seqs, maxlen=MAX_LEN, padding='post', truncating='post')
    for seqs in (X_train_seq, X_val_seq)
)

vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)
print("Unique labels:", np.unique(y_train))
print(train_df['label'].value_counts())


Unique labels: [0 1 2]
label
0    21698
2    19713
1    17708
Name: count, dtype: int64


In [14]:



# Seed Python, NumPy and TensorFlow so weight init, dropout and shuffling are
# repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(42)

num_classes = len(np.unique(y_train))
print("Num classes detected:", num_classes)
assert num_classes == 3, f"Expected 3 classes but got {num_classes}"

# sparse_categorical_crossentropy expects integer class ids in [0, num_classes).
y_train = y_train.astype('int32')
y_val = y_val.astype('int32')
assert y_train.min() >= 0 and y_train.max() < num_classes
assert y_val.min() >= 0 and y_val.max() < num_classes

MAX_WORDS = 20000
MAX_LEN = 120
vocab_size = min(MAX_WORDS, len(tokenizer.word_index) + 1)

model = Sequential()
# An explicit Input replaces the deprecated `input_length` Embedding argument
# and builds the model up front, so model.summary() can report shapes/params.
model.add(tf.keras.Input(shape=(MAX_LEN,), dtype="int32", name="tokens"))
model.add(Embedding(input_dim=vocab_size, output_dim=128, name="embedding"))

# First BiLSTM returns the full sequence so the second one can consume it.
model.add(Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=l2(1e-5),
                            recurrent_regularizer=l2(1e-5)), name="bilstm_1"))
model.add(LayerNormalization(name="ln_1"))
model.add(Dropout(0.3, name="drop_1"))

# Second BiLSTM collapses the sequence to a single vector.
model.add(Bidirectional(LSTM(64, kernel_regularizer=l2(1e-5), recurrent_regularizer=l2(1e-5)), name="bilstm_2"))
model.add(LayerNormalization(name="ln_2"))
model.add(Dropout(0.3, name="drop_2"))

model.add(Dense(64, activation='relu', kernel_regularizer=l2(1e-5), name="dense_1"))
model.add(Dropout(0.25, name="drop_3"))
model.add(Dense(num_classes, activation='softmax', name="output"))

opt = Adam(learning_rate=1e-4, clipnorm=1.0)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

callbacks = [
    # Stop after 4 epochs without val_loss improvement and roll back to the best weights.
    EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True),
    # Persist the best-val_loss model to disk as training progresses.
    ModelCheckpoint(filepath = "lstm_best.h5", save_best_only=True, monitor='val_loss'),
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# Persist the fitted tokenizer and the id -> label names next to the model so
# inference code can reproduce the exact text -> ids mapping.
with open('tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer, f)
with open('label_map.pkl','wb') as f:
    pickle.dump({'id2label':{0:'Negative',1:'Neutral',2:'Positive'}}, f)



In [15]:
# Write the in-memory Keras model to 'lstm_best.h5'. This is the same path the
# ModelCheckpoint callback in the training cell writes to, so the checkpoint
# file is overwritten here.
# NOTE(review): whether these weights equal the best-val_loss checkpoint depends on
# EarlyStopping's restore_best_weights behaviour in the installed Keras — confirm.
model.save('lstm_best.h5')

## 5) RoBERTa Model Training


In [16]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import json
import torch

# NOTE(review): this cell rebinds the globals `tokenizer` and `model`, which until
# now held the Keras tokenizer / LSTM. Later cells reload those from disk, but
# re-running an earlier cell after this one will see the Hugging Face objects.
# NOTE(review): the captured output shows a Weights & Biases login prompt during
# training; set TrainingArguments(report_to=...) explicitly if that is unwanted.
if 'train_df' in globals() and 'valid_df' in globals():
    # Defensive: make sure no sentinel (-1) labels reach the trainer.
    train_df_filtered = train_df[train_df['label'] != -1].copy()
    valid_df_filtered = valid_df[valid_df['label'] != -1].copy()

    # RoBERTa is fed the raw "entity + tweet" text, not the stop-word-stripped version.
    ds_train = Dataset.from_pandas(train_df_filtered[['text','label']])
    ds_valid = Dataset.from_pandas(valid_df_filtered[['text','label']])
    ds = DatasetDict({'train': ds_train, 'validation': ds_valid})

    MODEL_NAME = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    def preprocess_fn(batch):
        """Tokenize a batch of texts, truncating to 128 tokens (padding is left to the collator)."""
        return tokenizer(batch['text'], truncation=True, max_length=128)

    tokenized = ds.map(preprocess_fn, batched=True)
    tokenized = tokenized.rename_column("label", "labels")
    tokenized.set_format(type='torch', columns=['input_ids','attention_mask','labels'])

    id2label = {0:"Negative",1:"Neutral",2:"Positive"}
    label2id = {v:k for k,v in id2label.items()}

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3, id2label=id2label, label2id=label2id)

    args = TrainingArguments(
        output_dir="roberta_finetuned",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        num_train_epochs=2,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        fp16=torch.cuda.is_available()
    )

    # Pads each batch to its own longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    def compute_metrics(eval_pred):
        """Return accuracy and macro-F1 for a (logits, labels) evaluation pair."""
        logits, labels = eval_pred
        preds = np.argmax(logits, axis=-1)
        return {
            "accuracy": accuracy_score(labels, preds),
            "f1_macro": f1_score(labels, preds, average='macro')
        }

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized['train'],
        eval_dataset=tokenized['validation'],
        # `tokenizer=` is deprecated on Trainer in favour of `processing_class=`.
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model("roberta_finetuned")
    tokenizer.save_pretrained("roberta_finetuned")
    # Context manager guarantees the file is flushed and closed.
    with open("roberta_finetuned/id2label.json", "w") as fh:
        json.dump(id2label, fh)
else:
    print("Data not loaded; cannot fine-tune transformer.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/59119 [00:00<?, ? examples/s]

Map:   0%|          | 0/828 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbebohossam2004[0m ([33mbebohossam2004-cairo-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1 Macro
1,0.4636,0.243258,0.911836,0.912342
2,0.2964,0.144874,0.964976,0.965256


## 6) Evaluation & Comparison


In [17]:
from sklearn.metrics import classification_report
import numpy as np
import pickle
import json

# NOTE(review): both models are scored on the same validation split that was used
# for early stopping / best-checkpoint selection, so these numbers are optimistic.
if 'valid_df' in globals():
    target_names = ['Negative','Neutral','Positive']

    # LSTM eval
    try:
        from tensorflow.keras.models import load_model
        from tensorflow.keras.preprocessing.sequence import pad_sequences
        # pickle can execute arbitrary code: only load files written by this notebook.
        with open('tokenizer.pkl','rb') as f:
            tokenizer = pickle.load(f)
        model = load_model('lstm_best.h5')
        seqs = tokenizer.texts_to_sequences(valid_df['text_clean'].astype(str).tolist())
        # truncating='post' matches how the training sequences were cut
        # (the pad_sequences default is 'pre').
        X_val = pad_sequences(seqs, maxlen=120, padding='post', truncating='post')
        y_true = valid_df['label'].values
        y_pred = np.argmax(model.predict(X_val), axis=1)
        print("LSTM Classification Report:")
        print(classification_report(y_true, y_pred, labels=[0, 1, 2], target_names=target_names))
    except Exception as e:
        # Best-effort: report the reason and continue with the other model.
        print('LSTM eval skipped:', e)

    # RoBERTa eval
    try:
        import torch
        from transformers import AutoTokenizer, AutoModelForSequenceClassification
        tok = AutoTokenizer.from_pretrained("roberta_finetuned")
        model_r = AutoModelForSequenceClassification.from_pretrained("roberta_finetuned")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model_r.to(device)
        model_r.eval()
        texts = valid_df['text'].astype(str).tolist()

        # Score in mini-batches instead of one forward pass over the whole split,
        # using the same 128-token truncation as during fine-tuning.
        eval_batch_size = 32
        pred_chunks = []
        for start in range(0, len(texts), eval_batch_size):
            enc = tok(texts[start:start + eval_batch_size], truncation=True, max_length=128,
                      padding=True, return_tensors='pt').to(device)
            with torch.no_grad():
                out = model_r(**enc)
            pred_chunks.append(out.logits.argmax(dim=1).cpu().numpy())
        preds = np.concatenate(pred_chunks)

        print("RoBERTa Classification Report:")
        print(classification_report(valid_df['label'].values, preds, labels=[0, 1, 2], target_names=target_names))
    except Exception as e:
        print('RoBERTa eval skipped:', e)
else:
    print("Validation data missing.")




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step
LSTM Classification Report:
              precision    recall  f1-score   support

    Negative       0.97      0.97      0.97       266
     Neutral       0.97      0.97      0.97       285
    Positive       0.97      0.97      0.97       277

    accuracy                           0.97       828
   macro avg       0.97      0.97      0.97       828
weighted avg       0.97      0.97      0.97       828

RoBERTa Classification Report:
              precision    recall  f1-score   support

    Negative       0.99      0.97      0.98       266
     Neutral       0.96      0.95      0.96       285
    Positive       0.94      0.97      0.96       277

    accuracy                           0.96       828
   macro avg       0.97      0.97      0.97       828
weighted avg       0.97      0.96      0.97       828



## 7) Test with simple text

In [18]:
import numpy as np
import pandas as pd
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

model = load_model("lstm_best.h5")

# pickle can execute arbitrary code: only load files written by this notebook.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

with open("label_map.pkl", "rb") as f:
    label_map = pickle.load(f)
id2label = label_map["id2label"]

MAX_LEN = 120

manual_texts = [
    "I love this product! It's amazing ❤️",
    "This is the worst experience I've ever had.",
    "The service was okay, nothing special.",
    "I’m not sure how I feel about this one."
]

# The LSTM was fitted on clean_text() output, so ad-hoc inputs must go through
# the same cleaning before tokenisation; raw text would keep stop words,
# mentions and URLs that the model never saw during training.
# NOTE(review): training inputs were also prefixed with the entity name; these
# samples have none.
manual_clean = [clean_text(txt) for txt in manual_texts]
manual_seq = tokenizer.texts_to_sequences(manual_clean)
manual_pad = pad_sequences(manual_seq, maxlen=MAX_LEN, padding='post', truncating='post')
manual_pred = np.argmax(model.predict(manual_pad), axis=1)

print("\nManual sentence predictions:")
for txt, pred in zip(manual_texts, manual_pred):
    # int() turns the NumPy integer into a plain key for the id2label dict.
    print(f"→ {txt}\n   Predicted: {id2label[int(pred)]}")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 386ms/step

Manual sentence predictions:
→ I love this product! It's amazing ❤️
   Predicted: Positive
→ This is the worst experience I've ever had.
   Predicted: Negative
→ The service was okay, nothing special.
   Predicted: Neutral
→ I’m not sure how I feel about this one.
   Predicted: Neutral


## 8) Streamlit

In [19]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.4.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.4.0-py3-none-any.whl (25 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.4.0


In [20]:
import tensorflow as tf

# Show which TensorFlow release this runtime is using.
print(tf.version.VERSION)


2.19.0


In [21]:
from keras.models import load_model
# Load the saved LSTM without its compile state (optimizer/loss are not restored
# with compile=False) and write it back out under a .keras filename.
# 'model_fixed.keras' is the file the Streamlit app (app.py) loads.
model_edited = load_model("lstm_best.h5", compile=False)
model_edited.save("model_fixed.keras")


In [22]:
%%writefile app.py
import streamlit as st
import pickle, json
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

st.title("Entity-aware Twitter Sentiment")

entity = st.text_input("Entity (e.g. Apple, Samsung)")
tweet = st.text_area("Tweet content")
model_choice = st.selectbox("Model", ["LSTM (fast)", "RoBERTa (accurate)"])

if st.button("Predict"):
    text = (entity.strip() + " " + tweet.strip()).strip()
    if model_choice == "LSTM (fast)":
        try:
            tokenizer = pickle.load(open('tokenizer.pkl','rb'))
            model = load_model('model_fixed.keras',compile = False)
            seq = tokenizer.texts_to_sequences([text])
            X = pad_sequences(seq, maxlen=120, padding='post')
            probs = model.predict(X)[0]
            idx = int(np.argmax(probs))
            label = ['Negative','Neutral','Positive'][idx]
            st.write("Prediction:", label)
            st.write("Probabilities:", {['Negative','Neutral','Positive'][i]: float(probs[i]) for i in range(len(probs))})
        except Exception as e:
            st.error("LSTM model not ready: " + str(e))
    else:
        try:
            tokenizer = AutoTokenizer.from_pretrained('roberta_finetuned')
            model = AutoModelForSequenceClassification.from_pretrained('roberta_finetuned')
            enc = tokenizer([text], truncation=True, padding=True, return_tensors='pt')
            with torch.no_grad():
                out = model(**enc)
            probs = torch.nn.functional.softmax(out.logits, dim=1)[0].numpy()
            idx = int(np.argmax(probs))
            label = ['Negative','Neutral','Positive'][idx]
            st.write("Prediction:", label)
            st.write("Probabilities:", {['Negative','Neutral','Positive'][i]: float(probs[i]) for i in range(len(probs))})
        except Exception as e:
            st.error("RoBERTa model not ready: " + str(e))


Writing app.py


In [23]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never commit the ngrok authtoken to the notebook. The token that was
# previously hardcoded here must be treated as leaked and revoked in the ngrok
# dashboard. Read it from the NGROK_AUTHTOKEN environment variable, or prompt
# for it without echoing when the variable is not set.
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTHTOKEN)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [24]:
from pyngrok import ngrok

# Port shared by the Streamlit server and the ngrok tunnel.
STREAMLIT_PORT = 8501

# Presumably tears down any ngrok process left over from an earlier run of this cell.
ngrok.kill()

# Launch the Streamlit app in the background so the cell returns immediately.
get_ipython().system_raw(f'streamlit run app.py --server.port {STREAMLIT_PORT} &')

# Expose the local Streamlit port through an ngrok tunnel and show its address.
public_url = ngrok.connect(STREAMLIT_PORT)
print("Public URL:", public_url)

Public URL: NgrokTunnel: "https://imagistically-deepwater-manual.ngrok-free.dev" -> "http://localhost:8501"
