<a href="https://colab.research.google.com/github/DaryaJavadi/data_science/blob/main/projects/twitter_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/jp797498e/twitter-entity-sentiment-analysis?dataset_version_number=2...


100%|██████████| 1.99M/1.99M [00:00<00:00, 3.36MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2





In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(path + "/twitter_training.csv")
df

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
...,...,...,...,...
74676,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74677,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74678,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74679,9200,Nvidia,Positive,Just realized between the windows partition of...


In [None]:
df.rename(columns={
    '2401': 'm',
    'Borderlands': 'n',
    'Positive': 'Sentiment',
    'im getting on borderlands and i will murder you all ,': 'Sentence'
}, inplace=True)

In [None]:
df['Sentiment'].value_counts()

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Negative,22542
Positive,20831
Neutral,18318
Irrelevant,12990


In [None]:
df.dropna(subset=['Sentiment', 'Sentence'], inplace=True)

In [None]:
from sklearn.model_selection import train_test_split

X = df['Sentence']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("all", X.shape, y.shape)
print("train", X_train.shape, y_train.shape)
print("test", X_test.shape, y_test.shape)

all (73995,) (73995,)
train (66595,) (66595,)
test (7400,) (7400,)


In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
text_pipeline = make_pipeline(
    TfidfVectorizer(
        stop_words='english',
        max_features=1000
        ),
)

X_transformed = text_pipeline.fit_transform(X)
X_train_transformed = text_pipeline.fit_transform(X_train)
X_test_transformed = text_pipeline.transform(X_test)

X_transformed.shape, X_train_transformed.shape, X_test_transformed.shape,

((73995, 1000), (66595, 1000), (7400, 1000))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_clf.fit(X_train_transformed, y_train)
rf_clf.score(X_test_transformed, y_test)

0.8475675675675676

In [None]:
from sklearn.metrics import classification_report
from sklearn.calibration import cross_val_predict

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)
cv_predictions = cross_val_predict(rf_clf, X_transformed, y, cv=3)

print(classification_report(y, cv_predictions))

              precision    recall  f1-score   support

  Irrelevant       0.22      0.15      0.18     12875
    Negative       0.47      0.50      0.48     22358
     Neutral       0.46      0.40      0.43     18108
    Positive       0.44      0.55      0.49     20654

    accuracy                           0.43     73995
   macro avg       0.40      0.40      0.39     73995
weighted avg       0.41      0.43      0.42     73995



# with RNN

In [None]:
import tensorflow as tf

vocab_size = 10000
max_len = 200

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(df['Sentence'].astype(str))

In [None]:
sequences = tokenizer.texts_to_sequences(df['Sentence'].astype(str))

padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=max_len)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
labels = le.fit_transform(df['Sentiment'])

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.1, random_state=42)

x_train = tf.keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_len)
x_test = tf.keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_len)

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    tf.keras.layers.Dense(1, activation='softmax')
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(x_train, y_train, batch_size=64, epochs=5, validation_split=0.2)

test_loss, test_acc = model.evaluate(x_test, y_test)
print(f"Test Accuracy: {test_acc:.4f}")



Epoch 1/5


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 623ms/step - accuracy: 0.3018 - loss: 0.0000e+00 - val_accuracy: 0.3019 - val_loss: 0.0000e+00
Epoch 2/5
[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m536s[0m 643ms/step - accuracy: 0.3053 - loss: 0.0000e+00 - val_accuracy: 0.3019 - val_loss: 0.0000e+00
Epoch 3/5
[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m561s[0m 642ms/step - accuracy: 0.3049 - loss: 0.0000e+00 - val_accuracy: 0.3019 - val_loss: 0.0000e+00
Epoch 4/5
[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 644ms/step - accuracy: 0.3048 - loss: 0.0000e+00 - val_accuracy: 0.3019 - val_loss: 0.0000e+00
Epoch 5/5
[1m833/833[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m564s[0m 647ms/step - accuracy: 0.3025 - loss: 0.0000e+00 - val_accuracy: 0.3019 - val_loss: 0.0000e+00
[1m232/232[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 109ms/step - accuracy: 0.2923 - loss: 0.0000e+00
Test Accuracy: 0.2968


# With attentions

In [None]:
import tensorflow as tf

num_classes = len(df['label'].unique())

encoder_inputs = tf.keras.Input(shape=(max_len,))
encoder_embedding = tf.keras.layers.Embedding(vocab_size, 128)(encoder_inputs)
encoder_lstm = tf.keras.layers.LSTM(64, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = tf.keras.Input(shape=(max_len,))
decoder_embedding = tf.keras.layers.Embedding(vocab_size, 128)(decoder_inputs)
decoder_lstm = tf.keras.layers.LSTM(64, return_sequences=False, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

decoder_dense = tf.keras.layers.Dense(num_classes, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)
num_classes = len(df['label'].unique())

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

decoder_dense = tf.keras.layers.Dense(num_classes, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

encoder_input_data = padded_sequences
decoder_input_data = padded_sequences

decoder_target_data = tf.keras.utils.to_categorical(df['label_encoded'][:len(encoder_input_data)], num_classes=num_classes)

model.fit([encoder_input_data, decoder_input_data], decoder_target_data, epochs=5)

Epoch 1/5
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 17ms/step - accuracy: 0.4582 - loss: 1.2407
Epoch 2/5
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 19ms/step - accuracy: 0.6490 - loss: 0.9679
Epoch 3/5
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 17ms/step - accuracy: 0.7204 - loss: 0.8244
Epoch 4/5
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 17ms/step - accuracy: 0.7571 - loss: 0.7457
Epoch 5/5
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 17ms/step - accuracy: 0.7782 - loss: 0.6985


<keras.src.callbacks.history.History at 0x7c5e7af83b10>

In [None]:
label_map = df.drop_duplicates('label').sort_values('label')['label'].tolist()

sentence = 'i '
tokens = tokenizer(sentence, max_length=200, padding='max_length', truncation=True, return_tensors='tf')

pred = model.predict([tokens['input_ids'], tokens['attention_mask']])
predicted_index = int(np.argmax(pred))

print("Predicted sentiment:", label_map[predicted_index])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Predicted sentiment: Neutral


# With hugging face

In [None]:
!pip install -q datasets transformers
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from transformers import BertTokenizer, TFBertForSequenceClassification, DataCollatorWithPadding
import tensorflow as tf
from datasets import Dataset
import numpy as np
import pyarrow as pa  # Import pyarrow for type checking

# Modified _arrow_array_to_numpy function
from datasets.formatting.formatting import NumpyArrowExtractor

class SafeNumpyArrowExtractor(NumpyArrowExtractor):
    def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
        """
        Handles the conversion from Arrow arrays to NumPy arrays, ensuring proper handling of object
        and string types to avoid potential issues.
        """
        if pa_array.type == pa.string():
            return np.array([x.as_py() if not x.is_null else '' for x in pa_array], dtype=object)
        array = pa_array.to_pylist()
        if all(isinstance(x, bytes) for x in array):
            return np.array([x.decode('utf-8') for x in array], dtype=object)
        if all(x is None or isinstance(x, str) for x in array):
            array = [x if x is not None else '' for x in array]
            return np.array(array, dtype=object, copy=False)
        return np.asarray(array, copy=False)

# Replace the default NumpyArrowExtractor with the SafeNumpyArrowExtractor
# from datasets.formatting.formatting import _TYPE_TO_ARROW_FORMATTER_CLS

# _TYPE_TO_ARROW_FORMATTER_CLS["numpy"] = SafeNumpyArrowExtractor
# _TYPE_TO_ARROW_FORMATTER_CLS["python"] = SafeNumpyArrowExtractor

# Load dataset
# # Replace 'path' with the actual path to your dataset
# path = '/root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis'
# df = pd.read_csv(path + "/twitter_training.csv", names=['user_id', 'topic', 'label', 'text'])
df.dropna(subset=['text', 'label'], inplace=True)

# Encode labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])

# Split data
from sklearn.model_selection import train_test_split
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Load tokenizer
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)

# Convert to Hugging Face Dataset
train_ds = Dataset.from_pandas(pd.DataFrame({'text': X_train, 'label_encoded': y_train.map(lambda x: le.transform([x])[0])}))
test_ds = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'label_encoded': y_test.map(lambda x: le.transform([x])[0])}))

# Tokenize
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=128)

train_ds = train_ds.map(tokenize, batched=True)
test_ds = test_ds.map(tokenize, batched=True)

# Format for TensorFlow
train_ds.set_format(type="tensorflow", columns=['input_ids', 'attention_mask', 'label_encoded'])
test_ds.set_format(type="tensorflow", columns=['input_ids', 'attention_mask', 'label_encoded'])

# Convert to tf.data.Dataset
train_tf = train_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols="label_encoded",
    shuffle=True,
    batch_size=16,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf"),
)
test_tf = test_ds.to_tf_dataset(
    columns=['input_ids', 'attention_mask'],
    label_cols="label_encoded",
    shuffle=False,
    batch_size=64,
    collate_fn=DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf"),
)

# Load model
model = TFBertForSequenceClassification.from_pretrained(checkpoint, num_labels=4)

# Compile model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

# Train model
model.fit(train_tf, validation_data=test_tf, epochs=3)

# Evaluate
y_true = []
y_pred = []

for batch in test_tf:
    labels = batch[1].numpy()
    preds = model.predict(batch[0]).logits.argmax(axis=1)
    y_true.extend(labels)
    y_pred.extend(preds)

print(classification_report(y_true, y_pred, target_names=le.classes_))

# Prediction function
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors="tf", padding=True, truncation=True, max_length=128)
    logits = model(inputs).logits
    pred = tf.argmax(logits, axis=1).numpy()[0]
    return le.inverse_transform([pred])[0]

# Example predictions
print(predict_sentiment("This is awesome!"))
print(predict_sentiment("I hate this thing."))
print(predict_sentiment("Let me check again later."))

Map:   0%|          | 0/66596 [00:00<?, ? examples/s]

Map:   0%|          | 0/7400 [00:00<?, ? examples/s]

ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.