In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB

In [3]:
from datasets import load_dataset
# Load the tweet sentiment corpus; the result is indexed by split name below.
dataset = load_dataset("carblacac/twitter-sentiment-analysis")
# From the training split, take the tweet bodies and the 'feeling' column
# (used as the classification target further down).
text, feeling = (dataset['train'][column] for column in ('text', 'feeling'))

Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/149985 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Generating train split:   0%|          | 0/119988 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29997 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    text,
    feeling,
    random_state=3113,
    test_size=0.2,
)
# Report how many samples landed on each side of the split.
print(f"{len(x_train)} {len(x_test)}")

95990 23998


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Baseline: TF-IDF features feeding a multinomial Naive Bayes classifier.
# Pipeline.fit returns the pipeline itself, so `pipe` is the fitted estimator.
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB()).fit(x_train, y_train)
# Score the held-out tweets and print per-class precision / recall / F1.
y_pred = pipe.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.72      0.84      0.77     11960
           1       0.81      0.68      0.74     12038

    accuracy                           0.76     23998
   macro avg       0.76      0.76      0.76     23998
weighted avg       0.76      0.76      0.76     23998



In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline

# Same TF-IDF front end, this time with a support-vector classifier left at
# its library-default settings.
# NOTE: this rebinds `pipe` and `y_pred`, which the Naive Bayes cell also uses.
# NOTE(review): scikit-learn documents SVC fit time as growing at least
# quadratically with the sample count — confirm this cell's runtime is
# acceptable, or consider a linear SVM.
pipe = make_pipeline(
    TfidfVectorizer(),
    SVC(),
)
pipe.fit(X=x_train, y=y_train)
# Evaluate on the same held-out tweets as the baseline.
y_pred = pipe.predict(X=x_test)
print(classification_report(y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.80      0.79      0.80     11960
           1       0.80      0.80      0.80     12038

    accuracy                           0.80     23998
   macro avg       0.80      0.80      0.80     23998
weighted avg       0.80      0.80      0.80     23998



In [7]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [8]:
def preprocess(data):
  """Return a new dict holding only the 'text' and 'feeling' entries of ``data``.

  ``data`` must support lookup by both keys; a failed lookup (e.g. a
  ``KeyError`` for a plain dict) propagates to the caller.
  """
  return {column: data[column] for column in ('text', 'feeling')}
# Run preprocess over the two splits the neural model below consumes.
train_data, test_data = (
    dataset[split_name].map(preprocess) for split_name in ('train', 'test')
)

Map:   0%|          | 0/119988 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [9]:
# Build the word index from the training tweets only.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts=train_data['text'])
# One slot more than the number of indexed words; presumably index 0 is left
# free for padding — confirm against the Keras Tokenizer docs.
vocab_size = 1 + len(tokenizer.word_index)

In [10]:
# Encode every tweet as a list of integer word ids using the fitted tokenizer
# (train first, then test).
train_seq, test_seq = (
    tokenizer.texts_to_sequences(split['text']) for split in (train_data, test_data)
)

In [11]:
max_len = 32  # target length handed to pad_sequences for every tweet

# Bring both sets of id sequences to the common length `max_len`.
trainp = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=train_seq,
    maxlen=max_len,
)
testp = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=test_seq,
    maxlen=max_len,
)

In [12]:
embed_len = 32  # width of each learned word vector

# Assemble the network one layer at a time:
# embedding -> two stacked bidirectional LSTMs -> dense head.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embed_len, input_length=max_len))
# return_sequences=True so the second recurrent layer receives a full sequence.
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Bidirectional(LSTM(32)))
model.add(Dense(64, activation='relu'))
# Single sigmoid unit, paired with the binary cross-entropy loss below.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [13]:
import numpy
# Label arrays for the neural model.
# NOTE: this rebinds y_train / y_test, which the scikit-learn split cell also
# assigns; re-run that cell before re-running the sklearn pipelines.
y_train = numpy.array(train_data['feeling'])
y_test = numpy.array(test_data['feeling'])

# Validate on the dataset's dedicated validation split instead of the test
# split, so the test data stays unseen until final evaluation. The validation
# tweets go through the same tokenizer and padding as the training tweets.
# NOTE(review): assumes the split is exposed as dataset['validation'] (the
# load log reports a validation split of 29,997 rows) — confirm.
val_split = dataset['validation']
valp = tf.keras.preprocessing.sequence.pad_sequences(
    tokenizer.texts_to_sequences(val_split['text']),
    maxlen=max_len,
)
y_val = numpy.array(val_split['feeling'])

model.fit(trainp, y_train, epochs=4, validation_data=(valp, y_val))

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x7a7a82b79f60>