In [31]:
!pip install kagglehub
!pip install nltk



In [55]:
# Imports come first: the NLTK downloads below need `nltk` to be defined,
# otherwise a fresh kernel raises NameError before anything is downloaded.
import os
import string

import kagglehub
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

BATCH_SIZE = 64

# NLTK resources needed by the tokenizer, POS tagger, lemmatizer and
# stop-word filter used later in the notebook.
NLTK_RESOURCES = (
    "punkt",
    "punkt_tab",
    "wordnet",
    "stopwords",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
)

# Best effort: a failed download is reported (with the resource name) but
# does not abort the cell, and the remaining resources are still attempted.
for resource in NLTK_RESOURCES:
    try:
        downloaded = nltk.download(resource)
    except Exception as exc:
        print(f"Downloading NLTK resource '{resource}' failed: {exc}")
    else:
        # nltk.download() reports most failures through its return value.
        if not downloaded:
            print(f"Downloading NLTK resource '{resource}' failed")

# Download latest version of the dataset and read the training split.
# The CSV has no header row, so the column names are supplied here.
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")
_ds = pd.read_csv(
    os.path.join(path, "twitter_training.csv"),
    header=None,
    names=["Category", "Page", "Target", "Text"],
)
print(_ds.shape)
_ds.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Using Colab cache for faster access to the 'twitter-entity-sentiment-analysis' dataset.
(74682, 4)


Unnamed: 0,Category,Page,Target,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [33]:
# Store the tweet text with pandas' dedicated "string" dtype instead of object.
_ds["Text"] = _ds["Text"].astype("string")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  74682 non-null  int64 
 1   Page      74682 non-null  object
 2   Target    74682 non-null  object
 3   Text      73996 non-null  string
dtypes: int64(1), object(2), string(1)
memory usage: 2.3+ MB
None


# Exercise with a sample text

In [34]:
# English sample sentence that the following cells tokenize, tag and lemmatize.
text_en = "i love learn more about artificial intelligence."
# Persian sample sentence; it is defined here but not used by this cell.
text = ".من عاشق یادگیری بیشتر درباره هوش مصنوعی هستم"

# Split the English sentence into word and punctuation tokens.
tokens = word_tokenize(text_en)
print(tokens)

['i', 'love', 'learn', 'more', 'about', 'artificial', 'intelligence', '.']


In [35]:
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    Only the first character of ``tag`` is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.

    Args:
        tag: POS tag string, e.g. one produced by ``nltk.pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant. Any other tag, including an
        empty string, falls back to ``wordnet.NOUN``.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # tag[:1] (instead of tag[0]) keeps an empty tag from raising IndexError;
    # it simply misses the table and gets the noun default.
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)

# POS-tag the sample tokens, then swap each tag for the WordNet constant
# returned by get_wordnet_pos so the lemmatizer can consume it later.
tagged_words = [
    (token, get_wordnet_pos(pos_tag))
    for token, pos_tag in nltk.pos_tag(tokens)
]

print(tagged_words)

[('i', 'n'), ('love', 'v'), ('learn', 'r'), ('more', 'a'), ('about', 'n'), ('artificial', 'a'), ('intelligence', 'n'), ('.', 'n')]


In [36]:
# Keep only purely alphabetic tokens that are not English stop words
# (the stop-word check is case-insensitive).
english_stopwords = set(stopwords.words('english'))
filtered_words = [
    (word, tag)
    for word, tag in tagged_words
    if word.lower() not in english_stopwords and word.isalpha()
]
print(filtered_words)

[('love', 'v'), ('learn', 'r'), ('artificial', 'a'), ('intelligence', 'n')]


In [37]:
# Lemmatize every remaining token, passing its WordNet part of speech so the
# lemmatizer picks the right base form.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [
    lemmatizer.lemmatize(token, pos=wordnet_pos)
    for token, wordnet_pos in filtered_words
]
print(lemmatized_words)

['love', 'learn', 'artificial', 'intelligence']


# Let's build the project

In [38]:
import keras
import tensorflow as tf
from keras.layers import Embedding, Dense, Dropout, BatchNormalization, Input, TextVectorization
from keras.models import Model

In [39]:
# Keep only the tweet text and its sentiment label, dropping rows where
# either of the two is missing.
dataset = _ds[["Text", "Target"]].dropna()
print(dataset["Target"].value_counts())

Target
Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: count, dtype: int64


In [40]:
from sklearn.preprocessing import LabelEncoder

# Discard the "Irrelevant" class before encoding the remaining labels.
is_relevant = dataset["Target"] != "Irrelevant"
dataset = dataset[is_relevant]

# Fit the encoder on the remaining labels and record which integer each
# class name is mapped to.
label_encoder = LabelEncoder()
label_encoder.fit(dataset["Target"])
label_mapping = {
    class_name: class_id
    for class_name, class_id in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}

# Replace the string labels with their integer codes.
dataset["Target"] = label_encoder.transform(dataset["Target"])

print(label_mapping)

{'Negative': np.int64(0), 'Neutral': np.int64(1), 'Positive': np.int64(2)}


In [41]:
# Text -> integer-id layer: at most 25000 vocabulary entries, every output
# sequence fixed to 128 ids.
vectorize_layer = TextVectorization(max_tokens=25000, output_mode='int', output_sequence_length=128)

# Learn the vocabulary from the tweet texts.
vectorize_layer.adapt(dataset["Text"].values)

print(f"Vocabulary Size: {vectorize_layer.vocabulary_size()}")

Vocabulary Size: 25000


In [42]:
# The model is fed raw tweet text, so the input must be declared as one string
# per sample. The previous `Input(shape=(None,))` defaulted to float32, which
# does not match what TextVectorization expects.
input_layer = Input(shape=(1,), dtype=tf.string)
# Turn each string into a fixed-length sequence of token ids.
vectorizer_layer = vectorize_layer(input_layer)
# output_dim is a hyperparameter: the larger it is, the more parameters the
# embedding has and the more complex the model becomes.
embbed_layer = Embedding(input_dim=vectorize_layer.vocabulary_size(), output_dim=128)(vectorizer_layer)

## CNN & LSTM Channel

In [43]:
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D
# CNN branch: a Conv1D layer (128 filters, kernel size 3, ReLU) is applied to
# the embedding output.
cnn_path = Conv1D(filters=128, kernel_size=3, activation='relu')(embbed_layer)
# GlobalMaxPooling1D is then applied to the convolution output; the result is
# one of the two inputs of the Concatenate step further down.
cnn_path = GlobalMaxPooling1D()(cnn_path)

In [44]:
from keras.layers import LSTM
# LSTM branch: an LSTM layer with 128 units is applied to the same embedding
# output that feeds the CNN branch.
lstm_path = LSTM(128)(embbed_layer)
# Dropout with rate 0.4 is applied to the LSTM output.
lstm_path = Dropout(0.4)(lstm_path)

In [45]:
from keras.layers import Concatenate
# Join the CNN-branch and LSTM-branch outputs into a single tensor.
merged_path = Concatenate()([cnn_path, lstm_path])

In [46]:
# Fully connected layer (128 units, ReLU) on top of the merged branches.
dense_path = Dense(128, activation='relu')(merged_path)
# Final output layer: softmax over 3 units, one per label produced by the
# label encoder (Negative / Neutral / Positive).
output_layer = Dense(3, activation='softmax')(dense_path)
# Assemble the final model from the input layer to the softmax output.
model = Model(inputs=input_layer, outputs=output_layer)

## Compile Model

In [47]:
# Adam optimizer with sparse categorical cross-entropy: the targets are the
# integer class ids written into dataset["Target"] by the label encoder.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Print the layer-by-layer overview of the compiled model.
model.summary()

### Train/Test Split

In [56]:
from sklearn.model_selection import train_test_split
# Hold out 15% of the samples for testing; random_state is fixed so the same
# split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    dataset["Text"],
    dataset["Target"],
    test_size=0.15,
    random_state=42,
)

# Tensor versions of the text splits.
xtrain, xtest = tf.constant(x_train), tf.constant(x_test)

print(f"Train Size: {x_train.shape[0]}")
print(f"Test Size: {x_test.shape[0]}")

<_TakeDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>


In [59]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
# Both callbacks watch the validation loss: the learning rate is multiplied by
# 0.1 after 5 epochs without improvement, and training stops after 20 such
# epochs with the best weights restored.
callbacks = [
    EarlyStopping(
        monitor='val_loss',
        patience=20,
        restore_best_weights=True,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.1,
        patience=5,
    ),
]

# Train for up to 100 epochs, using 20% of the training data for validation.
model.fit(
    x_train,
    y_train,
    validation_split=0.2,
    batch_size=128,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100


ValueError: as_list() is not defined on an unknown TensorShape.

In [None]:
# Score the trained model on the held-out split. The labels are `y_test` from
# the train/test split cell; the name `ytest` is never defined in this
# notebook and would raise NameError.
model.evaluate(xtest, y_test)