**Q1) Sentiment analysis using deep learning**

This question uses following dataset of Urdu sentiment analysis. The class labels are P(positive)
and N (Negative)

https://github.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/blob/master/urdu-sentiment-corpus-v1.tsv

Implement following sequence based deep learning models for the same task of sentiment
analysis. Perform binary text classification.


RNN
GRU
LSTM
BiLSTM

You can implement these models in Keras or Pytorch. Split the data into train and test set. Use
75% for training and 25% for testing.

For each of these models, try following hyper parameters and report the best results with
parameter values.
Number of layers = 2 or 3.
Dropout rate, 0.3 or 0.7
So you will have 2 *2 = 4 different sets of parameters.

Calculate accuracy, Precision, Recall and F-score for all classifiers and report the results in table.
Also report parameter values which were used to get the results.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Load the data
url = 'https://raw.githubusercontent.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/master/urdu-sentiment-corpus-v1.tsv'
# stop_words = 'https://raw.githubusercontent.com/Delta-Sigma/urdu-stopwords/master/urdu_stopwords.txt'
# # load in stopwords
# stop_words = np.loadtxt(stop_words, dtype='str',encoding='utf-8')

df = pd.read_csv(url, delimiter='\t',encoding='utf-8')

df.head()


Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,P
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,N
2,ٹویٹر کا خیال کیسے آیا ؟,O
3,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",P
4,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,P


In [2]:
# stop_words_utf8 = stop_words.astype(str)
# stop_words_list = stop_words_utf8.tolist()
# stop_words_list


In [3]:
import keras
from keras.models import Sequential
from keras.layers import Dense

# get gpu context

from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

# model fit example for nvidia gpu
# import tensorflow as tf
# with tf.device('/gpu:0'):
#     model.fit(X_train, y_train, epochs=100)


[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14290295583784140550
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 3710910464
locality {
  bus_id: 1
  links {
  }
}
incarnation: 7345258031397774955
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9"
xla_global_id: 416903419
]


In [4]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

df.rename(columns={'Tweet': 'text', 'Class': 'label'}, inplace=True)

# Split data into train and test sets
X = df["text"]
y = df["label"]

max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
data = pad_sequences(tokenizer.texts_to_sequences(X), maxlen=max_len)
labels = LabelEncoder().fit_transform(y)

# Ensure labels are integers
# labels is a dataframe
labels = labels.astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.2, random_state=42)


In [5]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(800, 100)
(200, 100)
(800,)
(200,)


In [6]:
print(X_train.dtype)
print(X_test.dtype)
print(y_train.dtype)
print(y_test.dtype)

int32
int32
int32
int32


In [7]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout, Activation, Input, LSTM, GRU, Bidirectional

In [8]:
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.layers import GlobalMaxPooling1D

hyperparameter_combinations = [
    # comb 1
    {"num_layers": 2, "dropout_rate": 0.3},
    
    # comb 2
    {"num_layers": 2, "dropout_rate": 0.7},
    
    # comb 3
    {"num_layers": 3, "dropout_rate": 0.3},
    
    # comb 4
    {"num_layers": 3, "dropout_rate": 0.7}
]

models = [SimpleRNN, LSTM, GRU, Bidirectional(LSTM(units=64, return_sequences=True))]

results = {}

for models in models:
    for hyperparameters in hyperparameter_combinations:
        model = Sequential()
        model.add(Embedding(input_dim=10000, output_dim=100, input_length=100))
        for i in range(hyperparameters["num_layers"]):
            model.add(models(128, return_sequences=True))
            model.add(Dropout(hyperparameters["dropout_rate"]))
        model.add(GlobalMaxPooling1D())  # This will convert the output shape from (None, 100, 64) to (None, 64)
        model.add(Dense(1, activation="sigmoid"))
        model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
        model.summary()
        
        if models == Bidirectional(LSTM(units=64, return_sequences=True)):
            epoch   = 10
            batch   = 32
            model.fit(X_train, y_train, epochs=epoch, batch_size=batch, validation_data=(X_test, y_test))
            
        # X_train_pad = pad_sequences(X_train, maxlen=100, padding="post")
        # X_test_pad = pad_sequences(X_test, maxlen=100, padding="post")
        
        with tf.device('/gpu:0'):
            model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test))
        
        # Make predictions
        scorer, acc = model.evaluate(X_test, y_test)
        # Calculate the classification report
        print('Scorer:', scorer)
        print('Accuracy:', acc)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          1000000   
                                                                 
 simple_rnn (SimpleRNN)      (None, 100, 128)          29312     
                                                                 
 dropout (Dropout)           (None, 100, 128)          0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 100, 128)          32896     
                                                                 
 dropout_1 (Dropout)         (None, 100, 128)          0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                        

ValueError: Input 0 of layer "bidirectional" is incompatible with the layer: expected ndim=3, found ndim=0. Full shape received: ()

This question uses same dataset of Q 1. Perform the task of binary classification on the dataset.
Choose one classifier from deep learning models implemented in Question 1 based on best
results on F-measure for binary classification.
Use following embedding for vector representation and report the results. You need to train
the embeddings yourself on the given Urdu dataset.
1) WordToVec

https://mccormickml.com/2016/04/12/googles-pretrained-word2vec-model-in-python/
2) Glove
https://nlp.stanford.edu/projects/glove/

https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-
python-d38905f356db

3) Fasttext

https://blogs.sap.com/2019/07/03/glove-and-fasttext-two-popular-word-vector-
models-in-nlp/

https://fasttext.cc/docs/en/english-vectors.html
4) Elmo (it creates embeddings for sentences, so use entire tweet as imput to get the
vector) https://github.com/HIT-SCIR/ELMoForManyLangs

Calculate accuracy, Precision, Recall and F-score for all classifiers and report the results in
tables. For example, if LSTM (use any fixed hyper-parameters that gave best results) had best
overall results among deep learning models in your assignment 1 then make the following table
of results.

LSTM
(without
embeddings)

LSTM with
WordToVec

LSTM with
Glove

LSTM with
Fasttext

LSTM with
Elmo

F-score
Accuracy
Precision
Recall

In [3]:
!pip install --use-pep517 pybind11



In [2]:
!pip install gensim



In [4]:
!pip install --use-pep517 fasttext

Collecting fasttext
  Using cached fasttext-0.9.2.tar.gz (68 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'error'


  error: subprocess-exited-with-error
  
  × Getting requirements to build wheel did not run successfully.
  │ exit code: 1
  ╰─> [28 lines of output]
      c:\Users\ammar\anaconda3\python.exe: No module named pip
      Traceback (most recent call last):
        File "<string>", line 38, in __init__
      ModuleNotFoundError: No module named 'pybind11'
      
      During handling of the above exception, another exception occurred:
      
      Traceback (most recent call last):
        File "c:\Users\ammar\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 353, in <module>
          main()
        File "c:\Users\ammar\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_process.py", line 335, in main
          json_out['return_val'] = hook(**hook_input['kwargs'])
                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "c:\Users\ammar\anaconda3\Lib\site-packages\pip\_vendor\pyproject_hooks\_in_process\_in_pr

In [5]:
!pip install --use-pep517 allennlp

  error: subprocess-exited-with-error
  
  × pip subprocess to install build dependencies did not run successfully.
  │ exit code: 1
  ╰─> [339 lines of output]
      Collecting setuptools
        Using cached setuptools-69.2.0-py3-none-any.whl.metadata (6.3 kB)
      Collecting wheel<0.33.0,>0.32.0
        Using cached wheel-0.32.3-py2.py3-none-any.whl.metadata (2.1 kB)
      Collecting Cython
        Using cached Cython-3.0.10-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
      Collecting cymem<2.1.0,>=2.0.2
        Using cached cymem-2.0.8-cp311-cp311-win_amd64.whl.metadata (8.6 kB)
      Collecting preshed<2.1.0,>=2.0.1
        Using cached preshed-2.0.1.tar.gz (113 kB)
        Preparing metadata (setup.py): started
        Preparing metadata (setup.py): finished with status 'done'
      Collecting murmurhash<1.1.0,>=0.28.0
        Using cached murmurhash-1.0.10-cp311-cp311-win_amd64.whl.metadata (2.0 kB)
      Collecting thinc<7.1.0,>=7.0.8
        Using cached thinc-7.0.8.tar.gz (1

Collecting allennlp
  Using cached allennlp-2.10.1-py3-none-any.whl.metadata (21 kB)
INFO: pip is looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
  Using cached allennlp-2.10.0-py3-none-any.whl.metadata (20 kB)
  Using cached allennlp-2.9.3-py3-none-any.whl.metadata (19 kB)
  Using cached allennlp-2.9.2-py3-none-any.whl.metadata (19 kB)
  Using cached allennlp-2.9.1-py3-none-any.whl.metadata (19 kB)
  Using cached allennlp-2.9.0-py3-none-any.whl.metadata (18 kB)
  Using cached allennlp-2.8.0-py3-none-any.whl.metadata (17 kB)
  Using cached allennlp-2.7.0-py3-none-any.whl.metadata (17 kB)
INFO: pip is still looking at multiple versions of allennlp to determine which version is compatible with other requirements. This could take a while.
  Using cached allennlp-2.6.0-py3-none-any.whl.metadata (17 kB)
  Using cached allennlp-2.5.0-py3-none-any.whl.metadata (17 kB)
  Using cached allennlp-2.4.0-py3-none-any

In [11]:
!tar -xvzf glove.6B.zip

x glove.6B.50d.txt
x glove.6B.100d.txt
x glove.6B.200d.txt
x glove.6B.300d.txt


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from gensim.models import Word2Vec, KeyedVectors
import fasttext.util
from allennlp.commands.elmo import ElmoEmbedder

# Load the data
url = 'https://raw.githubusercontent.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/master/urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(url, delimiter='\t')

# Split the data into training and testing sets
train_data, test_data = train_test_split(df, test_size=0.25, random_state=42)

# Tokenize the text data
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['text'])

# Convert the text data into sequences of integers
train_sequences = tokenizer.texts_to_sequences(train_data['text'])
test_sequences = tokenizer.texts_to_sequences(test_data['text'])

# Pad the sequences
train_data = pad_sequences(train_sequences, maxlen=100)
test_data = pad_sequences(test_sequences, maxlen=100)

# Train a Word2Vec model
word2vec = Word2Vec(train_data['text'], size=100, window=5, min_count=1, workers=4)

# Load the GloVe embeddings
glove = KeyedVectors.load_word2vec_format('glove.6B.100d.txt', binary=False)

# Train a FastText model
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# Initialize the ELMo embedder
elmo = ElmoEmbedder()

# Create the LSTM model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=100))
model.add(Bidirectional(LSTM(64, return_sequences=True)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(train_data, train_labels, epochs=10, validation_data=(test_data, test_labels))

# Evaluate the model
accuracy, precision, recall, f_score = evaluate(model, test_data)