In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from gensim.models import Word2Vec
import pandas as pd
from collections import Counter
import numpy as np
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
class UrduSentimentDataset:
    """Load a tab-separated tweet corpus and encode it as padded integer sequences.

    The file must contain a ``Tweet`` column (text) and a ``Class`` column.
    ``Class`` values ``'P'`` and ``'N'`` are mapped to 1 and 0; every other
    value becomes NaN and is filtered out by :meth:`preprocess_data`.

    Attributes populated by the constructor:
        df: the loaded DataFrame, with ``Class`` replaced by its numeric mapping.
        int_to_vocab: index -> word, most frequent word first.
        vocab_to_int: word -> index (inverse of ``int_to_vocab``).
        encoded_tweets: one list of word indices per tweet.
        padded_tweets: 2-D integer array, right-padded with 0 to the longest tweet.
    """

    def __init__(self, csv_file, max_vocab_size=None):
        """Read ``csv_file`` with ``pandas.read_csv`` (tab-delimited) and encode it.

        Args:
            csv_file: location of the TSV file, passed to ``pandas.read_csv``.
            max_vocab_size: keep only this many most frequent words;
                ``None`` keeps the whole vocabulary.
        """
        self.df = pd.read_csv(csv_file, delimiter='\t')
        self.df['Class'] = self.df['Class'].map({'P': 1, 'N': 0})
        self.tokenize_and_pad(max_vocab_size)

    def tokenize_and_pad(self, max_vocab_size):
        """Build the vocabulary and the encoded / padded tweet representations.

        Tweets are split on whitespace. Words are ranked by descending
        frequency (ties keep first-seen order) and numbered from 0.

        NOTE: index 0 is shared by the most frequent word, by words cut off by
        ``max_vocab_size`` and by padding. This is kept as-is so that existing
        word ids stay valid for callers.

        Args:
            max_vocab_size: number of most frequent words to keep, or ``None``.
        """
        # A missing tweet would break str.join / str.split; treat it as empty text.
        tweets = self.df['Tweet'].fillna('').astype(str)

        word_counts = Counter(' '.join(tweets).split())
        sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
        if max_vocab_size is not None:
            sorted_vocab = sorted_vocab[:max_vocab_size]
        self.int_to_vocab = dict(enumerate(sorted_vocab))
        self.vocab_to_int = {w: k for k, w in self.int_to_vocab.items()}

        self.encoded_tweets = [
            [self.vocab_to_int.get(word, 0) for word in tweet.split()]
            for tweet in tweets
        ]

        # default=0 keeps an empty corpus from raising on max().
        max_len = max((len(tweet) for tweet in self.encoded_tweets), default=0)
        padded = [tweet + [0] * (max_len - len(tweet)) for tweet in self.encoded_tweets]
        # The reshape guarantees a 2-D result even when there are no rows or no tokens.
        self.padded_tweets = np.array(padded, dtype=int).reshape(len(padded), max_len)

    def get_data(self):
        """Return ``(padded_tweets, Class column, vocab_to_int)``."""
        return self.padded_tweets, self.df['Class'], self.vocab_to_int

    def print_samples(self, num_samples=5):
        """Print up to ``num_samples`` randomly chosen tweets with their labels.

        Rows whose class is neither positive nor negative (NaN after the
        mapping done in ``__init__``) are reported as ``Unknown`` instead of
        being mislabelled as negative.

        Args:
            num_samples: number of rows to show; capped at the dataset size
                because rows are drawn without replacement.
        """
        print("Random samples from the dataset:")
        sample_count = min(num_samples, len(self.df))
        if sample_count <= 0:
            return
        samples_indices = np.random.choice(len(self.df), sample_count, replace=False)
        for idx in samples_indices:
            # The drawn indices are positions, so use iloc rather than label lookup.
            tweet = self.df['Tweet'].iloc[idx]
            label = self.df['Class'].iloc[idx]
            if pd.isna(label):
                sentiment = 'Unknown'
            elif label == 1:
                sentiment = 'Positive'
            else:
                sentiment = 'Negative'
            print(f"Tweet: {tweet} | Label: {sentiment}")

    @staticmethod
    def _drop_unlabelled(features, labels):
        """Return ``(features, labels)`` restricted to rows with a non-NaN label, labels cast to int."""
        labelled = np.asarray(pd.notna(labels))
        return features[labelled], labels[labelled].astype(int)

    def preprocess_data(self, test_size=0.25, random_state=42):
        """Split into train/test sets, drop unlabelled rows and one-hot encode labels.

        The split is done first and rows with a NaN class are removed from each
        part afterwards.

        Args:
            test_size: forwarded to ``train_test_split``.
            random_state: forwarded to ``train_test_split``.

        Returns:
            ``(X_train, X_test, y_train, y_test)`` where the ``y`` values are
            the output of ``to_categorical(..., num_classes=2)``.
        """
        X, y = self.padded_tweets, self.df['Class']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

        X_train_clean, y_train_clean = self._drop_unlabelled(X_train, y_train)
        X_test_clean, y_test_clean = self._drop_unlabelled(X_test, y_test)

        y_train_clean = to_categorical(y_train_clean, num_classes=2)
        y_test_clean = to_categorical(y_test_clean, num_classes=2)

        return X_train_clean, X_test_clean, y_train_clean, y_test_clean

In [None]:
URL = 'https://raw.githubusercontent.com/MuhammadYaseenKhan/Urdu-Sentiment-Corpus/master/urdu-sentiment-corpus-v1.tsv'

# Fetch the corpus and keep a local tab-separated copy for the cells that follow.
pd.read_csv(URL, delimiter='\t').to_csv('urdu-sentiment-corpus-v1.tsv', sep='\t', index=False)

# Build the encoded dataset from the local copy and show a few random rows.
dataset = UrduSentimentDataset('urdu-sentiment-corpus-v1.tsv')
dataset.print_samples(num_samples=5)

Random samples from the dataset:
Tweet:  بھائ نہ جھک مارو نہ حق بس تمیز سے زندگی گزارو | Label: Positive
Tweet: نئے بجلی میٹرز کے حوالے سے خبریں بے بنیاد : وزیر اعظم معائنہ کمیشن  | Label: Negative
Tweet: سیکیورٹی خدشہ،عمران خان کے کنٹینر کے پیچھے 2 کنٹینر رکھ دیئے  | Label: Positive
Tweet:  خدانخواسته عمران خان تیرے گھر گھس گیا تو کیا هو گا اس کی تصویر بھی لگا دو | Label: Negative
Tweet: دوسرے اضلاع سے آنے والے ٹریفک وارڈنز اور افسران اپنے وائرلیس سیٹ اور گاڑیاں بھی ساتھ لائیں گے ۔  | Label: Positive


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import pandas as pd

# Read the local copy of the Urdu corpus.
dataset = pd.read_csv('urdu-sentiment-corpus-v1.tsv', delimiter='\t')

# Split every tweet into tokens.
tokenized_tweets = list(map(word_tokenize, dataset['Tweet']))

# Fit Word2Vec on the tokenized tweets; embedding_dim sets the vector size
# and is reused by later cells.
embedding_dim = 100
word2vec_model = Word2Vec(
    sentences=tokenized_tweets,
    vector_size=embedding_dim,
    window=5,
    min_count=1,
    workers=4,
)

# Persist the trained model.
word2vec_model.save("word2vec_urdu_embeddings.model")


In [None]:
from gensim.models import Word2Vec

# Restore the Word2Vec model that was saved to disk.
word2vec_model = Word2Vec.load("word2vec_urdu_embeddings.model")

# Look up the embedding of a single word (swap in any word of interest).
keyed_vectors = word2vec_model.wv
word_vector = keyed_vectors['سلام']

In [None]:
# NOTE(review): in the recorded output below, building the glove_python wheel fails
# ("python setup.py bdist_wheel did not run successfully"), and the later
# `from glove import Corpus, Glove` cell reports ModuleNotFoundError.
!pip install --upgrade --force-reinstall glove_python


Collecting glove_python
  Using cached glove_python-0.1.0.tar.gz (263 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting numpy (from glove_python)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy (from glove_python)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.4/38.4 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: glove_python
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem

In [None]:
# Install numpy and scipy, the two packages pip collected as glove_python requirements
# in the recorded output of the previous install cell.
!pip install numpy scipy




In [None]:
from nltk.tokenize import word_tokenize
import pandas as pd

# Read the local copy of the Urdu corpus.
dataset = pd.read_csv('urdu-sentiment-corpus-v1.tsv', delimiter='\t')

# Split every tweet into tokens.
tokenized_tweets = list(map(word_tokenize, dataset['Tweet']))

# Write one tweet per line, tokens separated by single spaces.
with open('tokenized_tweets.txt', 'w', encoding='utf-8') as f:
    f.writelines(' '.join(tokens) + '\n' for tokens in tokenized_tweets)


In [None]:
# NOTE(review): the recorded output of this cell is
# "ModuleNotFoundError: No module named 'glove'", so the statements below did not run
# in the saved notebook.
from glove import Corpus, Glove

# Train GloVe embeddings.
# Uses `tokenized_tweets` and `embedding_dim` defined in earlier cells.
corpus = Corpus()
corpus.fit(tokenized_tweets, window=5)
glove_model = Glove(no_components=embedding_dim, learning_rate=0.05)
glove_model.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove_model.add_dictionary(corpus.dictionary)

# Save the trained embeddings to disk
glove_model.save("glove_urdu_embeddings.model")


ModuleNotFoundError: No module named 'glove'

In [None]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199772 sha256=cecd59aa62eb7ff38e174f5a3fa9fa02e717fa9ed781ca88c1da3d508bcaf84c
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:
import fasttext

# Train FastText skip-gram embeddings on the tokenized tweet text written earlier
# ('tokenized_tweets.txt', one tweet per line) instead of the raw TSV file: the TSV also
# contains the header row and the Class column, which would be learned as ordinary words.
# `embedding_dim` comes from the Word2Vec cell.
model = fasttext.train_unsupervised('tokenized_tweets.txt', model='skipgram', dim=embedding_dim)

# Save the trained embeddings to disk
model.save_model("fasttext_urdu_embeddings.bin")


In [None]:
# Save tokenized tweets to a text file, one tweet per line.
# The encoding is set explicitly: the tweets are Urdu text, and relying on the platform
# default encoding can fail or produce a file the trainer misreads on non-UTF-8 systems.
with open("urdu_tweets.txt", "w", encoding="utf-8") as f:
    for tweet in tokenized_tweets:
        f.write(" ".join(tweet) + "\n")

# Train FastText skip-gram embeddings on that file; `embedding_dim` comes from the Word2Vec cell.
import fasttext
fasttext_model = fasttext.train_unsupervised("urdu_tweets.txt", model='skipgram', dim=embedding_dim)

# Save the trained embeddings to disk
fasttext_model.save_model("fasttext_urdu_embeddings.bin")


In [None]:
import fasttext

# Reload the FastText model from the saved binary file.
fasttext_model = fasttext.load_model("fasttext_urdu_embeddings.bin")

# Query the embedding of a single Urdu word (change as needed).
word = "سلام"
word_vector = fasttext_model.get_word_vector(word)

# Show the word followed by its vector, each on its own line.
print(f"Word: {word}", f"Vector: {word_vector}", sep="\n")



Word: سلام
Vector: [ 0.08545976  0.03895957  0.05232018 -0.01832284  0.21562408  0.07658491
 -0.16425034  0.02151671  0.12563014  0.24210063 -0.12327452 -0.04418094
 -0.03638594  0.05983236  0.05899256  0.07088875  0.05895574  0.07567459
 -0.01947038 -0.11628506  0.00513881 -0.02856266  0.0447617  -0.07029486
 -0.02756987 -0.07749857  0.04043348 -0.122174   -0.09659746  0.26377687
 -0.06651026 -0.05298331 -0.18661416 -0.00845781 -0.23886631 -0.07387757
  0.05268278 -0.16007702 -0.2942576   0.00972584 -0.05140448  0.02618704
 -0.04078184  0.07400005  0.19353682 -0.15319799  0.00567065 -0.06872339
 -0.07694431  0.11209474 -0.16405527 -0.00406146 -0.22993062  0.15368865
 -0.02056284 -0.12981896  0.03362524  0.05526285  0.05316765  0.11128714
  0.04779462 -0.06031324  0.26931387  0.11814883 -0.04205162  0.06585701
  0.03494747  0.19793032  0.10714965 -0.00047629 -0.28482154  0.13684209
  0.06011401  0.10452374 -0.08700595 -0.05810168  0.13502426  0.07547034
  0.23994885 -0.01147719  0.0731