# Path To file

In [26]:
import pandas as pd
import re


file_path = 'urdu_sarcastic_dataset.csv'
df = pd.read_csv(file_path)
df.head()


Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,لینے میری شادی فسادن کوجی نہیں چاہیے,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...,0.0,,,,,,
3,مراد علی شاہ بھیس میں ڈی جی ایس حامد میر,1.0,,,,,,
4,قابل اعتبار قاتل اعتبار,1.0,,,,,,


# Phase 1: Text Preprocessing for Urdu Text
1. Stopword Removal:
o Develop a custom list of Urdu stopwords (e.g., &quot;اور&quot;, &quot;یہ&quot;, &quot;کہ&quot;). Write a function
to remove these stopwords from your dataset of social media posts.
o Challenges to Address: Handle words that are often considered stopwords but
may carry sentiment (e.g., &quot;نہیں&quot; (no), &quot;برا&quot; (bad)).

In [27]:
first_column = df.columns[0] 


stopwords_file_path = 'stopwords-ur.txt' 
with open(stopwords_file_path, 'r', encoding='utf-8') as f:
    urdu_stopwords = f.read().splitlines()

#function to remove stop words
def remove_stopwords(text, stopwords):
    # Tokenize the text by splitting it into words
    words = re.findall(r'\w+', text)
    words = [word for word in words if word not in stopwords]
    return ' '.join(words)

df[first_column] = df[first_column].apply(lambda x: remove_stopwords(str(x), urdu_stopwords))

print("\nCleaned Dataset (first 5 rows):")
print(df.head())

df.to_csv(file_path, index=False)



KeyboardInterrupt: 

# 2. Punctuation, Emojis, and Hashtags:
o Remove unnecessary punctuation, emojis, URLs, and hashtags that don’t
contribute to sentiment.

In [13]:
from IPython.display import display  

first_column = df.columns[0] 

# Function to clean text (remove punctuation, emojis, URLs, and hashtags)
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove emojis (non-word characters)
    text = re.sub(r'[^\w\s]', '', text)  
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    return text

df[first_column] = df[first_column].apply(lambda x: clean_text(str(x)))  # Clean text by removing emojis, punctuation, URLs, and hashtags

print("\nCleaned Dataset (first 5 rows):")
display(df.head()) 

df.to_csv(file_path, index=False)




Cleaned Dataset (first 5 rows):


Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,لینے میری شادی فسادن کوجی نہیں چاہیے,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...,0.0,,,,,,
3,مراد علی شاہ بھیس میں ڈی جی ایس حامد میر,1.0,,,,,,
4,قابل اعتبار قاتل اعتبار,1.0,,,,,,


# 3. Short Conversations:
o Write a rule-based function to filter out very short posts or those with less than
three words, as they may not carry sufficient sentiment.

In [14]:
def filter_short_posts(text):
    words = text.split()  
    if len(words) < 3:
        return None  
    return text  

df[first_column] = df[first_column].apply(lambda x: filter_short_posts(str(x)))

df = df.dropna(subset=[first_column])

print("\nFiltered Dataset (first 5 rows):")
display(df.head())  

df.to_csv(file_path, index=False)



Filtered Dataset (first 5 rows):


Unnamed: 0,urdu_text,is_sarcastic,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,لینے میری شادی فسادن کوجی نہیں چاہیے,1.0,,,,,,
1,چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...,1.0,,,,,,
2,کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...,0.0,,,,,,
3,مراد علی شاہ بھیس میں ڈی جی ایس حامد میر,1.0,,,,,,
4,قابل اعتبار قاتل اعتبار,1.0,,,,,,


# Phase 2: Stemming and Lemmatization for Urdu Text
1. Stemming:
o Implement or utilize a stemming algorithm for Urdu. The algorithm should reduce
word variants to their base form (e.g., &quot;اچھا&quot;, &quot;اچھی&quot;, &quot;اچھے&quot; → &quot;اچھا&quot;).
o Challenges to Address: Handling word inflections due to gender and plurality in
Urdu.

In [15]:
first_column = df.columns[0]  

urdu_suffixes = ['ا', 'ی', 'ے', 'وں', 'یں', 'ہ']

def urdu_stemmer(word):
    for suffix in urdu_suffixes:
        if word.endswith(suffix):
            return word[:-len(suffix)]  
    return word  

def apply_stemming(text):
    words = text.split()  
    stemmed_words = [urdu_stemmer(word) for word in words]  
    return ' '.join(stemmed_words) 

df[first_column] = df[first_column].apply(lambda x: apply_stemming(str(x)))

print("\nStemmed Dataset (first 5 rows):")
print(df.head())

output_file_path = 'stem_urdu.csv'
df.to_csv(output_file_path, index=False, encoding='utf-8')




Stemmed Dataset (first 5 rows):
                                           urdu_text  is_sarcastic  \
0                      لین میر شاد فسادن کوج نہ چاہی           1.0   
1         چل مہمان م کھان سرو کر چڑیل چاچ ن دسد آں م           1.0   
2  کامران خان آپک دن بھری زم دار لگائ اپوزیشن کرد...           0.0   
3                 مراد عل شا بھیس م ڈ ج ایس حامد میر           1.0   
4                            قابل اعتبار قاتل اعتبار           1.0   

   Unnamed: 2  Unnamed: 3  Unnamed: 4  Unnamed: 5 Unnamed: 6  Unnamed: 7  
0         NaN         NaN         NaN         NaN        NaN         NaN  
1         NaN         NaN         NaN         NaN        NaN         NaN  
2         NaN         NaN         NaN         NaN        NaN         NaN  
3         NaN         NaN         NaN         NaN        NaN         NaN  
4         NaN         NaN         NaN         NaN        NaN         NaN  


# 2. Lemmatization:
o Implement lemmatization for Urdu, which requires using dictionaries or rules to
return words to their dictionary form.
o Expected Output: For example, &quot;چل رہی&quot; (is moving) should be reduced to &quot;چل&quot;
(move).

In [16]:
file_path = 'urdu_sarcastic_dataset.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

urdu_lemmas = {
    # Movement and Action Verbs
    "چل رہی": "چل", "چل رہے": "چل", "چلا": "چل", "چلیں": "چل", "چلی": "چل", "چلایا": "چلا", "چلاتے": "چلا",
    "کرتے": "کر", "کررہا": "کر", "کررہی": "کر", "کرچکے": "کر", "کرچکی": "کر", "کرتی": "کر", "کریں": "کر", "کیا": "کر",
    "آئے": "آ", "آرہا": "آ", "آرہی": "آ", "آچکا": "آ", "آچکی": "آ", "آئیں": "آ", "آیا": "آ",
    "گیا": "جا", "جاتا": "جا", "جارہا": "جا", "جارہی": "جا", "جائیں": "جا", "جاچکا": "جا", "جاچکی": "جا",
    "پڑھتے": "پڑھ", "پڑھتی": "پڑھ", "پڑھ رہا": "پڑھ", "پڑھ رہی": "پڑھ", "پڑھ چکے": "پڑھ", "پڑھ چکی": "پڑھ", "پڑھا": "پڑھ",
    "دیتے": "دو", "دیتی": "دو", "دے": "دو", "دے رہا": "دو", "دے رہی": "دو", "دے چکا": "دو", "دے چکی": "دو",
    "لیتے": "لے", "لیتی": "لے", "لے رہا": "لے", "لے رہی": "لے", "لے چکا": "لے", "لے چکی": "لے", "لیا": "لے",
    "کہا": "کہ", "کہتے": "کہ", "کہتی": "کہ", "کہ رہا": "کہ", "کہ رہی": "کہ", "کہ چکے": "کہ", "کہ چکی": "کہ", "کہا": "کہ",
    "دیکھا": "دیکھ", "دیکھتے": "دیکھ", "دیکھ رہی": "دیکھ", "دیکھ رہا": "دیکھ", "دیکھ چکا": "دیکھ", "دیکھ چکی": "دیکھ",
    "بنایا": "بنا", "بناتے": "بنا", "بناتی": "بنا", "بنا رہا": "بنا", "بنا رہی": "بنا", "بنا چکا": "بنا", "بنا چکی": "بنا",
    "سنا": "سن", "سنتے": "سن", "سنتی": "سن", "سن رہا": "سن", "سن رہی": "سن", "سن چکا": "سن", "سن چکی": "سن",
    "بول": "بول", "بولتا": "بول", "بولتی": "بول", "بول رہے": "بول", "بول رہی": "بول", "بول چکا": "بول", "بول چکی": "بول",
    "کھایا": "کھا", "کھاتے": "کھا", "کھاتی": "کھا", "کھا رہا": "کھا", "کھا رہی": "کھا", "کھا چکا": "کھا", "کھا چکی": "کھا",
    "لکھا": "لکھ", "لکھتے": "لکھ", "لکھتی": "لکھ", "لکھ رہا": "لکھ", "لکھ رہی": "لکھ", "لکھ چکا": "لکھ", "لکھ چکی": "لکھ",
    "سویا": "سو", "سوتے": "سو", "سوتی": "سو", "سو رہا": "سو", "سو رہی": "سو", "سو چکا": "سو", "سو چکی": "سو",
    "بیٹھا": "بیٹھ", "بیٹھتے": "بیٹھ", "بیٹھ رہی": "بیٹھ", "بیٹھ رہا": "بیٹھ", "بیٹھ چکا": "بیٹھ", "بیٹھ چکی": "بیٹھ",
    "چکھا": "چکھ", "چکھتے": "چکھ", "چکھتی": "چکھ", "چکھ رہا": "چکھ", "چکھ رہی": "چکھ", "چکھ چکا": "چکھ", "چکھ چکی": "چکھ",
    "پکایا": "پکا", "پکاتے": "پکا", "پکاتی": "پکا", "پکا رہا": "پکا", "پکا رہی": "پکا", "پکا چکا": "پکا", "پکا چکی": "پکا",
    "گایا": "گا", "گاتے": "گا", "گاتی": "گا", "گا رہا": "گا", "گا رہی": "گا", "گا چکا": "گا", "گا چکی": "گا",
    # Social Interactions
    "ملا": "مل", "ملتی": "مل", "ملتے": "مل", "مل رہا": "مل", "مل رہی": "مل", "مل چکا": "مل", "مل چکی": "مل",
    "لگتا": "لگ", "لگ رہی": "لگ", "لگ رہا": "لگ", "لگتی": "لگ", "لگ چکا": "لگ", "لگ چکی": "لگ",
    "رکھا": "رکھ", "رکھتی": "رکھ", "رکھتے": "رکھ", "رکھ رہی": "رکھ", "رکھ رہا": "رکھ", "رکھ چکا": "رکھ", "رکھ چکی": "رکھ",
    # Communication
    "پوچھا": "پوچھ", "پوچھتے": "پوچھ", "پوچھتی": "پوچھ", "پوچھ رہی": "پوچھ", "پوچھ رہا": "پوچھ", "پوچھ چکی": "پوچھ", "پوچھ چکا": "پوچھ",
    "بتایا": "بتا", "بتاتے": "بتا", "بتاتی": "بتا", "بتا رہی": "بتا", "بتا رہا": "بتا", "بتا چکی": "بتا", "بتا چکا": "بتا",
    "دکھایا": "دکھا", "دکھاتے": "دکھا", "دکھاتی": "دکھا", "دکھا رہی": "دکھا", "دکھا رہا": "دکھا", "دکھا چکی": "دکھا", "دکھا چکا": "دکھا",
    # Emotions
    "رویا": "رو", "روتے": "رو", "روتی": "رو", "رو رہا": "رو", "رو رہی": "رو", "رو چکی": "رو", "رو چکا": "رو",
    "ہنسا": "ہنس", "ہنستے": "ہنس", "ہنستی": "ہنس", "ہنس رہا": "ہنس", "ہنس رہی": "ہنس", "ہنس چکا": "ہنس", "ہنس چکی": "ہنس",
    "چاہا": "چاہ", "چاہتے": "چاہ", "چاہتی": "چاہ", "چاہ رہی": "چاہ", "چاہ رہا": "چاہ", "چاہ چکی": "چاہ", "چاہ چکا": "چاہ",
    # Perception
    "سمجھا": "سمجھ", "سمجھتی": "سمجھ", "سمجھتے": "سمجھ", "سمجھ رہی": "سمجھ", "سمجھ رہا": "سمجھ", "سمجھ چکا": "سمجھ", "سمجھ چکی": "سمجھ",
    "سیکھا": "سیکھ", "سیکھتے": "سیکھ", "سیکھتی": "سیکھ", "سیکھ رہا": "سیکھ", "سیکھ رہی": "سیکھ", "سیکھ چکا": "سیکھ", "سیکھ چکی": "سیکھ",
}

# Function to perform dictionary-based lemmatization
def urdu_lemmatizer(word):
    # Check the dictionary for known lemmatizations
    return urdu_lemmas.get(word, word)  # Return the base form if found, else return the word as-is

# Apply lemmatization to each word

# Apply lemmatization to each word in the text
def apply_lemmatization(text):
    words = text.split()  # Split the text into words
    lemmatized_words = [urdu_lemmatizer(word) for word in words]  # Apply lemmatization to each word
    return ' '.join(lemmatized_words)  # Join the lemmatized words back into a sentence

# Apply the lemmatization function to the dataset
df[first_column] = df[first_column].apply(lambda x: apply_lemmatization(str(x)))

# Display the updated dataset (first 5 rows)
print("\nLemmatized Dataset (first 5 rows):")
print(df.head())

# Save the lemmatized dataset to a new CSV file
output_file_path = 'lemmatized_urdu.csv'
df.to_csv(output_file_path, index=False, encoding='utf-8')



Lemmatized Dataset (first 5 rows):
                                           urdu_text  is_sarcastic  \
0               لینے میری شادی فسادن کوجی نہیں چاہیے           1.0   
1  چل مہمانوں میں کھانا سرو کر چڑیل چاچی نوں دسدی...           1.0   
2  کامران خان آپکی دن بھریہ زمہ داری لگائی اپوزیش...           0.0   
3           مراد علی شاہ بھیس میں ڈی جی ایس حامد میر           1.0   
4                            قابل اعتبار قاتل اعتبار           1.0   

   Unnamed: 2  Unnamed: 3  Unnamed: 4  Unnamed: 5 Unnamed: 6  Unnamed: 7  
0         NaN         NaN         NaN         NaN        NaN         NaN  
1         NaN         NaN         NaN         NaN        NaN         NaN  
2         NaN         NaN         NaN         NaN        NaN         NaN  
3         NaN         NaN         NaN         NaN        NaN         NaN  
4         NaN         NaN         NaN         NaN        NaN         NaN  


# Phase 3: Feature Extraction from Urdu Text
1. Tokenization:
o Implement word tokenization for Urdu text, ensuring that the Urdu script is
properly segmented into words. You can use existing tokenizers or build your
own.
o Deliverable: Provide a tokenized version of several Urdu social media posts.

In [17]:
import pandas as pd


file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')


first_column = df.columns[0]


def urdu_tokenizer(text):
    tokens = text.split() 
    unique_tokens = list(set(tokens))  
    return unique_tokens

df['tokenized_text'] = df[first_column].apply(lambda x: urdu_tokenizer(str(x)))

print("\nTokenized Dataset of each sentence (first 5 rows):")
print(df[['tokenized_text']].head())

output_file_path = 'tokenized_urdu.csv'
df.to_csv(output_file_path, index=False, encoding='utf-8')




Tokenized Dataset of each sentence (first 5 rows):
                                      tokenized_text
0       [میری, فسادن, چاہیے, لینے, نہیں, شادی, کوجی]
1  [کر, میں, چل, چڑیل, نوں, چاچی, سرو, دسدی, آں, ...
2  [ہےہمیں, فالوکریں, نوٹ, دن, پربھونکناہےآپ, اپو...
3  [حامد, مراد, میں, جی, ایس, شاہ, بھیس, علی, ڈی,...
4                               [قاتل, قابل, اعتبار]


In [18]:
from collections import Counter

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

def urdu_tokenizer(text):
    tokens = text.split()  
    return tokens

all_tokens = []

for text in df[first_column]:
    tokens = urdu_tokenizer(str(text))
    all_tokens.extend(tokens)  

token_frequency = Counter(all_tokens)

sorted_token_frequency = sorted(token_frequency.items(), key=lambda x: x[1], reverse=True)

print("\nTokenized Corpus and Frequency:")
for token, freq in sorted_token_frequency:
    print(f"Token: {token}, Frequency: {freq}")



Tokenized Corpus and Frequency:
Token: میں, Frequency: 7323
Token: کو, Frequency: 5640
Token: سے, Frequency: 5471
Token: کر, Frequency: 5252
Token: کا, Frequency: 4966
Token: نہیں, Frequency: 4445
Token: بھی, Frequency: 3890
Token: نے, Frequency: 3711
Token: اس, Frequency: 2161
Token: آپ, Frequency: 2105
Token: جا, Frequency: 1975
Token: وہ, Frequency: 1821
Token: جو, Frequency: 1389
Token: تھا, Frequency: 1359
Token: نہ, Frequency: 1341
Token: اب, Frequency: 1270
Token: خان, Frequency: 1161
Token: ھے, Frequency: 1147
Token: ہم, Frequency: 1028
Token: گا, Frequency: 1007
Token: اللہ, Frequency: 1006
Token: بات, Frequency: 989
Token: جی, Frequency: 961
Token: سب, Frequency: 957
Token: صاحب, Frequency: 951
Token: تم, Frequency: 887
Token: ان, Frequency: 854
Token: کسی, Frequency: 850
Token: رہا, Frequency: 832
Token: پاکستان, Frequency: 822
Token: بہت, Frequency: 816
Token: سندھ, Frequency: 811
Token: کرنے, Frequency: 794
Token: ا, Frequency: 793
Token: دیا, Frequency: 790
Token: کچھ, F

# 2. Tf-IDF (Term Frequency-Inverse Document Frequency):
o Apply the Tf-IDF algorithm to extract relevant terms from the dataset. Identify
the most important terms contributing to sentiment in Urdu text.

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

df[first_column] = df[first_column].apply(lambda x: ' '.join(str(x).split()))  

vectorizer = TfidfVectorizer()

tfidf_matrix = vectorizer.fit_transform(df[first_column])

terms = vectorizer.get_feature_names_out()  
tfidf_scores = tfidf_matrix.toarray()  

average_tfidf_scores = tfidf_scores.mean(axis=0)

tfidf_dict = dict(zip(terms, average_tfidf_scores))

sorted_tfidf = sorted(tfidf_dict.items(), key=lambda x: x[1], reverse=True)

print("\nTop Terms Contributing to Sentiment (Based on TF-IDF):")
for term, score in sorted_tfidf[:10]:  
    print(f"Term: {term}, TF-IDF Score: {score}")



Top Terms Contributing to Sentiment (Based on TF-IDF):
Term: میں, TF-IDF Score: 0.032466776861929114
Term: کو, TF-IDF Score: 0.026662202116034036
Term: کر, TF-IDF Score: 0.026494329594945538
Term: سے, TF-IDF Score: 0.026292160665067003
Term: نہیں, TF-IDF Score: 0.025317272675427615
Term: کا, TF-IDF Score: 0.02416601795096775
Term: بھی, TF-IDF Score: 0.02256890342295227
Term: نے, TF-IDF Score: 0.02030662988915006
Term: آپ, TF-IDF Score: 0.015229593666332194
Term: اس, TF-IDF Score: 0.013776788556752777


# Word2Vec:
o Train a Word2Vec model on your dataset to capture the relationship between Urdu
words based on context.
o Deliverable: List the top 5 words most similar to the word &quot;اچھا&quot; (good) using the
trained model.

In [20]:
import gensim
from gensim.models import Word2Vec

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

tokenized_corpus = [str(text).split() for text in df[first_column]]

# Step 2: Train the Word2Vec model using the Skip-Gram approach (sg=1)
word2vec_model = Word2Vec(
    sentences=tokenized_corpus,  # Tokenized corpus
    vector_size=200,             # Increase the dimensionality of word vectors to capture more context
    window=4,                    # Context window size
    sg=1,                        # Skip-Gram (sg=1), use CBOW if sg=0
    min_count=5,                 # Exclude rare words (set a higher minimum count to remove noise)
    workers=4                    # Train for more epochs to allow the model to learn better
)

def find_similar_words(word, topn=5):
    try:
        similar_words = word2vec_model.wv.most_similar(word, topn=topn)
        print(f"\nTop {topn} words most similar to '{word}':")
        for similar_word, similarity in similar_words:
            print(f"Word: {similar_word}, Similarity: {similarity}")
    except KeyError:
        print(f"The word '{word}' is not in the vocabulary.")

# Example usage: Find the top 5 words similar to "اچھا"
find_similar_words("اچھا")



Top 5 words most similar to 'اچھا':
Word: شعر, Similarity: 0.8171318173408508
Word: بتانا, Similarity: 0.80801922082901
Word: پسند, Similarity: 0.8040664196014404
Word: حال, Similarity: 0.8029041290283203
Word: کتنا, Similarity: 0.8017340302467346


# Phase 4: N-grams Analysis
1. Unigram, Bigram, and Trigram Analysis:
o Create unigrams, bigrams, and trigrams from the dataset of Urdu text.
o Deliverable: List the top 10 most common bigrams and trigrams in the dataset,
along with their frequencies.
o Challenges to Address: Proper tokenization for Urdu to avoid breaking the
words incorrectly (due to right-to-left script).

In [21]:
#######################################    UNI-GRAM   #################################
from collections import Counter

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

all_unigrams = []

for text in df[first_column]:
    tokens = str(text).split()  
    all_unigrams.extend(tokens)  

unigram_frequency = Counter(all_unigrams)

sorted_unigrams = sorted(unigram_frequency.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 Most Frequent Unigrams:")
for unigram, freq in sorted_unigrams[:10]:
    print(f"Unigram: {unigram}, Frequency: {freq}")



Top 10 Most Frequent Unigrams:
Unigram: میں, Frequency: 7323
Unigram: کو, Frequency: 5640
Unigram: سے, Frequency: 5471
Unigram: کر, Frequency: 5252
Unigram: کا, Frequency: 4966
Unigram: نہیں, Frequency: 4445
Unigram: بھی, Frequency: 3890
Unigram: نے, Frequency: 3711
Unigram: اس, Frequency: 2161
Unigram: آپ, Frequency: 2105


In [23]:
###############################################   BI GRAMS   ################################


from collections import Counter
from nltk import ngrams  

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

all_bigrams = []

for text in df[first_column]:
    tokens = str(text).split()  # Tokenize the text into unigrams
    bigrams = list(ngrams(tokens, 2))  # Generate bigrams (n=2)
    all_bigrams.extend(bigrams)  # Add the bigrams to the list

bigram_frequency = Counter(all_bigrams)

sorted_bigrams = sorted(bigram_frequency.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 Most Frequent Bigrams:")
for bigram, freq in sorted_bigrams[:10]:
    print(f"Bigram: {bigram}, Frequency: {freq}")



Top 10 Most Frequent Bigrams:
Bigram: ('عمران', 'خان'), Frequency: 503
Bigram: ('نواز', 'شریف'), Frequency: 449
Bigram: ('t', 'co'), Frequency: 407
Bigram: ('میں', 'نے'), Frequency: 350
Bigram: ('آپ', 'کو'), Frequency: 314
Bigram: ('سندھ', 'پولیس'), Frequency: 296
Bigram: ('بھی', 'نہیں'), Frequency: 252
Bigram: ('آرمی', 'چیف'), Frequency: 224
Bigram: ('آپ', 'نے'), Frequency: 222
Bigram: ('میں', 'بھی'), Frequency: 213


In [24]:
##################################################  TRI GRAMS   ###########################################


from collections import Counter
from nltk import ngrams  

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]

all_trigrams = []

for text in df[first_column]:
    tokens = str(text).split()  # Tokenize the text into unigrams
    trigrams = list(ngrams(tokens, 3))  # Generate trigrams (n=3)
    all_trigrams.extend(trigrams)  # Add the trigrams to the list

trigram_frequency = Counter(all_trigrams)

sorted_trigrams = sorted(trigram_frequency.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 Most Frequent Trigrams:")
for trigram, freq in sorted_trigrams[:10]:
    print(f"Trigram: {trigram}, Frequency: {freq}")



Top 10 Most Frequent Trigrams:
Trigram: ('صلی', 'اللہ', 'علیہ'), Frequency: 90
Trigram: ('پی', 'ڈی', 'ایم'), Frequency: 88
Trigram: ('ووٹ', 'کو', 'عزت'), Frequency: 76
Trigram: ('علیہ', 'وآلہ', 'وسلم'), Frequency: 75
Trigram: ('اللہ', 'علیہ', 'وآلہ'), Frequency: 74
Trigram: ('جزاک', 'اللہ', 'خیر'), Frequency: 73
Trigram: ('عمران', 'خان', 'نے'), Frequency: 69
Trigram: ('والوں', 'کو', 'فالو'), Frequency: 68
Trigram: ('فالو', 'فالو', 'بیک'), Frequency: 67
Trigram: ('نے', 'جواب', 'دیا'), Frequency: 64


# Phase 5: Sentiment Classification Model
1. Model Building:
o Using the features extracted (from Tf-IDF or Word2Vec), build a machine
learning model (e.g., Logistic Regression, SVM, or Naive Bayes) to classify the
sentiment of the Urdu posts.
o Deliverable: Show the accuracy, precision, recall, and F1-score of your sentiment
classifier using a test set of Urdu text.

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

label_column = 'is_sarcastic'  
y = df[label_column] 

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

y_pred = naive_bayes_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("\nModel Performance Metrics (Naive Bayes - Sentiment Classification):")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred))



Model Performance Metrics (Naive Bayes - Sentiment Classification):
Accuracy: 0.77
Precision: 0.78
Recall: 0.77
F1-Score: 0.76

Classification Report:
              precision    recall  f1-score   support

         0.0       0.85      0.61      0.71      1777
         1.0       0.72      0.91      0.80      2006

    accuracy                           0.77      3783
   macro avg       0.79      0.76      0.76      3783
weighted avg       0.78      0.77      0.76      3783



# Phase 6: Evaluation &amp; Optimization
1. Evaluation:
o Evaluate the model&#39;s performance on a validation set of Urdu posts. Analyze
where the model performs well and where it struggles (e.g., understanding
complex sentences or detecting sarcasm).
o Deliverable: Present evaluation metrics and discuss areas where improvements
can be made.

In [43]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

file_path = 'lemmatized_urdu.csv'  
df = pd.read_csv(file_path, encoding='utf-8')

first_column = df.columns[0]  

def urdu_tokenizer(text):
    tokens = text.split() 
    unique_tokens = list(set(tokens))  
    return unique_tokens

df['tokenized_text'] = df[first_column].apply(lambda x: ' '.join(urdu_tokenizer(str(x))))

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['tokenized_text'])

unseen_posts = [
    "آپ واقعی بہت زبردست ہیں۔",  
    "یہ دن تو بہت ہی شاندار گزرا!",  
    "مجھے تمہاری مدد کی بالکل بھی ضرورت نہیں۔",  
    "میں نے زندگی میں اتنی اچھی فلم پہلے کبھی نہیں دیکھی!",  
    "یہ واقعی ایک بہت اچھا فیصلہ تھا۔"
]

unseen_tokenized = [' '.join(urdu_tokenizer(text)) for text in unseen_posts]

unseen_tfidf = vectorizer.transform(unseen_tokenized)

X_train = tfidf_matrix
y_train = df['is_sarcastic']  

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred_unseen = model.predict(unseen_tfidf)

true_labels = [0, 0, 0, 1, 0]  

# Calculate evaluation metrics
accuracy = accuracy_score(true_labels, y_pred_unseen)
precision = precision_score(true_labels, y_pred_unseen, average='weighted')
recall = recall_score(true_labels, y_pred_unseen, average='weighted')
f1 = f1_score(true_labels, y_pred_unseen, average='weighted')

# Print evaluation metrics
print("Classification Report:\n", classification_report(true_labels, y_pred_unseen))
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Classification Report:
               precision    recall  f1-score   support

           0       0.80      1.00      0.89         4
           1       0.00      0.00      0.00         1

    accuracy                           0.80         5
   macro avg       0.40      0.50      0.44         5
weighted avg       0.64      0.80      0.71         5

Accuracy: 0.8000
Precision: 0.6400
Recall: 0.8000
F1-Score: 0.7111


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
