In [1]:
import os
import pandas as pd

# Base path to the datasets. Set the URDU_NEWS_DATASET_DIR environment variable
# to run the notebook on another machine; the original location is the fallback.
path = os.environ.get("URDU_NEWS_DATASET_DIR", r"C:\Users\Hp\Desktop\Urdu NEWS dataset")

# File extensions stripped from an entry name to obtain the headline text.
# Compared case-insensitively, so ".DOC" / ".Docx" are handled as well.
DOC_EXTENSIONS = (".doc", ".docx")


def collect_headlines(base_path,
                      datasets=('bbc dataset', 'voa dataset'),
                      categories=('entertainment', 'miscellaneous', 'politics', 'sports')):
    """Collect one record per entry found under ``base_path/<dataset>/<category>``.

    Parameters
    ----------
    base_path : str
        Directory that contains one sub-directory per dataset.
    datasets : iterable of str
        Names of the dataset sub-directories to scan.
    categories : iterable of str
        Names of the category sub-directories expected inside each dataset.

    Returns
    -------
    list of dict
        Records with the keys ``'Dataset'``, ``'Category'`` and ``'HeadLines'``.
        ``'HeadLines'`` is the entry name with a trailing ``.doc``/``.docx``
        extension removed; any other entry name is kept unchanged.

    Notes
    -----
    A directory that is missing or cannot be read is reported with ``print``
    and skipped, so one absent category does not abort the whole scan.
    """
    records = []
    for dataset in datasets:
        for category in categories:
            dir_path = os.path.join(base_path, dataset, category)
            try:
                # os.listdir returns entries in arbitrary order; sorting keeps the
                # row positions of the resulting DataFrame stable between runs.
                entries = sorted(os.listdir(dir_path))
            except FileNotFoundError:
                print(f"Directory not found: {dir_path}")
                continue
            except OSError as e:
                print(f"Error accessing {dir_path}: {e}")
                continue

            for item in entries:
                file_name, file_extension = os.path.splitext(item)
                if file_extension.lower() in DOC_EXTENSIONS:
                    item = file_name
                records.append({
                    'Dataset': dataset,
                    'Category': category,
                    'HeadLines': item
                })
    return records


# List of records, one per headline found on disk
data = collect_headlines(path)

# Create a DataFrame from the data. The columns are given explicitly so that the
# frame still has a 'HeadLines' column when no directory could be read.
df = pd.DataFrame(data, columns=['Dataset', 'Category', 'HeadLines'])

# Display the DataFrame
df


Directory not found: C:\Users\Hp\Desktop\Urdu NEWS dataset\bbc dataset\miscellaneous
Directory not found: C:\Users\Hp\Desktop\Urdu NEWS dataset\voa dataset\miscellaneous


Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا بھی خطرناک ہو سکتا ہے'
1,bbc dataset,entertainment,'شاہ رخ سے رومانس نہ کرنے کا ملال نہیں'
2,bbc dataset,entertainment,'مومنہ ایک سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا مس ورلڈ بنیں تو میں سکول میں تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون کی پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑی...
2093,voa dataset,sports,کرکٹ پاکستان اور افغانستان کو قریب لا سکتا ہے...
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز کی مسلسل دوسری کامیابی
2095,voa dataset,sports,یاسر شاہ کی عمدہ باؤلنگ، پاکستان دوسرے ٹیسٹ می...


In [46]:
# One-time setup (uncomment to run). Prefer the %pip magic so the package is
# installed into the environment of the running kernel.
# %pip install lughaatNLP

In [2]:
from LughaatNLP import NER_Urdu
from LughaatNLP import POS_urdu
from LughaatNLP import LughaatNLP
# Build the three LughaatNLP helper objects used by the cells below:
# general text processing, part-of-speech tagging and named-entity recognition.
urdu_text_processing, pos_tagger, ner_urdu = (
    LughaatNLP(),
    POS_urdu(),
    NER_Urdu(),
)





### `normalize(text)`
Performs all-in-one normalization of Urdu text: character normalization, diacritic removal, punctuation handling, digit conversion, and special-character preservation.

In [3]:
# Sample sentence that contains Urdu punctuation and Western digits.
text = "آپ کیسے ہیں؟ میں 23 سال کا ہوں۔"
# Run the all-in-one normalizer on the sample and print the result.
normalized_text = urdu_text_processing.normalize(text)
print(normalized_text)

اپ کیسے ہیں ؟ میں ۲۳ سال کا ہوں ۔


In [49]:
# Normalize every headline, then write the cleaned values back to the same column.
normalized_headlines = df['HeadLines'].apply(urdu_text_processing.normalize)
df['HeadLines'] = normalized_headlines
df

Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا بھی خطرناک ہو سکتا ہے'
1,bbc dataset,entertainment,'شاہ رخ سے رومانس نہ کرنے کا ملال نہیں'
2,bbc dataset,entertainment,'مومنہ ایک سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا مس ورلڈ بنیں تو میں سکول میں تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون کی پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑی...
2093,voa dataset,sports,کرکٹ پاکستان اور افغانستان کو قریب لا سکتا ہے ...
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز کی مسلسل دوسری کامیابی
2095,voa dataset,sports,یاسر شاہ کی عمدہ باؤلنگ، پاکستان دوسرے ٹیسٹ می...


## Stop Word Removal
Stop words are common words in a language (such as “کہ”, “کیا”, “اور”, “لیکن”, “بھی”) that are often filtered out during text processing or analysis because they carry little meaning for tasks such as search or natural-language understanding in Urdu.

The `remove_stopwords` function removes stop words from Urdu text.

In [50]:
# Sample sentence used to demonstrate stop-word removal.
text = "میں اس کتاب کو پڑھنا چاہتا ہوں۔"
# Strip the stop words from the sample and print what remains.
filtered_text = urdu_text_processing.remove_stopwords(text)
print(filtered_text)

کتاب پڑھنا چاہتا ۔


In [51]:
# Stop-word removal is left disabled for the headline column; uncomment the
# next line to apply it to every row.
# df['HeadLines'] = df['HeadLines'].apply(urdu_text_processing.remove_stopwords)
df

Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا بھی خطرناک ہو سکتا ہے'
1,bbc dataset,entertainment,'شاہ رخ سے رومانس نہ کرنے کا ملال نہیں'
2,bbc dataset,entertainment,'مومنہ ایک سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا مس ورلڈ بنیں تو میں سکول میں تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون کی پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑی...
2093,voa dataset,sports,کرکٹ پاکستان اور افغانستان کو قریب لا سکتا ہے ...
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز کی مسلسل دوسری کامیابی
2095,voa dataset,sports,یاسر شاہ کی عمدہ باؤلنگ، پاکستان دوسرے ٹیسٹ می...


In [52]:
# Look up a single headline by row position; the cells below reuse this
# headline text as their sample sentence.
df['HeadLines'].iloc[2092]

'پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑیوں سے پوچھ گچھ'

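## Part of Speech
The `pos_tags_urdu` function performs part-of-speech tagging on Urdu text. It takes an Urdu sentence as input and returns a list of dictionaries, each pairing a word with its assigned part-of-speech tag, such as noun (NN), verb (VB), or adjective (ADJ).

In short, the function returns each word together with its corresponding part-of-speech tag. Note that the cell immediately below runs named-entity recognition (`ner_tags_urdu`), which returns a dictionary mapping each word to its entity tag; the `pos_tags_urdu` example appears further down.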

In [3]:
# Sample headline to tag.
sentence = 'پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑیوں سے پوچھ گچھ'
# Run the named-entity tagger on the headline and display the result
# (a mapping from each word to its entity tag).
word_tag_dict = ner_urdu.ner_tags_urdu(sentence)
word_tag_dict




{'پی': 'B-ORGANIZATION',
 'ایس': 'I-ORGANIZATION',
 'ایل': 'L-ORGANIZATION',
 'اینٹی': 'O',
 'کرپشن': 'O',
 'یونٹ': 'O',
 'کی': 'O',
 'مزید': 'O',
 'تین': 'U-NUMBER',
 'کھلاڑیوں': 'O',
 'سے': 'O',
 'پوچھ': 'O',
 'گچھ': 'O'}

In [8]:
# Percentage passed as the second argument of get_similar_words_percentage;
# presumably the minimum similarity a candidate must reach (verify against the library docs).
SIMILARITY_THRESHOLD = 70

spell_checker = LughaatNLP()
sentence = 'پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑیوں سے پوچھ گچھ'

# Split the headline into tokens and show them.
tokens = urdu_text_processing.urdu_tokenize(sentence)
print("Tokenization for Urdu language:", tokens)

# Flat list of every similar word found, in token order.
similar_words = []
for token in tokens:
    # Each result is a list of (word, percentage) pairs for the current token.
    similar_words_with_percentage = spell_checker.get_similar_words_percentage(token, SIMILARITY_THRESHOLD)
    print("This will return the most similar words in list with percentage", similar_words_with_percentage)

    # Keep only the word of each pair. A distinct loop name is used so the
    # outer `token` variable is not overwritten.
    similar_words.extend(match[0] for match in similar_words_with_percentage)

# `l` stays bound to the same list for any cell that still reads the short name.
l = similar_words
print(l)

Tokenization for Urdu language: ['پی', 'ایس', 'ایل', 'اینٹی', 'کرپشن', 'یونٹ', 'کی', 'مزید', 'تین', 'کھلاڑیوں', 'سے', 'پوچھ', 'گچھ']
This will return the most similar words in list with percentage [('پی', 100.0)]
This will return the most similar words in list with percentage [('ایس', 100.0), ('ایسا', 75.0), ('ایسے', 75.0), ('ایسی', 75.0), ('ایکس', 75.0), ('ایسڈ', 75.0), ('ایپس', 75.0), ('ایلس', 75.0), ('ایسٹ', 75.0), ('انیس', 75.0), ('اسیس', 75.0), ('ایمس', 75.0), ('ایٹس', 75.0), ('دایس', 75.0), ('اکیس', 75.0), ('ایرس', 75.0), ('ایسز', 75.0)]
This will return the most similar words in list with percentage [('ایل', 100.0), ('اپیل', 75.0), ('ایلن', 75.0), ('ایپل', 75.0), ('انیل', 75.0), ('ایلس', 75.0), ('ایبل', 75.0), ('ایلی', 75.0), ('ایگل', 75.0), ('ایفل', 75.0), ('ایول', 75.0), ('دایل', 75.0), ('ایمل', 75.0), ('ایڈل', 75.0), ('ایلک', 75.0), ('ایلٹ', 75.0)]
This will return the most similar words in list with percentage [('اینٹی', 100.0), ('اینٹیں', 83.33333333333334), ('ایٹی', 80.0),

In [55]:
from LughaatNLP import POS_urdu

# Tagger instance used by this cell.
pos_tagger = POS_urdu()

# `sentence` holds exactly the text that is tagged, so the variable and the
# displayed result always refer to the same headline.
sentence = 'پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑیوں سے پوچھ گچھ'
# The result is a list of {'Word': ..., 'POS_Tag': ...} dictionaries, one per word.
predicted_pos_tags = pos_tagger.pos_tags_urdu(sentence)
predicted_pos_tags




[{'Word': 'پی', 'POS_Tag': 'PN'},
 {'Word': 'ایس', 'POS_Tag': 'PN'},
 {'Word': 'ایل', 'POS_Tag': 'PN'},
 {'Word': 'اینٹی', 'POS_Tag': 'PN'},
 {'Word': 'کرپشن', 'POS_Tag': 'PN'},
 {'Word': 'یونٹ', 'POS_Tag': 'NN'},
 {'Word': 'کی', 'POS_Tag': 'P'},
 {'Word': 'مزید', 'POS_Tag': 'ADJ'},
 {'Word': 'تین', 'POS_Tag': 'CA'},
 {'Word': 'کھلاڑیوں', 'POS_Tag': 'NN'},
 {'Word': 'سے', 'POS_Tag': 'SE'},
 {'Word': 'پوچھ', 'POS_Tag': 'VB'},
 {'Word': 'گچھ', 'POS_Tag': 'PN'}]

In [56]:
# Initialize the TfidfVectorizer

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer created with its default settings (no custom tokenizer or
# stop-word list is passed).
# NOTE(review): confirm that the default tokenization suits Urdu headlines.
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the text data: learn the vocabulary from the headline
# column and compute the TF-IDF matrix for those same headlines.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['HeadLines'])


In [57]:
# Get feature names
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

# Display the DataFrame
tfidf_df


Unnamed: 0,deep,negative,positive,state,words,أفغان,أفغانستان,أوباما,ئل,ئٹہ,...,۷۶,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
