# Urdu News Clustering Using Headlines 

In [2]:
import os
import pandas as pd

# Base path to the datasets
path = r"C:\Users\Hp\Desktop\Urdu NEWS dataset"

# List to store the data
data = []

# Iterate over the dataset categories
for dataset in ['bbc dataset', 'voa dataset']:
    # Iterate over the news categories
    for category in ['entertainment', 'miscellaneous', 'politics', 'sports']:
        # Construct the full directory path
        dir_path = os.path.join(path, dataset, category)
        # List files and directories in the current directory
        try:
            dir_list = os.listdir(dir_path)
            # Append each file/directory with its dataset and category
            for item in dir_list:
                # Remove the .doc extension
                file_name, file_extension = os.path.splitext(item)
                if file_extension in [".doc" , ".docx" ]:
                    item = file_name
                data.append({
                    'Dataset': dataset,
                    'Category': category,
                    'HeadLines': item
                })
        except FileNotFoundError:
            print(f"Directory not found: {dir_path}")
        except Exception as e:
            print(f"Error accessing {dir_path}: {e}")

# Create a DataFrame from the data
df = pd.DataFrame(data)

# Display the DataFrame

df


Directory not found: C:\Users\Hp\Desktop\Urdu NEWS dataset\bbc dataset\miscellaneous
Directory not found: C:\Users\Hp\Desktop\Urdu NEWS dataset\voa dataset\miscellaneous


Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا بھی خطرناک ہو سکتا ہے'
1,bbc dataset,entertainment,'شاہ رخ سے رومانس نہ کرنے کا ملال نہیں'
2,bbc dataset,entertainment,'مومنہ ایک سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا مس ورلڈ بنیں تو میں سکول میں تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون کی پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑی...
2093,voa dataset,sports,کرکٹ پاکستان اور افغانستان کو قریب لا سکتا ہے...
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز کی مسلسل دوسری کامیابی
2095,voa dataset,sports,یاسر شاہ کی عمدہ باؤلنگ، پاکستان دوسرے ٹیسٹ می...


In [3]:
df.to_csv('Urdu News Headlines dataset.csv')

In [97]:
#pip install lughaatNLP

## Import

In [98]:
# import pandas as pd
# df = pd.read_csv('/kaggle/input/urdu-news-dataset/Urdu NEWS dataset.csv')
# df

In [99]:
from LughaatNLP import NER_Urdu
from LughaatNLP import POS_urdu
from LughaatNLP import LughaatNLP
urdu_text_processing = LughaatNLP()
pos_tagger = POS_urdu()
ner_urdu = NER_Urdu()

### normalize(text): 
Performs all-in-one normalization on the Urdu text, including character normalization, diacritic removal, punctuation handling, digit conversion, and special character preservation.

In [100]:
text = "آپ کیسے ہیں؟ میں 23 سال کا ہوں۔"
normalized_text = urdu_text_processing.normalize(text)
print(normalized_text)

اپ کیسے ہیں ؟ میں ۲۳ سال کا ہوں ۔


In [101]:
df['HeadLines'] = df['HeadLines'].apply(urdu_text_processing.normalize)
df

Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا بھی خطرناک ہو سکتا ہے'
1,bbc dataset,entertainment,'شاہ رخ سے رومانس نہ کرنے کا ملال نہیں'
2,bbc dataset,entertainment,'مومنہ ایک سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا مس ورلڈ بنیں تو میں سکول میں تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون کی پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑی...
2093,voa dataset,sports,کرکٹ پاکستان اور افغانستان کو قریب لا سکتا ہے ...
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز کی مسلسل دوسری کامیابی
2095,voa dataset,sports,یاسر شاہ کی عمدہ باؤلنگ، پاکستان دوسرے ٹیسٹ می...


## Stop Words Removing
Stop words are common words in a language (such as “کہ”, “کیا”, “اور”, “لیکن”, “بھی”) that are often filtered out during text processing or analysis because they are considered irrelevant for tasks like searching or natural language understanding in Urdu language.

This function removes stopwords from the Urdu text.

In [102]:
text = "میں اس کتاب کو پڑھنا چاہتا ہوں۔"
filtered_text = urdu_text_processing.remove_stopwords(text)
print(filtered_text)

کتاب پڑھنا چاہتا ۔


In [103]:
df['HeadLines'] = df['HeadLines'].apply(urdu_text_processing.remove_stopwords)
df

Unnamed: 0,Dataset,Category,HeadLines
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہے'
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں'
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز'
3,bbc dataset,entertainment,'پرینکا ورلڈ بنیں سکول تھی'
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند'
...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیوں پوچھ گچھ
2093,voa dataset,sports,کرکٹ پاکستان افغانستان قریب لا افریدی
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی
2095,voa dataset,sports,یاسر شاہ عمدہ باؤلنگ، پاکستان ٹیسٹ فاتح


## Part of Speech
The pos_tags_urdu function is used for part-of-speech tagging in Urdu text. It takes an Urdu sentence as input and returns a list of dictionaries where each word is paired with its assigned part-of-speech tag, such as nouns (NN), verbs (VB), adjectives (ADJ), etc.

This Function will return dictionary words with their corresponding tags of Part of Speech

In [104]:
from LughaatNLP import POS_urdu

pos_tagger = POS_urdu()

sentence = "میرے والدین نے میری تعلیم اور تربیت میں بہت محنت کی تاکہ میں اپنی زندگی میں کامیاب ہو سکوں۔"
predicted_pos_tags = pos_tagger.pos_tags_urdu('پی ایس ایل اینٹی کرپشن یونٹ کی مزید تین کھلاڑیوں سے پوچھ گچھ')
predicted_pos_tags




[{'Word': 'پی', 'POS_Tag': 'PN'},
 {'Word': 'ایس', 'POS_Tag': 'PN'},
 {'Word': 'ایل', 'POS_Tag': 'PN'},
 {'Word': 'اینٹی', 'POS_Tag': 'PN'},
 {'Word': 'کرپشن', 'POS_Tag': 'PN'},
 {'Word': 'یونٹ', 'POS_Tag': 'NN'},
 {'Word': 'کی', 'POS_Tag': 'P'},
 {'Word': 'مزید', 'POS_Tag': 'ADJ'},
 {'Word': 'تین', 'POS_Tag': 'CA'},
 {'Word': 'کھلاڑیوں', 'POS_Tag': 'NN'},
 {'Word': 'سے', 'POS_Tag': 'SE'},
 {'Word': 'پوچھ', 'POS_Tag': 'VB'},
 {'Word': 'گچھ', 'POS_Tag': 'PN'}]

## Name Entity Relation
The ner_tags_urdu function performs named entity recognition on Urdu text, assigning named entity tags (such as U-LOCATION for locations) to identified entities in the input sentence. It outputs a dictionary where words are mapped to their corresponding named entity tags, facilitating tasks like information extraction and text analysis specific to Urdu language.

In [105]:
from LughaatNLP import NER_Urdu
# For Name Entity Relation
ner_urdu = NER_Urdu()

sentence = "اس کتاب میں پاکستان کی تاریخ بیان کی گئی ہے۔"
word_tag_dict = ner_urdu.ner_tags_urdu(sentence)
word_tag_dict




{'پی': 'B-ORGANIZATION',
 'ایل': 'L-ORGANIZATION',
 'اینٹی': 'O',
 'کرپشن': 'O',
 'یونٹ': 'O',
 'تین': 'U-NUMBER',
 'کھلاڑیوں': 'O',
 'پوچھ': 'O',
 'گچھ': 'O'}

## Finding Named Entities

In [108]:
import pandas as pd
from LughaatNLP import POS_urdu

pos_tagger = POS_urdu()

def find_named_entity_relations(pos_tags):
    named_entities = []

    # Step 1: Identify potential named entities
    for token in pos_tags:
        if token['POS_Tag'] in ['NN', 'NNP']:  # Assuming NNP for proper nouns if available
            named_entities.append(token['Word'])
    return named_entities

# Function to process each text and extract entities and relations
def process_text(text):
    pos_tags = pos_tagger.pos_tags_urdu(text)
    named_entities = find_named_entity_relations(pos_tags)
    return named_entities

# Apply the function to each text in the DataFrame
df['entities'] = df['HeadLines'].apply(lambda x: process_text(x))

# Display the DataFrame with new columns
df




Unnamed: 0,Dataset,Category,HeadLines,entities
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہے',[پانی]
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',"[رخ, ملال]"
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',[]
3,bbc dataset,entertainment,'پرینکا ورلڈ بنیں سکول تھی',[سکول]
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',"[گیلیکسی, نوٹ, پروڈکشن]"
...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیوں پوچھ گچھ,"[یونٹ, کھلاڑیوں]"
2093,voa dataset,sports,کرکٹ پاکستان افغانستان قریب لا افریدی,[]
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,[کامیابی]
2095,voa dataset,sports,یاسر شاہ عمدہ باؤلنگ، پاکستان ٹیسٹ فاتح,"[ٹیسٹ, فاتح]"


In [112]:
max_entities = df['entities'].apply(len).max()

# Create new columns for each entity
for i in range(max_entities):
    df[f'entity_{i+1}'] = df['entities'].apply(lambda x: x[i] if i < len(x) else None)

# Drop the original 'entities' column if no longer needed
df.drop(columns=['entities'], inplace=True)

# Display the DataFrame
df

Unnamed: 0,Dataset,Category,HeadLines,entity_1,entity_2,entity_3,entity_4,entity_5,entity_6,entity_7,entity_8,entity_9,entity_10
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہے',پانی,,,,,,,,,
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',رخ,ملال,,,,,,,,
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',,,,,,,,,,
3,bbc dataset,entertainment,'پرینکا ورلڈ بنیں سکول تھی',سکول,,,,,,,,,
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',گیلیکسی,نوٹ,پروڈکشن,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیوں پوچھ گچھ,یونٹ,کھلاڑیوں,,,,,,,,
2093,voa dataset,sports,کرکٹ پاکستان افغانستان قریب لا افریدی,,,,,,,,,,
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,کامیابی,,,,,,,,,
2095,voa dataset,sports,یاسر شاہ عمدہ باؤلنگ، پاکستان ٹیسٹ فاتح,ٹیسٹ,فاتح,,,,,,,,


## Finding Locations

In [114]:
def find_location(text):
    ner_tags = ner_urdu.ner_tags_urdu(text)  # Assuming this function exists
    location = None
    for word, tag in ner_tags.items():
        if tag == 'B-LOCATION' or tag == 'L-LOCATION' or tag == 'U-LOCATION':
            location = word
            break
    return location

# Apply the function to each text in the DataFrame
df['location'] = df['HeadLines'].apply(find_location)

# Display the DataFrame with the new 'location' column
df




Unnamed: 0,Dataset,Category,HeadLines,entity_1,entity_2,entity_3,entity_4,entity_5,entity_6,entity_7,entity_8,entity_9,entity_10,location
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہے',پانی,,,,,,,,,,
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',رخ,ملال,,,,,,,,,
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',,,,,,,,,,,
3,bbc dataset,entertainment,'پرینکا ورلڈ بنیں سکول تھی',سکول,,,,,,,,,,
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',گیلیکسی,نوٹ,پروڈکشن,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیوں پوچھ گچھ,یونٹ,کھلاڑیوں,,,,,,,,,
2093,voa dataset,sports,کرکٹ پاکستان افغانستان قریب لا افریدی,,,,,,,,,,,پاکستان
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,کامیابی,,,,,,,,,,کوئٹہ
2095,voa dataset,sports,یاسر شاہ عمدہ باؤلنگ، پاکستان ٹیسٹ فاتح,ٹیسٹ,فاتح,,,,,,,,,پاکستان


In [117]:
df.isna().sum()

Dataset         0
Category        0
HeadLines       0
entity_1      133
entity_2      432
entity_3      904
entity_4     1375
entity_5     1774
entity_6     1969
entity_7     2051
entity_8     2084
entity_9     2092
entity_10    2096
location     1374
dtype: int64

In [119]:
df.drop(['entity_3',
       'entity_4', 'entity_5', 'entity_6', 'entity_7', 'entity_8', 'entity_9',
       'entity_10'] , axis=1 , inplace=True)

df

Unnamed: 0,Dataset,Category,HeadLines,entity_1,entity_2,location
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہے',پانی,,
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',رخ,ملال,
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',,,
3,bbc dataset,entertainment,'پرینکا ورلڈ بنیں سکول تھی',سکول,,
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',گیلیکسی,نوٹ,
...,...,...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیوں پوچھ گچھ,یونٹ,کھلاڑیوں,
2093,voa dataset,sports,کرکٹ پاکستان افغانستان قریب لا افریدی,,,پاکستان
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,کامیابی,,کوئٹہ
2095,voa dataset,sports,یاسر شاہ عمدہ باؤلنگ، پاکستان ٹیسٹ فاتح,ٹیسٹ,فاتح,پاکستان


## Stemming 

In [120]:
stemmed_headlines = []
for headline in df['HeadLines']:
    stemmed_headlines.append(urdu_text_processing.urdu_stemmer(headline))

df['HeadLines'] = stemmed_headlines

df

Unnamed: 0,Dataset,Category,HeadLines,entity_1,entity_2,location
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہہ',پانی,,
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',رخ,ملال,
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',,,
3,bbc dataset,entertainment,'پرینکا ورلڈ بنا سکول تھی',سکول,,
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',گیلیکسی,نوٹ,
...,...,...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیا پوچھ گچھ,یونٹ,کھلاڑیوں,
2093,voa dataset,sports,کرکٹ پاکستن افغانستن قریب لا افریدی,,,پاکستان
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,کامیابی,,کوئٹہ
2095,voa dataset,sports,یاسر شاہ عمدہ بؤلنگ، پاکستن ٹیسٹ فاتح,ٹیسٹ,فاتح,پاکستان


## TF-IDF Vectorizer

In [121]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()

tfidf_matrix = tfidf_vectorizer.fit_transform(df['HeadLines'])


In [122]:
feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

tfidf_df


Unnamed: 0,deep,negative,positive,state,words,أفغان,أفغانستن,أوبما,ئا,ئل,...,۷۶,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2093,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2094,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
df = pd.concat([df,tfidf_df] , axis=1)

In [124]:
df

Unnamed: 0,Dataset,Category,HeadLines,entity_1,entity_2,location,deep,negative,positive,state,...,۷۶,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲
0,bbc dataset,entertainment,'زیادہ پانی پینا خطرناک ہہ',پانی,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,bbc dataset,entertainment,'شاہ رخ رومانس ملال نہیں',رخ,ملال,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,bbc dataset,entertainment,'مومنہ سیلفی پلیز',,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,bbc dataset,entertainment,'پرینکا ورلڈ بنا سکول تھی',سکول,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,bbc dataset,entertainment,'گیلیکسی نوٹ سیون پروڈکشن بند',گیلیکسی,نوٹ,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,voa dataset,sports,پی ایل اینٹی کرپشن یونٹ تین کھلاڑیا پوچھ گچھ,یونٹ,کھلاڑیوں,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2093,voa dataset,sports,کرکٹ پاکستن افغانستن قریب لا افریدی,,,پاکستان,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2094,voa dataset,sports,کوئٹہ گلیڈی ایٹرز مسلسل کامیابی,کامیابی,,کوئٹہ,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2095,voa dataset,sports,یاسر شاہ عمدہ بؤلنگ، پاکستن ٹیسٹ فاتح,ٹیسٹ,فاتح,پاکستان,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
df.drop(['HeadLines', 'Dataset'] , axis=1 , inplace=True)
df

Unnamed: 0,Category,entity_1,entity_2,location,deep,negative,positive,state,words,أفغان,...,۷۶,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲
0,entertainment,پانی,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,entertainment,رخ,ملال,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,entertainment,,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,entertainment,سکول,,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,entertainment,گیلیکسی,نوٹ,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,sports,یونٹ,کھلاڑیوں,,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2093,sports,,,پاکستان,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2094,sports,کامیابی,,کوئٹہ,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2095,sports,ٹیسٹ,فاتح,پاکستان,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [127]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in ['Category', 'entity_1', 'entity_2', 'location']:
    df[i] = le.fit_transform(df[i])

df

Unnamed: 0,Category,entity_1,entity_2,location,deep,negative,positive,state,words,أفغان,...,۷۶,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲
0,0,719,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,329,616,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,884,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,408,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,856,664,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,2,875,818,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2093,2,884,885,70,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2094,2,782,885,81,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2095,2,700,483,70,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Models
   

## Kmeans 

In [166]:
from sklearn.cluster import KMeans, DBSCAN

kmeans = KMeans(n_clusters=10, random_state=42)
kmeans.fit(df)
df['kmeans_labels'] = kmeans.labels_





In [167]:
df

Unnamed: 0,Category,entity_1,entity_2,location,deep,negative,positive,state,words,أفغان,...,۷۷,۸۰,۸۰۰,۸۶,۸۷,۹۰,۹۰۰,۹۱,۹۲,kmeans_labels
0,0,719,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
1,0,329,616,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8
2,0,884,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
3,0,408,885,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0,856,664,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2092,2,875,818,88,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2093,2,884,885,70,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2094,2,782,885,81,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9
2095,2,700,483,70,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2


In [168]:
from sklearn import metrics
import numpy as np

kmeans_silhouette_score = metrics.silhouette_score(df, kmeans.labels_)
print("KMeans Silhouette Score:", kmeans_silhouette_score)

KMeans Silhouette Score: 0.38559258967978227


# Get Similar News

In [162]:
spell_checker = LughaatNLP()
sentence = 'آگ لگ جائے تو کیا کریں'


tokens = urdu_text_processing.urdu_tokenize(sentence)
print("Tokenization for Urdu language:", tokens)  
l = []
for i in tokens:
    similar_words_with_percentage = spell_checker.get_similar_words_percentage(i, 60)
    print("This will return the most similar words in list with percentage", similar_words_with_percentage)
    
    for i in similar_words_with_percentage:
        l.append(i[0])

print(l)

Tokenization for Urdu language: ['آگ', 'لگ', 'جائے', 'تو', 'کیا', 'کریں']
This will return the most similar words in list with percentage [('آگ', 100.0), ('آگے', 66.66666666666666), ('آرگ', 66.66666666666666), ('آگا', 66.66666666666666)]
This will return the most similar words in list with percentage [('لگ', 100.0), ('لوگ', 66.66666666666666), ('لگا', 66.66666666666666), ('الگ', 66.66666666666666), ('لیگ', 66.66666666666666), ('لگے', 66.66666666666666), ('لگی', 66.66666666666666), ('لاگ', 66.66666666666666), ('لگن', 66.66666666666666), ('پلگ', 66.66666666666666)]
This will return the most similar words in list with percentage [('جائے', 100.0), ('بجائے', 80.0), ('جائزے', 80.0), ('آجائے', 80.0), ('جلائے', 80.0), ('جانئے', 80.0), ('جوائے', 80.0), ('جمائے', 80.0), ('جانے', 75.0), ('رائے', 75.0), ('جاتے', 75.0), ('پائے', 75.0), ('چائے', 75.0), ('ہائے', 75.0), ('گائے', 75.0), ('لائے', 75.0), ('جائز', 75.0), ('جوئے', 75.0), ('سائے', 75.0), ('جئے', 75.0), ('جاکے', 75.0), ('جائن', 75.0), ('جائی

In [163]:
head = []
for i in data:
    head.append(i['HeadLines'])
# head

score = []


for j in head:
    s = 0
    tokens = urdu_text_processing.urdu_tokenize(j)
    for t in tokens:
        if t in l:
            s = s+1 

    score.append(s)

top_indices = sorted(range(len(score)), key=lambda i: score[i], reverse=True)[:3]

for i in top_indices:
    print("max score:", head[i] , 'score:' , score[i] )



max score: ایم کیو ایم کی رجسٹریشن کی منسوخی کی درخواست پر جواب طلب score: 4
max score: بلوچستان کی صوبائی حکومت کا پولیس ٹریننگ سینٹر کی ناقص سکیورٹی کا اعتراف score: 4
max score: صدارتی انتخاب کے نتائج کا اعلان، کہیں جشن تو کہیں آنسو score: 4
