# NEWS TITLE CLUSTERING

## Project Flow

* Text Data Cleaning
* Feature Extraction with Count Vectorizer, TF-IDF, and BERT sentence embeddings
* Clustering Algorithms - K-Means and Spectral Clustering
* Evaluation Metrics - Silhouette Score



In [174]:
# Core data handling and plotting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text Pre-processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from collections import Counter

# Feature Extraction
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE(review): nltk is already imported above; this second import is redundant.
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# NOTE(review): the Similarity cells shown in this notebook lemmatize with the
# spaCy pipeline (`nlp`) and never reference `wordnet`.
wordnet= WordNetLemmatizer()
# English stop-word list; the Similarity functions drop lemmas found in it.
stop_word = stopwords.words("english")
# spaCy pipeline used by the Similarity functions to lemmatize each cleaned title.
nlp= spacy.load('en_core_web_sm')
from nltk.cluster import KMeansClusterer

# Clustering
from sklearn.cluster import MiniBatchKMeans,KMeans,SpectralClustering
from sklearn.decomposition import PCA
# NOTE(review): matplotlib.pyplot is already imported at the top of this cell.
import matplotlib.pyplot as plt

# Sentence-embedding model; the BERT sections call sbert_model.encode(...) on the cleaned titles.
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Evaluation
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [165]:
# Sample news titles passed to Similarity() in the cells below.
# NOTE(review): this is a set literal, so the titles have no fixed iteration
# order; the row index each title receives inside Similarity() can differ
# between kernel sessions.
# NOTE(review): this set holds 11 titles, but the saved outputs of the two
# Count Vectorizer sections list 12 rows (including "Do you know what you are
# fight for"), so those outputs appear to come from an earlier version of
# `data` -- confirm by re-running the notebook top to bottom.
data ={"PM Has Tremendous Vigour: Shashi Tharoor Credits Him For UP Polls Win",
       "You have seen Priyanka Gandhi everywhere’: Shashi Tharoor on Congress performance in 5 state elections",
      "Shashi Tharoor explains why Congress is worth reviving as crucial meet is on",
       "Bhagwant Mann's Swearing-In Ceremony LIVE | Punjab's New Chief Minister | India Today LIVE",
       "Bhagwant Mann swearing-in: ‘He was always in my prayers, will always be..,’ says ex-wife; children reach India to attend ceremony",
       "Punjab CM swearing-in LIVE updates: Bhagwant Mann sworn in as chief minister",
       "Bhagwant Mann Swearing-In Live: AAP Leader Takes Oath As Punjab Chief Minister",
       "Govt sounds Covid alert, calls for strict vigil as cases rise elsewhere",
       "Centre Rings Alarm Amid Covid Comeback in China, Calls for Genome Sequencing, Surveillance At High-Level Meet",
       "Covid-19: Mansukh Mandaviya tells officials to stay alert, enhance surveillance",
       "Fourth wave of Covid-19 in China, Hong Kong raises alarm bells for India, check more"
      }

## Count Vectorizer + Kmeans

In [159]:
def Similarity(text):
    """Cluster news titles using bag-of-words counts and K-Means.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), vectorized with ``CountVectorizer`` and clustered with
    ``KMeans`` for every cluster count from 2 to ``n_titles - 1``. The labelling
    with the highest silhouette score is kept and a one-line summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction: raw term counts (bag of words).
    features = CountVectorizer().fit_transform(corpus).toarray()

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # Try every admissible cluster count and remember the best labelling, so
    # the labels returned are exactly the ones that were scored.
    for n_clusters in range(2, len(titles)):
        labels = KMeans(n_clusters=n_clusters, random_state=7).fit_predict(features)

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(features, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [160]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: Count Vectorizer + K-Means).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
# NOTE(review): the saved output below lists 12 rows while `data` as defined in
# this notebook has 11 titles -- re-run after Restart & Run All to refresh it.
Similarity(data)

Best Cluster - 3 Silhouette Score - 0.17376330419965946


(                                                 text  Cluster
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        0
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        0
 10  Bhagwant Mann Swearing-In Live: AAP Leader Tak...        0
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        1
 3   Shashi Tharoor explains why Congress is worth ...        1
 4                  Do you know what you are fight for        1
 6   Centre Rings Alarm Amid Covid Comeback in Chin...        1
 7   Fourth wave of Covid-19 in China, Hong Kong ra...        1
 8   Govt sounds Covid alert, calls for strict vigi...        1
 9   You have seen Priyanka Gandhi everywhere’: Sha...        1
 11  Covid-19: Mansukh Mandaviya tells officials to...        1
 1   Bhagwant Mann swearing-in: ‘He was always in m...        2,
 None)

## Count Vectorizer + Spectral Clustering

In [161]:
def Similarity(text):
    """Cluster news titles using bag-of-words counts and spectral clustering.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), vectorized with ``CountVectorizer`` and clustered with
    ``SpectralClustering`` for every cluster count from 2 to ``n_titles - 1``.
    The labelling with the highest silhouette score is kept and a one-line
    summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction: raw term counts (bag of words).
    features = CountVectorizer().fit_transform(corpus).toarray()

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # Try every admissible cluster count and remember the best labelling, so
    # the labels returned are exactly the ones that were scored.
    for n_clusters in range(2, len(titles)):
        model = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize', random_state=7)
        labels = model.fit(features).labels_

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(features, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [162]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: Count Vectorizer + Spectral Clustering).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
# NOTE(review): the saved output below lists 12 rows while `data` as defined in
# this notebook has 11 titles -- re-run after Restart & Run All to refresh it.
Similarity(data)

Best Cluster - 2 Silhouette Score - 0.1558060818857329


(                                                 text  Cluster
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        0
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        0
 10  Bhagwant Mann Swearing-In Live: AAP Leader Tak...        0
 1   Bhagwant Mann swearing-in: ‘He was always in m...        1
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        1
 3   Shashi Tharoor explains why Congress is worth ...        1
 4                  Do you know what you are fight for        1
 6   Centre Rings Alarm Amid Covid Comeback in Chin...        1
 7   Fourth wave of Covid-19 in China, Hong Kong ra...        1
 8   Govt sounds Covid alert, calls for strict vigi...        1
 9   You have seen Priyanka Gandhi everywhere’: Sha...        1
 11  Covid-19: Mansukh Mandaviya tells officials to...        1,
 None)

## TFIDF + Kmeans

In [166]:
def Similarity(text):
    """Cluster news titles using TF-IDF features and K-Means.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), vectorized with ``TfidfVectorizer`` and clustered with
    ``KMeans`` for every cluster count from 2 to ``n_titles - 1``. The labelling
    with the highest silhouette score is kept and a one-line summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction by TF-IDF.
    features = TfidfVectorizer().fit_transform(corpus).toarray()

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # Try every admissible cluster count and remember the best labelling, so
    # the labels returned are exactly the ones that were scored.
    for n_clusters in range(2, len(titles)):
        labels = KMeans(n_clusters=n_clusters, random_state=7).fit_predict(features)

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(features, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [167]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: TF-IDF + K-Means).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
Similarity(data)

Best Cluster - 4 Silhouette Score - 0.12301263706380686


(                                                 text  Cluster
 1   Bhagwant Mann swearing-in: ‘He was always in m...        0
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        1
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        1
 9   Bhagwant Mann Swearing-In Live: AAP Leader Tak...        1
 4   Centre Rings Alarm Amid Covid Comeback in Chin...        2
 6   Fourth wave of Covid-19 in China, Hong Kong ra...        2
 7   Govt sounds Covid alert, calls for strict vigi...        2
 10  Covid-19: Mansukh Mandaviya tells officials to...        2
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        3
 3   Shashi Tharoor explains why Congress is worth ...        3
 8   You have seen Priyanka Gandhi everywhere’: Sha...        3,
 None)

## TFIDF + Spectral Clustering

In [168]:
def Similarity(text):
    """Cluster news titles using TF-IDF features and spectral clustering.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), vectorized with ``TfidfVectorizer`` and clustered with
    ``SpectralClustering`` for every cluster count from 2 to ``n_titles - 1``.
    The labelling with the highest silhouette score is kept and a one-line
    summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction by TF-IDF.
    features = TfidfVectorizer().fit_transform(corpus).toarray()

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # Try every admissible cluster count and remember the best labelling, so
    # the labels returned are exactly the ones that were scored.
    for n_clusters in range(2, len(titles)):
        model = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize', random_state=7)
        labels = model.fit(features).labels_

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(features, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [169]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: TF-IDF + Spectral Clustering).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
Similarity(data)

Best Cluster - 3 Silhouette Score - 0.12433158476654697


(                                                 text  Cluster
 1   Bhagwant Mann swearing-in: ‘He was always in m...        0
 4   Centre Rings Alarm Amid Covid Comeback in Chin...        0
 6   Fourth wave of Covid-19 in China, Hong Kong ra...        0
 7   Govt sounds Covid alert, calls for strict vigi...        0
 10  Covid-19: Mansukh Mandaviya tells officials to...        0
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        1
 3   Shashi Tharoor explains why Congress is worth ...        1
 8   You have seen Priyanka Gandhi everywhere’: Sha...        1
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        2
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        2
 9   Bhagwant Mann Swearing-In Live: AAP Leader Tak...        2,
 None)

## BERT + Kmeans

In [177]:
def Similarity(text):
    """Cluster news titles using sentence embeddings and K-Means.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), embedded with ``sbert_model.encode`` and clustered with
    ``KMeans`` for every cluster count from 2 to ``n_titles - 1``. The labelling
    with the highest silhouette score is kept and a one-line summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction: one sentence embedding per cleaned title.
    embeddings = sbert_model.encode(corpus)

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # Try every admissible cluster count and remember the best labelling, so
    # the labels returned are exactly the ones that were scored.
    for n_clusters in range(2, len(titles)):
        labels = KMeans(n_clusters=n_clusters, random_state=7).fit_predict(embeddings)

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(embeddings, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [178]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: BERT embeddings + K-Means).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
Similarity(data)

Best Cluster - 4 Silhouette Score - 0.23531098663806915


(                                                 text  Cluster
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        0
 1   Bhagwant Mann swearing-in: ‘He was always in m...        0
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        0
 9   Bhagwant Mann Swearing-In Live: AAP Leader Tak...        0
 7   Govt sounds Covid alert, calls for strict vigi...        1
 10  Covid-19: Mansukh Mandaviya tells officials to...        1
 4   Centre Rings Alarm Amid Covid Comeback in Chin...        2
 6   Fourth wave of Covid-19 in China, Hong Kong ra...        2
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        3
 3   Shashi Tharoor explains why Congress is worth ...        3
 8   You have seen Priyanka Gandhi everywhere’: Sha...        3,
 None)

## BERT + Spectral Clustering

In [181]:
def Similarity(text):
    """Cluster news titles using sentence embeddings and spectral clustering.

    Each distinct title is cleaned (non-letters replaced by spaces, lower-cased,
    lemmatized with the module-level spaCy pipeline ``nlp`` and filtered against
    ``stop_word``), embedded with ``sbert_model.encode`` and clustered with
    ``SpectralClustering`` for every cluster count from 2 to ``n_titles - 1``.
    The labelling with the highest silhouette score is kept and a one-line
    summary is printed.

    Parameters
    ----------
    text : iterable of str
        News titles. Duplicates are dropped; for ordered inputs the first
        occurrence order is preserved.

    Returns
    -------
    tuple
        ``(clusters, None)``: ``clusters`` is a DataFrame with the columns
        ``"text"`` and ``"Cluster"``, sorted by cluster label. The second
        element is always ``None``; it is kept so callers that expect the
        historical two-element return value keep working.

    Raises
    ------
    ValueError
        If fewer than three distinct titles are given (no cluster count can be
        scored), or if no candidate clustering could be scored.
    """
    # De-duplicate while keeping a stable order for ordered inputs.
    titles = list(dict.fromkeys(text))
    if len(titles) < 3:
        raise ValueError(
            "Similarity needs at least 3 distinct titles, got {}".format(len(titles))
        )

    df = pd.DataFrame({"text": titles})

    # Set lookup instead of scanning the stop-word list for every lemma.
    stop_words = set(stop_word)

    corpus = []
    for title in titles:
        # Remove punctuation/digits, lower-case and collapse whitespace.
        letters_only = re.sub('[^a-zA-Z]', ' ', title).lower()
        doc = nlp(" ".join(letters_only.split()))
        # Lemmatize, then drop stop words.
        lemmas = [token.lemma_ for token in doc]
        corpus.append(' '.join(word for word in lemmas if word not in stop_words))

    # Feature extraction: one sentence embedding per cleaned title.
    embeddings = sbert_model.encode(corpus)

    sil_score_max = -1  # this is the minimum possible score
    best_cluster = None
    best_labels = None

    # The search used to run with random_state=40 while the final model was
    # refit with random_state=7, so the printed score could describe a different
    # labelling than the one returned. The best labelling from the search is now
    # returned directly; the search seed (40) is kept so the reported best
    # cluster count and score do not change.
    for n_clusters in range(2, len(titles)):
        model = SpectralClustering(n_clusters=n_clusters, assign_labels='discretize', random_state=40)
        labels = model.fit(embeddings).labels_

        # silhouette_score is only defined for 2 .. n_samples - 1 distinct labels.
        if not 2 <= len(set(labels)) < len(titles):
            continue

        sil_score = silhouette_score(embeddings, labels=labels)
        if sil_score > sil_score_max:
            sil_score_max = sil_score
            best_cluster = n_clusters
            best_labels = labels

    if best_labels is None:
        raise ValueError("No cluster count produced a clustering that could be scored")

    # labels for the clusters
    df['Cluster'] = best_labels

    clusters = df.sort_values(by=["Cluster"])
    print("Best Cluster - {} Silhouette Score - {}".format (best_cluster,sil_score_max))
    return clusters, None



In [182]:
# Cluster the sample titles with the Similarity definition executed most
# recently (in top-to-bottom order: BERT embeddings + Spectral Clustering).
# Similarity returns a (DataFrame, None) pair, so the cell echoes a tuple.
Similarity(data)

Best Cluster - 5 Silhouette Score - 0.23600172996520996


(                                                 text  Cluster
 7   Govt sounds Covid alert, calls for strict vigi...        0
 10  Covid-19: Mansukh Mandaviya tells officials to...        0
 2   PM Has Tremendous Vigour: Shashi Tharoor Credi...        1
 3   Shashi Tharoor explains why Congress is worth ...        1
 8   You have seen Priyanka Gandhi everywhere’: Sha...        1
 1   Bhagwant Mann swearing-in: ‘He was always in m...        2
 4   Centre Rings Alarm Amid Covid Comeback in Chin...        3
 6   Fourth wave of Covid-19 in China, Hong Kong ra...        3
 0   Bhagwant Mann's Swearing-In Ceremony LIVE | Pu...        4
 5   Punjab CM swearing-in LIVE updates: Bhagwant M...        4
 9   Bhagwant Mann Swearing-In Live: AAP Leader Tak...        4,
 None)

## Conclusion

* BERT text embeddings combined with Spectral Clustering separated the news titles very well.
* It even split the Covid news into two clusters: general Covid news and Covid news related to China.
