In [20]:
#pip install tf-keras

In [21]:
from sentence_transformers import SentenceTransformer

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

from sklearn.metrics import silhouette_score


In [26]:
import warnings
# Suppress FutureWarning messages from this point on so they do not clutter cell output.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [22]:
from pymongo import MongoClient

# Connection settings for the local MongoDB instance that holds the news articles.
URI = "mongodb://localhost:27017/"
DATABASE_NAME = 'neutra_news_test'
ARTICLES_COLLECTION_NAME = 'news_articles'

# Handles are created top-down: client -> database -> collection.
client = MongoClient(URI)
db = client[DATABASE_NAME]
news_articles_collection = db[ARTICLES_COLLECTION_NAME]

In [23]:
# Aggregation stages: keep only articles that have no 'group_headline' field yet,
# bucket them by the calendar day of 'publish_date', and order the buckets
# chronologically.
#
# NOTE: this list is deliberately NOT called `pipeline`. That name is bound to the
# scikit-learn Pipeline further down and is read as a global by assign_headlines();
# re-running this cell under the old name would replace the estimator with a list
# and break `pipeline.fit_predict(...)`.
unassigned_by_day_query = [
    {
        '$match': {'group_headline': {'$exists': False}}
    },
    {
        '$group': {
            '_id': {
                '$dateToString': {
                    'format': '%Y-%m-%d',  # keep only year, month and day
                    'date': '$publish_date'
                }
            },
            'titles': {'$push': '$title'},
            'ids': {'$push': '$_id'}
        }
    },
    {
        '$sort': {'_id': 1}  # ascending by the formatted date string
    }
]
aggregated_result = list(news_articles_collection.aggregate(unassigned_by_day_query))
# Number of distinct days that still contain unassigned articles.
len(aggregated_result)

25

In [24]:
class EmbeddingTransformer(BaseEstimator , TransformerMixin):
    """Scikit-learn transformer that encodes texts with a SentenceTransformer model.

    Parameters
    ----------
    model_name : str, default "all-MiniLM-L6-v2"
        Name handed to ``SentenceTransformer`` to load the encoder.

    Attributes
    ----------
    model_name : str
        The constructor argument, stored unchanged. ``BaseEstimator.get_params``
        reads every ``__init__`` parameter back via ``getattr``; without this
        attribute ``get_params()``, ``clone()`` and ``repr()`` (including
        displaying a Pipeline that contains this step) raise AttributeError.
    model : SentenceTransformer
        The loaded encoder. It is loaded eagerly in ``__init__`` and reloaded by
        ``transform`` if ``model_name`` was changed afterwards (e.g. through
        ``set_params``).
    """

    def __init__(self , model_name = "all-MiniLM-L6-v2"):
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
        # Remember which name `self.model` was built from so a later
        # set_params(model_name=...) can be detected.
        self._loaded_model_name = model_name

    def fit(self , X , y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self , X):
        """Return ``self.model.encode(X)`` for the given texts."""
        if getattr(self , '_loaded_model_name' , None) != self.model_name:
            # The parameter changed after construction: load the matching model
            # instead of silently encoding with the old one.
            self.model = SentenceTransformer(self.model_name)
            self._loaded_model_name = self.model_name
        return self.model.encode(X)
    
# The three stages are kept as separate module-level objects so each one can be
# inspected or swapped on its own.
embeddingTransformer = EmbeddingTransformer()   # texts -> sentence embeddings
pca = PCA(n_components=0.90)                    # fractional n_components: see the PCA docs
dbscan = DBSCAN(min_samples=2 , eps=0.6)        # label -1 is what assign_headlines treats as an anomaly

# Chain the stages in order: embed -> reduce -> cluster.
pipeline = Pipeline(
    steps=[
        ('embedding' , embeddingTransformer),
        ('pca' , pca),
        ('dbscan' , dbscan),
    ]
)




In [25]:
# Handle to the 'group_headlines' collection of the same database; it is the target
# of the insert_one calls inside assign_headlines().
group_headlines_collection = db['group_headlines']

def get_articles_grouped_by_date():
    """Fetch every article from ``news_articles_collection`` bucketed by publish day.

    Returns
    -------
    list of dict
        One document per day, sorted ascending by day. Each document has:
        ``'_id'``    - ``publish_date`` formatted as ``'%Y-%m-%d'``,
        ``'titles'`` - the ``title`` values pushed for that day,
        ``'ids'``    - the ``_id`` values pushed for that day.
    """
    # Truncate the timestamp to year-month-day so all articles of one day share a key.
    day_key = {
        '$dateToString': {
            'format': '%Y-%m-%d',
            'date': '$publish_date'
        }
    }
    group_stage = {
        '$group' : {
            '_id' : day_key,
            'titles' : {'$push' : '$title'},
            'ids' : {'$push' : '$_id'}
        }
    }
    # Ascending order on the formatted date string.
    sort_stage = {'$sort': {'_id': 1}}

    cursor = news_articles_collection.aggregate([group_stage , sort_stage])
    return list(cursor)

# Loaded SentenceTransformer instances keyed by model name, so the model is not
# re-loaded on every call of print_silhouette_score (it is called once per day).
_SILHOUETTE_MODELS = {}


def _get_silhouette_model(model_name):
    """Return the cached SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _SILHOUETTE_MODELS:
        _SILHOUETTE_MODELS[model_name] = SentenceTransformer(model_name)
    return _SILHOUETTE_MODELS[model_name]


def print_silhouette_score(titles, labels, model_name='all-MiniLM-L6-v2'):
    """Print the silhouette score of ``labels`` over the embeddings of ``titles``.

    Parameters
    ----------
    titles : list of str
        Texts to embed; one embedding per title.
    labels : sequence
        One cluster label per title. Labels are passed to ``silhouette_score``
        unchanged, so a noise label such as -1 counts as one more cluster.
    model_name : str, default 'all-MiniLM-L6-v2'
        SentenceTransformer model used for the embeddings.

    Notes
    -----
    ``silhouette_score`` is only defined when the number of distinct labels is
    between 2 and ``n_samples - 1``; outside that range a message is printed
    instead of raising.
    """
    titles_embeddings = _get_silhouette_model(model_name).encode(titles)
    print(len(titles_embeddings), len(labels))

    distinct_label_count = len(set(labels))
    if distinct_label_count < 2:  # At least 2 distinct labels are needed
        print('Not enough distinct labels to compute silhouette score.')
    elif distinct_label_count > len(labels) - 1:  # every sample is its own label
        print('Too many distinct labels to compute silhouette score.')
    else:
        score = silhouette_score(titles_embeddings, labels)
        print('Silhouette:', score)



def _store_day_headlines(day, labelled_articles):
    """Write one day's clustering result to MongoDB.

    ``labelled_articles`` is a list of ``(article_id, title, label)`` tuples sorted
    by label. Returns ``(headline_count, blindspot_count)``.
    """
    # One headline per cluster: the first title encountered for each label.
    # Label -1 marks an anomaly and never gets a shared headline.
    headlines_map = {}
    for _, title, label in labelled_articles:
        if label != -1:
            headlines_map.setdefault(label, title)

    # Insert the cluster headlines first so their generated ids can be attached
    # to the member articles.
    headlines_objectId_map = {}
    for label, title in headlines_map.items():
        inserted_doc = group_headlines_collection.insert_one(
            {'title' : title , 'date' : day , 'blindspot' : False}
        )
        headlines_objectId_map[label] = inserted_doc.inserted_id

    # Tag every article with its 'group_headline' and 'blindspot' fields.
    blindspots = 0
    for article_id, title, label in labelled_articles:
        # Plain bool: `label` may be a numpy integer, and a numpy boolean must not
        # end up in the document that is sent to MongoDB.
        is_blindspot = bool(label == -1)
        if is_blindspot:
            blindspots += 1
            # An anomaly gets its own headline document, flagged as a blindspot.
            inserted_headline = group_headlines_collection.insert_one(
                {'title' : title , 'date' : day , 'blindspot' : True}
            )
            headline_id = inserted_headline.inserted_id
        else:
            headline_id = headlines_objectId_map[label]

        news_articles_collection.update_one(
            {'_id' : article_id} ,
            {'$set' : {'group_headline' : headline_id , 'blindspot' : is_blindspot}}
        )

    return len(headlines_map), blindspots


def assign_headlines(dry_run=True):
    """Cluster each day's article titles and, optionally, persist the grouping.

    For every day returned by ``get_articles_grouped_by_date()`` the titles are
    run through the module-level ``pipeline`` and the silhouette score of the
    resulting labels is printed.

    Parameters
    ----------
    dry_run : bool, default True
        ``True`` only prints the silhouette diagnostics and writes nothing. This
        is what the function did before, when an unconditional ``continue``
        skipped the persistence code. ``False`` additionally inserts one
        document per cluster (and one per anomaly) into
        ``group_headlines_collection`` and sets ``group_headline`` /
        ``blindspot`` on every article.

    Notes
    -----
    ``get_articles_grouped_by_date()`` does not filter out articles that already
    carry a ``group_headline``, so calling this twice with ``dry_run=False``
    inserts a second set of headline documents.
    """
    for articles_in_day in get_articles_grouped_by_date():
        titles = articles_in_day['titles']
        title_ids = articles_in_day['ids']

        if len(titles) < 2:
            # A lone article cannot form a cluster (DBSCAN is configured with
            # min_samples=2), so label it as an anomaly directly instead of
            # pushing a single sample through PCA.
            labels = [-1] * len(titles)
        else:
            labels = pipeline.fit_predict(titles)

        print_silhouette_score(titles , labels)
        if dry_run:
            continue

        # sorted() is stable, so within a cluster the articles keep query order.
        labelled_articles = sorted(
            zip(title_ids , titles , labels) , key = lambda item : item[2]
        )
        headline_count, blindspots = _store_day_headlines(
            articles_in_day['_id'] , labelled_articles
        )

        print('Total articles on date ' , articles_in_day['_id'] , " : " , len(titles))
        print('Total headlines : ' , headline_count)
        print('Total blindspots : ' , blindspots)
    
# Run the per-day clustering defined above; its printed diagnostics follow as cell output.
assign_headlines()



7 7
Not enough distinct labels to compute silhouette score.




6 6
Not enough distinct labels to compute silhouette score.




18 18
Silhouette: 0.123354994




31 31
Silhouette: 0.35830185




42 42
Silhouette: 0.15336907




43 43
Silhouette: 0.24541554




40 40
Silhouette: 0.24476735




36 36
Silhouette: 0.15917109




44 44
Silhouette: 0.058846366




43 43
Silhouette: 0.21189731




27 27
Silhouette: 0.25932348




44 44
Silhouette: 0.23100811




52 52
Silhouette: 0.2788058




28 28
Silhouette: 0.12293905




38 38
Silhouette: 0.21139765




18 18
Silhouette: 0.12124349




30 30
Silhouette: 0.3855828




20 20
Silhouette: 0.3130794




26 26
Silhouette: 0.1734303




42 42
Silhouette: 0.22777131




46 46
Silhouette: 0.25286952




15 15
Silhouette: 0.4170494




41 41
Silhouette: 0.10939828




36 36
Silhouette: 0.19987768




39 39
Silhouette: 0.1359925
