In [1]:
#pip install tf-keras

In [2]:
from sentence_transformers import SentenceTransformer

# Load the "all-MiniLM-L6-v2" sentence-embedding model.
# NOTE(review): none of the cells visible below read `model`;
# EmbeddingTransformer constructs its own SentenceTransformer instance.
model = SentenceTransformer("all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md








In [12]:
from pymongo import MongoClient

# Connection to the local MongoDB server and handles to the database and
# collection that hold the scraped news articles.
URI = "mongodb://localhost:27017/"
client = MongoClient(URI)
db = client.get_database('neutra_news_test')
news_articles_collection = db.get_collection('news_articles')

In [13]:
# Exploratory query: per-day buckets of the articles that do not have a
# 'group_headline' field yet.
pipeline = [
    # Keep only documents where 'group_headline' is absent.
    {'$match': {'group_headline': {'$exists': False}}},
    # One bucket per calendar day (publish_date rendered as year-month-day),
    # collecting the title and _id of every article in that bucket.
    {'$group': {
        '_id': {'$dateToString': {'format': '%Y-%m-%d', 'date': '$publish_date'}},
        'titles': {'$push': '$title'},
        'ids': {'$push': '$_id'},
    }},
    # Ascending by the formatted date string.
    {'$sort': {'_id': 1}},
]
aggregated_result = list(news_articles_collection.aggregate(pipeline))
aggregated_result

[{'_id': '2024-08-10',
  'titles': ["Faisalabad police launch 'all-female Dolphin Squad' to counter crime",
   'Tessori appears unfazed by reports of change at Sindh Governor House',
   'Severe consequences to follow if SC verdicts not implemented: Justice Mansoor Ali Shah',
   'No political stability without Imran Khan: PTI vice chairman',
   'Arshad Nadeem to be honoured with Hilal-e-Imtiaz for bringing Olympic glory home',
   'PTI calls for institutions to stay within their limits',
   "Gandapur stands firm on May 9 probe demand, matter to be sent to cabinet after PHC's refusal"],
  'ids': [ObjectId('66bd18ebdd9b75b8f8ae0833'),
   ObjectId('66bd18ebdd9b75b8f8ae0834'),
   ObjectId('66bd18ebdd9b75b8f8ae0835'),
   ObjectId('66bd18ebdd9b75b8f8ae0836'),
   ObjectId('66bd18ebdd9b75b8f8ae0837'),
   ObjectId('66bd18ebdd9b75b8f8ae0838'),
   ObjectId('66bd18ebdd9b75b8f8ae0839')]},
 {'_id': '2024-08-11',
  'titles': ['Islamabad bans toy horns for 10 days as nation preps for 14th Aug celebratio

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator , TransformerMixin
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

class EmbeddingTransformer(BaseEstimator , TransformerMixin):
    """Scikit-learn compatible step that encodes texts with a SentenceTransformer.

    Parameters
    ----------
    model_name : str, default="all-MiniLM-L6-v2"
        Name passed to ``SentenceTransformer`` to load the encoder.

    Attributes
    ----------
    model_name : str
        The constructor argument, stored unchanged.
    model : SentenceTransformer
        The encoder instance used by ``transform``.
    """

    def __init__(self , model_name = "all-MiniLM-L6-v2"):
        # BaseEstimator.get_params() (and therefore clone() and the estimator
        # repr) reads every constructor argument back with
        # getattr(self, "<argument name>"), so the argument must be stored
        # under its own name or those calls raise AttributeError.
        self.model_name = model_name
        # Loaded eagerly so `.model` is available right after construction.
        # NOTE: set_params(model_name=...) does not reload this object; build
        # a new EmbeddingTransformer to switch models.
        self.model = SentenceTransformer(model_name)

    def fit(self , X , y=None):
        """Do nothing and return ``self``; no state is derived from ``X``."""
        return self

    def transform(self , X):
        """Return ``self.model.encode(X)``.

        ``X`` is whatever the encoder accepts (the notebook passes a list of
        title strings). The result is presumably one embedding row per input,
        which is what the PCA step that follows in the pipeline needs.
        """
        return self.model.encode(X)
    
# Individual stages are kept as module-level names so each one can be
# inspected on its own after a fit.
embeddingTransformer = EmbeddingTransformer()   # titles -> sentence embeddings
# A float n_components keeps as many principal components as are needed to
# explain that fraction (95%) of the variance.
pca = PCA(n_components=0.95)
# eps is the neighbourhood radius; with min_samples=2 two nearby titles are
# already enough to form a cluster.
dbscan = DBSCAN(eps=0.6 , min_samples=2)

# NOTE: this rebinds `pipeline`, which an earlier cell used for the MongoDB
# aggregation stage list.
pipeline = Pipeline(
    steps=[
        ('embedding' , embeddingTransformer),
        ('pca' , pca),
        ('dbscan' , dbscan),
    ]
)




In [15]:
# Collection that receives one document per generated group headline.
group_headlines_collection = db.get_collection('group_headlines')

def get_articles_grouped_by_date(only_unassigned=True):
    """Return news articles bucketed by publication day.

    Parameters
    ----------
    only_unassigned : bool, default=True
        When true, only articles that do not have a ``group_headline`` field
        are returned. This is the same filter the exploratory query earlier in
        the notebook uses, and it keeps ``assign_headlines()`` from
        re-clustering articles (and inserting duplicate headline documents)
        when it is run more than once. Pass ``False`` to bucket every article
        regardless of that field.

    Returns
    -------
    list[dict]
        One dict per day, sorted ascending by day, with the keys
        ``'_id'`` (``publish_date`` formatted as ``'%Y-%m-%d'``),
        ``'titles'`` (pushed ``title`` values) and ``'ids'`` (pushed ``_id``
        values).

    NOTE(review): ``titles`` and ``ids`` are built by two independent
    ``$push`` accumulators; callers pair them by position, which presumably
    relies on every article having a ``title`` field -- confirm against the
    data.
    """
    aggregate_query = []

    if only_unassigned:
        # Skip articles that were already linked to a group headline.
        aggregate_query.append(
            {'$match': {'group_headline': {'$exists': False}}}
        )

    aggregate_query.extend([
        {
            '$group' : {
                '_id' : {
                    '$dateToString': {
                        'format': '%Y-%m-%d',  # Keep only year, month and day
                        'date': '$publish_date'
                    }
                },
                'titles' : {'$push' : '$title'},
                'ids' : {'$push' : '$_id'}
            }
        },
        {
            '$sort': {'_id': 1}  # Ascending by the formatted date string
        }
    ])

    # Materialise the cursor so the caller gets a plain, re-iterable list.
    return list(news_articles_collection.aggregate(aggregate_query))
    

def assign_headlines():
    """Cluster each day's article titles and store one headline per cluster.

    For every day bucket returned by ``get_articles_grouped_by_date()``:

    1. The titles are passed to the module-level ``pipeline``; its
       ``fit_predict`` returns one label per title, where ``-1`` means the
       title was not placed in any cluster.
    2. For each non-negative label, one document is inserted into
       ``group_headlines_collection`` with ``blindspot: False``, titled after
       the first article carrying that label.
    3. Each ``-1`` article gets its own headline document with
       ``blindspot: True``.
    4. Every article in ``news_articles_collection`` is updated with the
       ``group_headline`` id and the matching ``blindspot`` flag.

    A three-line summary is printed per day. Returns ``None``.
    """
    for articles_in_day in get_articles_grouped_by_date():
        day = articles_in_day['_id']
        titles = articles_in_day['titles']
        title_ids = articles_in_day['ids']

        if len(titles) < 2:
            # A lone title can never form a cluster (DBSCAN is configured with
            # min_samples=2), and PCA's variance estimate divides by
            # n_samples - 1, so fitting on a single sample is degenerate.
            # Label it as unclustered directly instead of running the pipeline.
            labels = [-1] * len(titles)
        else:
            labels = pipeline.fit_predict(titles)

        # (article id, title, label) triples ordered by label. sorted() is
        # stable, so within a label the original article order is preserved.
        labelled_articles = sorted(
            zip(title_ids , titles , labels) ,
            key = lambda triple : triple[2]
        )

        # One representative title per cluster: the first article seen for
        # each non-negative label.
        headlines_map = {}
        for _ , title , label in labelled_articles:
            if label != -1:
                headlines_map.setdefault(label , title)

        # Insert the cluster headlines and remember their ids so that articles
        # can reference them.
        headlines_objectId_map = {}
        for label , title in headlines_map.items():
            inserted_doc = group_headlines_collection.insert_one(
                {'title' : title , 'date' : day , 'blindspot' : False}
            )
            headlines_objectId_map[label] = inserted_doc.inserted_id

        # Link every article to its headline. Unclustered articles get a
        # dedicated headline document flagged as a blindspot.
        blindspots = 0
        for article_id , title , label in labelled_articles:
            is_blindspot = (label == -1)
            if is_blindspot:
                blindspots += 1
                headline_id = group_headlines_collection.insert_one(
                    {
                     'title' : title ,
                     'date' : day ,
                     'blindspot' : True
                    }
                ).inserted_id
            else:
                headline_id = headlines_objectId_map[label]

            news_articles_collection.update_one(
                {'_id' : article_id} ,
                {'$set' : {'group_headline': headline_id , 'blindspot' : is_blindspot}}
            )

        print('Total articles on date ' , day , " : " , len(titles))
        print('Total headlines : ' , len(headlines_map))
        print('Total blindspots : ' , blindspots)
    
# Run the grouping over every day bucket. This writes to MongoDB: it inserts
# into group_headlines_collection and updates news_articles_collection.
assign_headlines()

Total articles on date  2024-08-10  :  7
Total headlines :  0
Total blindspots :  7
Total articles on date  2024-08-11  :  6
Total headlines :  0
Total blindspots :  6
Total articles on date  2024-08-12  :  18
Total headlines :  2
Total blindspots :  14
Total articles on date  2024-08-13  :  31
Total headlines :  9
Total blindspots :  13
Total articles on date  2024-08-14  :  42
Total headlines :  8
Total blindspots :  25
Total articles on date  2024-08-15  :  43
Total headlines :  10
Total blindspots :  21
Total articles on date  2024-08-16  :  40
Total headlines :  8
Total blindspots :  20
Total articles on date  2024-08-17  :  36
Total headlines :  8
Total blindspots :  19
Total articles on date  2024-08-18  :  44
Total headlines :  4
Total blindspots :  34
Total articles on date  2024-08-19  :  43
Total headlines :  9
Total blindspots :  21
Total articles on date  2024-08-20  :  27
Total headlines :  6
Total blindspots :  15
Total articles on date  2024-08-21  :  44
Total headlines