In [75]:
import re
import numpy as np
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import spacy
from elasticsearch import Elasticsearch
from datetime import datetime
from elasticsearch.helpers import bulk


In [70]:
# Connect to the local Elasticsearch node.
# Credentials can be overridden through ES_USERNAME / ES_PASSWORD so they do not
# have to be edited in (or committed with) the notebook.
# NOTE(review): the literal fallbacks are kept only so the notebook keeps running
# unchanged; remove them and rotate the password before sharing this file.
# `basic_auth` replaces the deprecated `http_auth` keyword.
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],
    basic_auth=(
        os.environ.get('ES_USERNAME', 'emad2'),
        os.environ.get('ES_PASSWORD', 'emadmassri'),
    ),
)

# Name of the index every later cell reads from and writes to.
index_name = "news_index"

  es = Elasticsearch(


In [71]:
# Start from a clean slate: remove any previous copy of the index so the
# mappings and settings defined below are the ones actually in effect.
index_already_exists = es.indices.exists(index=index_name)
if index_already_exists:
    es.indices.delete(index=index_name)

# Custom token filters referenced by the analyzers below.
_token_filters = {
    "autocomplete_filter": {"type": "edge_ngram", "min_gram": 3, "max_gram": 15},
    "stop_filter": {"type": "stop", "stopwords": "_english_"},
    "stemmer_filter": {"type": "porter_stem"},
}

# Two custom analyzers: one producing edge n-grams (used by "Title"),
# one applying stop-word removal and Porter stemming (used by "Content").
_analyzers = {
    "autocomplete_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "autocomplete_filter"],
    },
    "content_analyzer": {
        "type": "custom",
        "tokenizer": "standard",
        "filter": ["lowercase", "stop_filter", "stemmer_filter"],
    },
}

# Sub-fields of each entry in the nested "Authors" field.
_author_properties = {
    "first_name": {"type": "text", "analyzer": "standard"},
    "last_name": {"type": "text", "analyzer": "standard"},
    "email": {"type": "keyword"},
}

# Field-level mappings of the index.
_field_mappings = {
    "Title": {"type": "text", "analyzer": "autocomplete_analyzer"},
    "Content": {"type": "text", "analyzer": "content_analyzer"},
    "Authors": {"type": "nested", "properties": _author_properties},
    "Date": {"type": "date"},
    "Coordinates": {"type": "geo_point"},
    "TemporalExpressions": {"type": "nested"},
    "Georeferences": {"type": "nested"},
}

# Complete index definition (mappings + settings) passed to index creation.
configurations = {
    "mappings": {"properties": _field_mappings},
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 1,
        "analysis": {"analyzer": _analyzers, "filter": _token_filters},
    },
}



# Create the index from `configurations`.
# HTTP 400 is deliberately NOT ignored: the index was deleted just above, so a
# 400 here means the mappings/settings were rejected. Swallowing it would let
# the later bulk load auto-create the index with dynamic mappings instead.
# `mappings=` / `settings=` replace the deprecated `body=` keyword.
es.indices.create(
    index=index_name,
    mappings=configurations["mappings"],
    settings=configurations["settings"],
)

  es.indices.create(index=index_name, ignore=400, body=configurations)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'news_index'})

In [72]:
# Nominatim geocoding client; each request is given a 10 second timeout.
geolocator = Nominatim(timeout=10, user_agent="geo_app")

# spaCy pipeline used for named-entity recognition (DATE entities).
nlp = spacy.load("en_core_web_sm")

Tokenize the text: convert it to lowercase, remove stop words and tokens shorter than 3 characters, and apply stemming.

In [19]:
# NLTK resources shared by every clean_content() call. They are built lazily on
# first use so that defining the function does not require the corpus to be
# available, and so they are not rebuilt for every article.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, stemmer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES["stop_words"] = set(stopwords.words('english'))
        _TEXT_RESOURCES["stemmer"] = PorterStemmer()
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["stemmer"]


def clean_content(content):
    """Normalise article text into a space-separated string of stemmed tokens.

    Steps: strip anything that looks like a markup tag, lowercase, tokenize,
    drop English stop words and tokens shorter than 3 characters, then apply
    Porter stemming.

    Args:
        content: Raw text (may contain markup tags). ``None`` or an empty
            string yields an empty result.

    Returns:
        str: The processed tokens joined by single spaces ('' when nothing is left).
    """
    if not content:
        return ''

    stop_words, stemmer = _get_text_resources()

    # Remove leftover tags such as <b> before tokenizing.
    text_without_tags = re.sub(r'<[^>]+>', '', content)
    tokens = word_tokenize(text_without_tags.lower())

    stemmed_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and len(token) >= 3
    ]
    return ' '.join(stemmed_tokens)

In [20]:
def extract_author_info(author_tag):
    """Split an author byline into a list of author records.

    The byline text is read with ``author_tag.get_text()``. A leading "by"
    (any capitalisation) is removed, and the remainder is split on commas.
    Within each comma-separated piece the first word becomes ``first_name``
    and the remaining words become ``last_name``.

    Only a *leading* "by" is stripped: a plain substring replacement would miss
    "By ..." and would corrupt names that merely contain "by " (e.g. "Abby Smith").

    Args:
        author_tag: Object exposing ``get_text()``, or a falsy value.

    Returns:
        list[dict] | None: One dict per author with keys ``first_name``,
        ``last_name`` (``None`` when there is only one word) and ``email``
        (always ``None``); ``None`` when no author could be extracted.
    """
    if not author_tag:
        return None

    byline = re.sub(r'^by\b\s*', '', author_tag.get_text().strip(), flags=re.IGNORECASE)

    authors_info = []
    for author in byline.split(','):
        parts = author.split()
        if not parts:
            # Empty piece, e.g. from a trailing comma.
            continue
        last_name = ' '.join(parts[1:])
        authors_info.append({
            "first_name": parts[0],
            "last_name": last_name or None,
            "email": None,
        })

    return authors_info or None


In [21]:
def convert_date(date_tags):
    """Parse the text of the first tag in ``date_tags`` into a ``datetime``.

    Args:
        date_tags: A list whose first element exposes a ``.text`` attribute
            holding a date formatted like ``26-FEB-1987 15:01:01.79``.

    Returns:
        datetime | None: The parsed value, or ``None`` when ``date_tags`` is
        empty, is not a list, or the text does not match the expected format
        (in which case an error line is printed).
    """
    # Guard clause: anything that is not a non-empty list yields no date.
    if not date_tags or not isinstance(date_tags, list):
        return None

    date_str = date_tags[0].text.strip()

    try:
        return datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S.%f")
    except ValueError:
        print(f"Error: Unable to parse date string '{date_str}'")
        return None


In [76]:
# Memo of place name -> (latitude, longitude), or None when the geocoder had no
# match. The same few place names recur across many articles, so each name is
# sent to the geocoding service at most once per kernel session.
_GEOCODE_CACHE = {}


def _geocode_place(place_name):
    """Return ``(latitude, longitude)`` for ``place_name`` or ``None``; results are memoised."""
    if place_name in _GEOCODE_CACHE:
        return _GEOCODE_CACHE[place_name]

    # Imported here so the block does not depend on an extra top-of-file import.
    from geopy.exc import GeopyError

    try:
        location = geolocator.geocode(place_name)
    except GeopyError as exc:
        # A failed lookup should not abort the whole load; the failure is not
        # cached, so the name is retried the next time it appears.
        print(f"Geocoding failed for '{place_name}': {exc}")
        return None

    point = (location.latitude, location.longitude) if location else None
    _GEOCODE_CACHE[place_name] = point
    return point


def process_reuters_tags(soup):
    """Build one document per ``<reuters>`` element in ``soup`` and bulk-index them.

    For every element the function collects topics, title, authors, date, the
    cleaned content, the geocodable place names ("Georeferences"), the mean of
    their coordinates ("Coordinates") and the DATE entities found by spaCy
    ("TemporalExpressions"). All documents are then sent to ``index_name`` in a
    single bulk request.

    Args:
        soup: Parsed document exposing ``find_all('reuters')``.

    Returns:
        None. Per-document indexing errors reported by the bulk helper are printed.
    """
    documents = []
    for reuters_tag in soup.find_all('reuters'):
        # Look each child up once instead of once for the test and once for the value.
        date_tag = reuters_tag.find('date')
        topics_tag = reuters_tag.find('topics')
        places_tag = reuters_tag.find('places')
        title_tag = reuters_tag.find('title')
        author_tag = reuters_tag.find('author')
        text_tag = reuters_tag.find('text')

        date = convert_date([date_tag]) if date_tag is not None else None
        topics = (
            [topic.get_text() for topic in topics_tag.find_all('d')]
            if topics_tag is not None else None
        )
        places = (
            [place.get_text() for place in places_tag.find_all('d')]
            if places_tag is not None else None
        )
        title = title_tag.get_text() if title_tag is not None else None
        author_info = extract_author_info(author_tag) if author_tag is not None else None

        raw_text = text_tag.get_text() if text_tag is not None else None
        content = clean_content(raw_text) if raw_text else None

        georeferences = []
        coordinates = []
        for place_name in places or []:
            point = _geocode_place(place_name)
            if point is not None:
                georeferences.append({"name": place_name})
                coordinates.append(point)

        # Plain floats keep the document serialisable without relying on
        # numpy-aware JSON encoding.
        average_location = {
            "lat": float(np.mean([lat for lat, _ in coordinates])),
            "lon": float(np.mean([lon for _, lon in coordinates])),
        } if coordinates else None

        # Run entity recognition on the original text: the cleaned content is
        # lowercased, stemmed and stripped of stop words, which is not the kind
        # of input the statistical model is meant to analyse.
        temporal_expressions = []
        if raw_text:
            temporal_expressions = [
                {"DATE": ent.text}
                for ent in nlp(raw_text).ents
                if ent.label_ == 'DATE'
            ]

        documents.append({
            "Topics": topics,
            "Title": title,
            "Content": content,
            "Authors": author_info,
            "Date": date,
            "Georeferences": georeferences,
            "Coordinates": average_location,
            "TemporalExpressions": temporal_expressions,
        })

    bulk_data = [
        {"_index": index_name, "_source": document}
        for document in documents
    ]

    # raise_on_error=False makes the helper return per-document failures instead
    # of raising on the first one, so the report below can actually fire.
    success, failed = bulk(es, bulk_data, raise_on_error=False)

    if failed:
        print(f"Failed indexing: {failed}")

In [78]:
# Walk the archive directory and index every SGML file found in it.
for filename in os.listdir('./archive'):
    # Anything that is not a .sgm file is skipped.
    if not filename.endswith('.sgm'):
        continue

    file_path = os.path.join('./archive', filename)

    # Bytes that are not valid UTF-8 are dropped rather than raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        sgm_content = file.read()

    soup = BeautifulSoup(sgm_content, 'html.parser')
    process_reuters_tags(soup)

Successfully indexed: 8
Failed indexing: []
Successfully indexed: 1000
Failed indexing: []
Successfully indexed: 1000
Failed indexing: []


KeyboardInterrupt: 

In [66]:
def suggest_titles(query):
    """Run a ``match_phrase`` search for ``query`` against the "Title" field.

    Args:
        query: Phrase to look for in article titles.

    Returns:
        The raw search response returned by the Elasticsearch client.
    """
    phrase_query = {"query": {"match_phrase": {"Title": query}}}
    return es.search(index=index_name, body=phrase_query)


In [67]:
query = "BAHIA COCOA "
suggestions = suggest_titles(query)

# Test the list of hits rather than the response object: a response with zero
# matches still has a non-empty body, so checking the object itself would never
# reach the "no suggestions" branch.
if suggestions['hits']['hits']:
    print(f"Suggestions for '{query}':")

    print(suggestions['hits'])
else:
    print("No suggestions found.")


Suggestions for 'BAHIA COCOA ':
{'total': {'value': 1, 'relation': 'eq'}, 'max_score': 41.784885, 'hits': [{'_index': 'news_index', '_id': 'R3L1l4wBRFygwJUW4jiU', '_score': 41.784885, '_source': {'Topics': ['cocoa'], 'Title': 'BAHIA COCOA REVIEW', 'Content': 'bahia cocoa review salvador feb shower continu throughout week bahia cocoa zone allevi drought sinc earli januari improv prospect come temporao although normal humid level restor comissaria smith said weekli review dri period mean temporao late year arriv week end februari 155,221 bag kilo make cumul total season 5.93 mln 5.81 stage last year seem cocoa deliv earlier consign includ arriv figur comissaria smith said still doubt much old crop cocoa still avail harvest practic come end total bahia crop estim around 6.4 mln bag sale stand almost 6.2 mln hundr thousand bag still hand farmer middlemen export processor doubt much cocoa would fit export shipper experienc dificulti obtain +bahia superior+ certif view lower qualiti recent w