In [1]:
import re
import os
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import spacy
from elasticsearch import Elasticsearch
from datetime import datetime

In [2]:
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],
    http_auth=('emad2', 'emadmassri'))

index_name = "news_index"

  es = Elasticsearch(


In [3]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)




configurations ={
  "mappings": {
    "properties": {
      "Title": {
        "type": "text",
        "fields": {
          "autocomplete": {
            "type": "search_as_you_type"
          }
        }
      },
      "Content": {
        "type": "text",
        "analyzer": "custom_content_analyzer"
      },
      "Authors": {
        "type": "nested",
        "properties": {
          "first_name": { "type": "keyword" },
          "last_name": { "type": "keyword" },
          "email": { "type": "keyword" }
        }
      },
      "Date": {
        "type": "date"
      },
      "Geopoint": {
        "type": "geo_point"
      },
      "TemporalExpressions": {
        "type": "text"
      },
      "Georeferences": {
        "type": "text"
      }
    }
  },
  "settings": {
    "analysis": {
      "analyzer": {
        "custom_content_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["lowercase", "stemmer_filter"]
        }
      },
      "filter": {
        "stemmer_filter": {
          "type": "stemmer",
          "name": "english"
        }
      }
    }
  }
}


# create index
es.indices.create(index=index_name, ignore=400, body=configurations)

  es.indices.create(index=index_name, ignore=400, body=configurations)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'news_index'})

In [4]:
nlp = spacy.load("en_core_web_sm")
geolocator = Nominatim(user_agent="geo_app", timeout=10)

Tokenize the text by convert to lowercase and remove stop words and tokens with length < 3, and perform stemming

In [5]:
def clean_content(content):
    clean_content = re.sub(r'<[^>]+>', '', content)
    
    tokens = word_tokenize(clean_content.lower())  
    
    
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    processed_text = [stemmer.stem(word) for word in tokens if word not in stop_words and len(word) >= 3]
    processed_paragraph = ' '.join(processed_text)
    return processed_paragraph

In [6]:
def extract_author_info(author_tag):
    authors_info = []
    if author_tag:
        author_text = author_tag.get_text().strip().replace('by ', '')
        authors = author_text.split(',')
        for author in authors:
            parts = author.strip().split()
            if len(parts) >= 1:
                first_name = parts[0]
                last_name = ' '.join(parts[1:])
                author_info = {
                    "first_name": first_name.strip(),
                    "last_name": last_name.strip()if last_name else None,
                    "email": None  
                }
                authors_info.append(author_info)
    return authors_info if authors_info else None


In [7]:
def convert_date(date_tags):
    date_obj = None
    
    if date_tags and isinstance(date_tags, list): 
        date_tag = date_tags[0]
        date_str = date_tag.text.strip()

        try:
            date_obj = datetime.strptime(date_str, "%d-%b-%Y %H:%M:%S.%f")
        except ValueError:
            print(f"Error: Unable to parse date string '{date_str}'")
    
    return date_obj


In [8]:
def process_reuters_tags(soup):
    for reuters_tag in soup.find_all('reuters'):
        
        date = convert_date([reuters_tag.find('date')]) if reuters_tag.find('date') else None
        
        
        topics = [topic.get_text() for topic in reuters_tag.find('topics').find_all('d')] if reuters_tag.find('topics') else None
        
        
        places = [place.get_text() for place in reuters_tag.find('places').find_all('d')] if reuters_tag.find('places') else None
        
        
        title = reuters_tag.find('title').get_text() if reuters_tag.find('title') else None
        
        
        author_tag = reuters_tag.find('author')
        author_info = extract_author_info(author_tag) if author_tag else None
        
        
        content = reuters_tag.find('text').get_text() if reuters_tag.find('text') else None
        
        
        content = clean_content(content) if content else None
        
        georeferences = []
        coordinates = []
        
        
        for place_name in places or []:
            location = geolocator.geocode(place_name)
            if location:
                georeferences.append(place_name)
                coordinates.append({'latitude': location.latitude, 'longitude': location.longitude})
        
        temporal_expressions = []
        if content:
            doc = nlp(content)
            temporal_expressions = [ent.text for ent in doc.ents if ent.label_ == 'DATE']
        
        
        
        
        document_dict = {
            "Topics": topics,
            "Title": title,
            "Content": content,
            "Authors": author_info,
            "Date": date,
            "Georeferences": georeferences,
            "Coordinates": {"latitude": coordinates[0]["latitude"], "longitude": coordinates[0]["longitude"]},
            "TemporalExpressions": temporal_expressions,
        }

        
        es.index(index=index_name, body=document_dict)
        

In [9]:
for filename in os.listdir('./s'):
    if filename.endswith('.sgm'):
        file_path = os.path.join('./s', filename)

        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            sgm_content = file.read()

        soup = BeautifulSoup(sgm_content, 'html.parser')

        process_reuters_tags(soup)