In [1]:
from pathlib import Path
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from bs4 import BeautifulSoup
from geopy.geocoders import Nominatim
import spacy
from elasticsearch import Elasticsearch
from datetime import datetime

In [2]:
es = Elasticsearch(
    [{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],
    http_auth=('Abdullah', 'Abdullah')
)

index_name = "reuters_news_index"

  es = Elasticsearch(


In [5]:
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

configurations = {
    "mappings": {
        "dynamic_templates": [
            {
                "dates_template": {
                    "match": "temporalExpressions",
                    "mapping": {
                        "type": "date"
                    }
                }
            },
            {
                "geopoints_template": {
                    "match": "georeferences",
                    "mapping": {
                        "type": "geo_point"
                    }
                }
            }
        ],
        "properties": {
            "Title": {
                "type": "text",
                "fields": {
                    "keyword": {
                        "type": "keyword",
                        "ignore_above": 256
                    }
                },
                "analyzer": "autocomplete_analyzer",
                "boost": 2
            },
            "Content": {
                "type": "text",
                "analyzer": "custom_content_analyzer",
                "boost": 1
            },
            "Authors": {
                "type": "nested",
                "properties": {
                    "first_name": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "last_name": {
                        "type": "text",
                        "analyzer": "standard"
                    },
                    "email": {
                        "type": "keyword"
                    }
                }
            }
        }
    },
    "settings": {
        "analysis": {
            "analyzer": {
                "autocomplete_analyzer": {
                    "tokenizer": "autocomplete_tokenizer",
                    "filter": ["lowercase"]
                },
                "custom_content_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "stop", "custom_length_filter", "snowball"]
                }
            },
            "tokenizer": {
                "autocomplete_tokenizer": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 10,
                    "token_chars": ["letter", "digit"]
                }
            },
            "filter": {
                "custom_length_filter": {
                    "type": "length",
                    "min": 3
                }
            }
        }
    }
}

es.indices.create(index=index_name, ignore=400, body=configurations)


  es.indices.create(index=index_name, ignore=400, body=configurations)


ObjectApiResponse({'error': {'root_cause': [{'type': 'mapper_parsing_exception', 'reason': 'Unknown parameter [boost] on mapper [Content]'}], 'type': 'mapper_parsing_exception', 'reason': 'Failed to parse mapping: Unknown parameter [boost] on mapper [Content]', 'caused_by': {'type': 'mapper_parsing_exception', 'reason': 'Unknown parameter [boost] on mapper [Content]'}}, 'status': 400})

In [6]:
nlp = spacy.load("en_core_web_sm")
geolocator = Nominatim(user_agent="geo_app", timeout=10)

In [7]:
def clean_content(content):
    clean_content = re.sub(r'<[^>]+>', '', content)
    tokens = word_tokenize(clean_content.lower())

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    
    processed_text = ' '.join(stemmer.stem(word) for word in tokens if word not in stop_words and len(word) >= 3)
    return processed_text

In [8]:
def extract_author_info(author_tag):
    if not author_tag:
        return None

    author_text = author_tag.get_text().strip().replace('by ', '')
    authors = [author.strip().split() for author in author_text.split(',') if author.strip()]

    authors_info = [
        {
            "first_name": parts[0].strip(),
            "last_name": ' '.join(parts[1:]).strip() if len(parts) > 1 else None,
            "email": None
        }
        for parts in authors
    ]

    return authors_info if authors_info else None

In [9]:
def convert_date(date_tags):
    if date_tags and isinstance(date_tags, list):
        date_string = date_tags[0].text.strip()

        try:
            return datetime.strptime(date_string, "%d-%b-%Y %H:%M:%S.%f")
        except ValueError as e:
            print(f"Error: Unable to parse date string '{date_string}': {e}")

    return None

In [10]:
def process_reuters_tags(soup):
    for reuters_tag in soup.find_all('reuters'):
        date = convert_date([reuters_tag.find('date')]) if reuters_tag.find('date') else None
        topics = [topic.get_text() for topic in reuters_tag.find('topics').find_all('d')] if reuters_tag.find('topics') else None
        places = [place.get_text() for place in reuters_tag.find('places').find_all('d')] if reuters_tag.find('places') else None
        title = reuters_tag.find('title').get_text() if reuters_tag.find('title') else None
        author_tag = reuters_tag.find('author')
        author_info = extract_author_info(author_tag) if author_tag else None
        content = reuters_tag.find('text').get_text() if reuters_tag.find('text') else None
        content = clean_content(content) if content else None

        georeferences = []
        coordinates = []
        for place_name in places or []:
            location = geolocator.geocode(place_name)
            if location:
                georeferences.append(place_name)
                coordinates.append({'latitude': location.latitude, 'longitude': location.longitude})

        temporal_expressions = []
        if content:
            doc = nlp(content)
            temporal_expressions = [ent.text for ent in doc.ents if ent.label_ == 'DATE']

        document_dict = {
            "Title": title,
            "Content": content,
            "Authors": author_info,
            "Date": date,
            "Georeferences": georeferences,
            "Coordinates": {"latitude": coordinates[0]["latitude"], "longitude": coordinates[0]["longitude"]} if coordinates else None,
            "TemporalExpressions": temporal_expressions,
        }

        es.index(index=index_name, body=document_dict)

In [11]:
directory_path = Path('./Sample_Data')

for file_path in directory_path.glob('*.sgm'):
    with file_path.open('r', encoding='utf-8', errors='ignore') as file:
        sgm_content = file.read()

    soup = BeautifulSoup(sgm_content, 'html.parser')
    process_reuters_tags(soup)