In [13]:
import re
import spacy

# Step 1: Load the File
def load_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

# Step 2: Preprocess the Text to Remove HTML Tags
def preprocess_text(content):
    # Remove HTML tags and normalize whitespace
    cleaned_text = re.sub(r'<.*?>', '', content)  # Remove all HTML tags
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Normalize whitespace
    return cleaned_text

# Step 3: Extract Background Section Using NLP and Date Detection
def extract_background_section(text, nlp_model):
    # List of possible titles for background sections
    possible_headings = [
        'background of the merger', 'merger timeline', 'history of the merger',
        'background to the proposed buyout', 'merger events', 'merger history', 'timeline'
    ]
    
    # Look for the start of the section with any of the potential headings (case insensitive)
    start_pattern = r'(' + '|'.join([re.escape(keyword) for keyword in possible_headings]) + r')'
    match = re.search(start_pattern, text, re.IGNORECASE)
    
    if not match:
        return "Could not find a Background section."
    
    # Extract section text starting from the match (where background starts)
    start_idx = match.end()
    section_text = text[start_idx:]

    # Define end conditions based on common next section titles
    end_keywords = ['recommendation', 'board approval', 'conclusion', 'approval of the merger', 'terms of the merger']
    
    # Look for the end of the section by matching common keywords
    for keyword in end_keywords:
        end_pattern = r'\b' + re.escape(keyword) + r'\b'
        end_match = re.search(end_pattern, section_text, re.IGNORECASE)
        if end_match:
            section_text = section_text[:end_match.start()]
            break

    return section_text.strip()

# Step 4: Extract Chronological Events (timeline-like sentences)
def extract_chronological_events(relevant_text):
    # Regex to match date patterns (simple match for dates and month/year)
    date_pattern = r'(\b\d{4}\b|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\b\s\d{1,2},?\s\d{4})'
    events = [sentence for sentence in relevant_text.split('. ') if re.search(date_pattern, sentence)]
    return events

# Step 5: Filter Events Using NLP
def filter_events_with_nlp(events, nlp_model, context_keywords):
    filtered_events = []
    for event in events:
        doc = nlp_model(event)
        if any(token.lemma_ in context_keywords for token in doc):
            filtered_events.append(event)
    return filtered_events

# Main Execution
def main():
    # Load the text file
    file_path = './sample2.txt'
    content = load_file(file_path)
    cleaned_text = preprocess_text(content)

    # Load spaCy model
    nlp = spacy.load('en_core_web_sm')

    # Extract Background Section (timeline-like data)
    background_section = extract_background_section(cleaned_text, nlp)
    if background_section == "Could not find a Background section.":
        print(background_section)
        return

    # Extract chronological events (timeline of the merger)
    events = extract_chronological_events(background_section)

    # Context keywords for NLP filtering (e.g., merger-related terms)
    context_keywords = ['merge', 'acquire', 'acquisition', 'merger']

    # Filter events with NLP
    filtered_events = filter_events_with_nlp(events, nlp, context_keywords)

    # Output filtered events (timeline)
    for event in filtered_events:
        print(event)

# Run the script
if __name__ == "__main__":
    main()


&#160; THE PROPOSED BUYOUT &#160; 38.&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;&#160;On August 12, 2008, the Company announced that it had entered into a merger agreement with CVS
Through this acquisition, CVS Caremark will acquire Longs&#8217; 521 retail drugstores in California, Hawaii, Nevada and Arizona as well as its Rx America subsidiary, which offers prescription benefits management (&#8220;PBM&#8221;) services to over 8 million members and prescription drug plan benefits to approximately 450,000 Medicare beneficiaries
Further, the acquisition complements CVS Caremark&#8217;s substantial presence in Southern California and provides &#160; - 6 - COMPLAINT FOR BREACH OF FIDUCIARY DUTIES AND AIDING AND ABETTING &#160; a foundation for significant future growth throughout the nation&#8217;s largest state
&#160; &#8220;With this acquisition, we will increase accessibility to our pharmacies for consumers and put us in an even better position to grow our new Proactive