In [2]:
import re
from bs4 import BeautifulSoup as bs
import pandas as pd

def clean_file(file_path):
    """Read an HTML news article from disk and extract its title, date and text.

    Args:
        file_path: Path of the HTML file to read (opened as UTF-8 text).

    Returns:
        A dict with the keys ``"title"``, ``"date"`` and ``"content"``, or
        ``None`` when the file cannot be read or no text could be extracted.
        Read failures are reported on stdout rather than raised.
    """
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            news_article = file.read()
    except FileNotFoundError:
        print(f"No File: {file_path}")
        return None
    except Exception as error:
        # Best-effort: any other read problem (permissions, bad encoding, ...)
        # skips the file. Print the caught error instance so the real cause
        # is visible, not the ``Exception`` class itself.
        print(f"File Error {file_path}: {error}")
        return None

    # Parse the HTML content.
    sp = bs(news_article, "html.parser")

    # Title: fall back when the tag is missing or contains no text.
    title_tag = sp.find("title")
    title = (title_tag.get_text(strip=True) if title_tag else "") or "No Title"

    # Publication date: look at <meta> first, then <time>.
    date = None
    for date_tag in ("meta", "time"):
        date_element = (
            sp.find(date_tag, {"name": "pubdate"})
            or sp.find(date_tag, {"property": "article:published_time"})
        )
        if date_element:
            # <meta> stores the value in ``content``; <time> elements store
            # it in ``datetime``, so accept either attribute.
            date = (
                date_element.get("content")
                or date_element.get("datetime")
                or ""
            ).strip()
            break
    date = date or "No Date Found"

    # Locate the main content container: the first <div> whose class matches
    # one of the known article wrappers, otherwise the whole document.
    main_content = None
    for container_class in (
        "article-section",
        "main-content",
        "article-body",
        "content",
        "post-content",
        "Page-content",
        "col-xs-12 tjpcontainer",
    ):
        main_content = sp.find("div", {"class": container_class})
        if main_content:
            break
    if not main_content:
        main_content = sp

    # Remove page chrome and non-text elements before extracting text.
    for unwanted in main_content.find_all(
        ["header", "footer", "nav", "aside", "button", "script", "style", "svg"]
    ):
        unwanted.decompose()

    # Class names that mark a tag as advertising / navigation / footer noise.
    excluded_classes = [
        "ad", "sponsored", "footer", "Page-footer-bottom", "header", "nav",
        "sidebar", "copyright", "feedback", "col-xs-12 footer",
    ]

    # Collect the text of every <p> and <div> that is not flagged as noise.
    # NOTE(review): a <div> that wraps <p> tags contributes its full text and
    # the nested <p> tags contribute theirs again, so the content may contain
    # repeated passages - confirm whether downstream code relies on this.
    article_body = []
    for tag in main_content.find_all(["p", "div"]):
        tag_classes = tag.attrs.get("class", [])
        if any(cls in tag_classes for cls in excluded_classes):
            continue
        text = tag.get_text(strip=True)
        if text:
            article_body.append(text)

    # Combine paragraphs into the cleaned content.
    cleaned_content = "\n".join(article_body)

    # An article without any extractable text is treated as a failure.
    if not cleaned_content.strip():
        return None

    return {"title": title, "date": date, "content": cleaned_content}

def process_files(file_paths):
    """Clean every file in ``file_paths`` and keep the successful results.

    Args:
        file_paths: Iterable of paths handed to ``clean_file`` one by one,
            in order.

    Returns:
        A list with the truthy results of ``clean_file``; files for which it
        returned a falsy value (e.g. ``None``) are skipped.
    """
    # Lazily clean each file in order, then drop the failures.
    parsed = (clean_file(path) for path in file_paths)
    return [article for article in parsed if article]

def match_keywords(content, keywords):
    """Return the first keyword found in ``content`` as a whole word.

    Matching is case-insensitive; each keyword is escaped so characters such
    as ``.`` are taken literally, and it must sit between word boundaries.

    Args:
        content: Text to search.
        keywords: Iterable of keyword strings, checked in order.

    Returns:
        The first matching keyword, or ``None`` when nothing matches.
    """
    def occurs(word):
        # Whole-word, case-insensitive search for the literal keyword.
        pattern = rf'\b{re.escape(word)}\b'
        return re.search(pattern, content, re.IGNORECASE) is not None

    return next((word for word in keywords if occurs(word)), None)

def group_articles_by_country(cleaned_articles, country_mapping):
    """Assign each article to the first country whose keywords it mentions.

    Matching is a case-insensitive substring test against the article's
    ``"content"``. Countries are tried in the iteration order of
    ``country_mapping`` and only the first match wins.

    Args:
        cleaned_articles: Iterable of article dicts (as produced by
            ``clean_file``); only the ``"content"`` key is read.
        country_mapping: Dict mapping a country name to a list of keywords.

    Returns:
        A dict mapping every country in ``country_mapping`` to its list of
        articles. An ``"Unknown"`` key is added only when at least one
        article matched no country; each such article appears there once.
    """
    grouped_articles = {country: [] for country in country_mapping}

    for article in cleaned_articles:
        # Tolerate a missing or ``None`` content field.
        article_content = (article.get("content") or "").lower()

        for country, keywords in country_mapping.items():
            if any(keyword.lower() in article_content for keyword in keywords):
                grouped_articles[country].append(article)
                break
        else:
            # for/else: runs only when no country matched at all. An ``else``
            # on the inner ``if`` would file the article under "Unknown" once
            # for every country that failed before the match.
            grouped_articles.setdefault("Unknown", []).append(article)

    return grouped_articles

def categorize_articles_by_type(grouped_articles, category_keywords):
    """Split each country's articles into political / competitor / uncategorized.

    An article goes to ``"political"`` when ``match_keywords`` finds one of
    ``category_keywords["political"]`` in its lower-cased content; otherwise
    to ``"competitor"`` when one of ``category_keywords["competitor"]`` is
    found; otherwise to ``"uncategorized"``.

    Args:
        grouped_articles: Dict mapping a country name to a list of article
            dicts; only the ``"content"`` key is read.
        category_keywords: Dict with ``"political"`` and ``"competitor"``
            keyword lists.

    Returns:
        A dict mapping each country to a dict with the three category lists.
    """
    result = {}

    for country, articles in grouped_articles.items():
        buckets = {"political": [], "competitor": [], "uncategorized": []}
        result[country] = buckets

        for article in articles:
            text = article.get("content", "").lower()

            # Political keywords take precedence over competitor keywords.
            if match_keywords(text, category_keywords["political"]):
                bucket_name = "political"
            elif match_keywords(text, category_keywords["competitor"]):
                bucket_name = "competitor"
            else:
                bucket_name = "uncategorized"

            buckets[bucket_name].append(article)

    return result


def _summarize_keyword_scores(article, keyword_scores):
    """Find the scored keywords present in one article and average their scores.

    Args:
        article: Article dict; the ``"title"`` and ``"content"`` keys are read.
        keyword_scores: Dict mapping a keyword to a dict of scores with the
            keys ``relevancy``, ``sentiment``, ``geographical``, ``frequency``
            (missing keys count as 0).

    Returns:
        A tuple ``(title, found_keywords, avg_scores, impact_score)``.
        ``avg_scores`` holds the per-field mean over the found keywords,
        rounded to two decimals; ``impact_score`` is the rounded mean of the
        averaged frequency, geographical and relevancy scores. Both are zero
        when no keyword was found.
    """
    title = article.get("title", "No Title")
    content = article.get("content", "")

    # Title and content are searched together, case-insensitively.
    combined_text = (title + " " + content).lower()

    found_keywords = []
    total_scores = {'relevancy': 0, 'sentiment': 0, 'geographical': 0, 'frequency': 0}

    for keyword, scores in keyword_scores.items():
        # '\b' requires whole-word matches; re.escape() makes characters such
        # as '.' in "booking.com" literal.
        if re.search(rf'\b{re.escape(keyword)}\b', combined_text, re.IGNORECASE):
            found_keywords.append(keyword)
            for field in total_scores:
                total_scores[field] += scores.get(field, 0)

    count = len(found_keywords)
    if count > 0:
        avg_scores = {x: round(y / count, 2) for x, y in total_scores.items()}
        # The impact score deliberately uses the already-rounded averages so
        # the printed numbers stay reproducible from the printed averages.
        impact_score = round(
            (avg_scores["frequency"] + avg_scores["geographical"] + avg_scores["relevancy"]) / 3,
            2,
        )
    else:
        avg_scores = dict(total_scores)  # Nothing found: every score stays 0.
        impact_score = 0

    return title, found_keywords, avg_scores, impact_score


def _print_category_analysis(categorized_articles, keyword_scores, category, label):
    """Print the keyword analysis of every article in one category, per country."""
    for country, categories in categorized_articles.items():
        print(f"\nCountry: {country} - {label} Articles")

        for article in categories.get(category, []):
            title, found_keywords, avg_scores, impact_score = _summarize_keyword_scores(
                article, keyword_scores
            )

            print("\n--- Article ---")
            print(f"Title: {title}")
            print(f"Found Keywords: {', '.join(found_keywords) if found_keywords else 'None'}")
            print(f"Average Scores: {avg_scores}, Impact Score: {impact_score}")
            print("-------------------")


def analyze_political_articles(categorized_articles, political_keywords):
    """Print a keyword/score report for the political articles of each country.

    Args:
        categorized_articles: Dict mapping a country to its category dict
            (as returned by ``categorize_articles_by_type``); the
            ``"political"`` list is read and may be absent.
        political_keywords: Dict mapping a keyword to its score dict.

    Returns:
        None. The report is written to stdout.
    """
    _print_category_analysis(categorized_articles, political_keywords, "political", "Political")


def analyze_competitor_articles(categorized_articles, competitor_keywords):
    """Print a keyword/score report for the competitor articles of each country.

    Args:
        categorized_articles: Dict mapping a country to its category dict
            (as returned by ``categorize_articles_by_type``); the
            ``"competitor"`` list is read and may be absent.
        competitor_keywords: Dict mapping a keyword to its score dict.

    Returns:
        None. The report is written to stdout.
    """
    _print_category_analysis(categorized_articles, competitor_keywords, "competitor", "Competitor")

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Score table for competitor news: keyword -> scores added up per article.
competitor_keywords = {
    "merger":       {"relevancy": 3, "sentiment": -1, "geographical": 5, "frequency": 1},
    "acquisition":  {"relevancy": 4, "sentiment": -3, "geographical": 5, "frequency": 1},
    "market share": {"relevancy": 5, "sentiment": -4, "geographical": 5, "frequency": 1},
    "funding":      {"relevancy": 4, "sentiment": -5, "geographical": 5, "frequency": 1},
    "agoda":        {"relevancy": 4, "sentiment": -4, "geographical": 5, "frequency": 2},
    "traveloka":    {"relevancy": 4, "sentiment": -4, "geographical": 5, "frequency": 1},
    "booking.com":  {"relevancy": 5, "sentiment": -5, "geographical": 5, "frequency": 2},
}

# Score table for political news, same layout as the competitor table.
political_keywords = {
    "election":     {"relevancy": 3, "sentiment": 3, "geographical": 5, "frequency": 1},
    "policy":       {"relevancy": 4, "sentiment": 4, "geographical": 4, "frequency": 3},
    "visa":         {"relevancy": 5, "sentiment": -4, "geographical": 5, "frequency": 2},
    "regulations":  {"relevancy": 4, "sentiment": -3, "geographical": 4, "frequency": 1},
    "restrictions": {"relevancy": 4, "sentiment": -3, "geographical": 5, "frequency": 1},
}

# Saved HTML pages to analyse (five political, five competitor articles).
file_paths = [
    "/Users/erickxu/Desktop/Indo_pnews1.html",
    "/Users/erickxu/Desktop/Indo_pnews2.html",
    "/Users/erickxu/Desktop/Indo_pnews3.html",
    "/Users/erickxu/Desktop/Indo_pnews4.html",
    "/Users/erickxu/Desktop/Indo_pnews5.html",
    "/Users/erickxu/Desktop/Indo_cnews1.html",
    "/Users/erickxu/Desktop/Indo_cnews2.html",
    "/Users/erickxu/Desktop/Indo_cnews3.html",
    "/Users/erickxu/Desktop/Indo_cnews4.html",
    "/Users/erickxu/Desktop/Indo_cnews5.html",
]

# Place names used to decide which country an article belongs to.
country_mapping = {
    "Indonesia": ["jakarta", "indonesia", "java", "sumatra"],
    "Malaysia": ["kuala lumpur", "malaysia", "sabah", "sarawak"],
    "Singapore": ["singapore"],
}

# Keywords that sort an article into the political or competitor category.
category_keywords = {
    "political": [
        "election", "government", "policy", "minister", "parliament", "law",
    ],
    "competitor": [
        "brand", "merger", "market", "launched", "market share", "funding",
        "acquired", "business", "quarter", "competitor", "rival", "company",
        "startup", "deal", "tiket.com", "traveloka", "booking.com", "agoda",
        "trip.com", "acquisition",
    ],
}

# ---------------------------------------------------------------------------
# Pipeline: clean -> group by country -> categorize -> report
# ---------------------------------------------------------------------------

cleaned_articles = process_files(file_paths)
grouped_articles = group_articles_by_country(cleaned_articles, country_mapping)
categorized_articles = categorize_articles_by_type(grouped_articles, category_keywords)

# Print the per-country reports, political first and competitor second.
analyze_political_articles(categorized_articles, political_keywords)
analyze_competitor_articles(categorized_articles, competitor_keywords)


Country: Indonesia - Political Articles

--- Article ---
Title: Indonesia's Social Media E-Commerce Ban
Found Keywords: regulations
Average Scores: {'relevancy': 4.0, 'sentiment': -3.0, 'geographical': 4.0, 'frequency': 1.0}, Impact Score: 3.0
-------------------

--- Article ---
Title: Indonesia Visa Requirements 2024 - Secret Retreats
Found Keywords: visa
Average Scores: {'relevancy': 5.0, 'sentiment': -4.0, 'geographical': 5.0, 'frequency': 2.0}, Impact Score: 4.0
-------------------

--- Article ---
Title: Indonesia’s New E-commerce Regulations Take a Bite Out of TikTok’s Market Share
Found Keywords: election, policy, regulations
Average Scores: {'relevancy': 3.67, 'sentiment': 1.33, 'geographical': 4.33, 'frequency': 1.67}, Impact Score: 3.22
-------------------

--- Article ---
Title: Amendment to Indonesian Visa Laws: | Bagus Enrico & Partners
Found Keywords: policy, visa, regulations
Average Scores: {'relevancy': 4.33, 'sentiment': -1.0, 'geographical': 4.33, 'frequency': 2.0}