In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import networkx as nx
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import spacy
import gensim
from gensim.models import CoherenceModel
from gensim.models.phrases import Phrases, Phraser
from bertopic import BERTopic
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer
import torch
import warnings
warnings.filterwarnings('ignore')

### 1. Data Understanding & Preparation

1. I will load and explore the dataset structure.
2. Then I will examine the articles content, sources, publication dates, and metadata.
3. I will clean the data by removing web crawl artifacts (HTML tags, etc).
4. I will filter out irrelevant articles that don't focus on AI's impact on industries.

In [2]:
import pandas as pd
df = pd.read_parquet('https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet', engine='pyarrow')
print(f"Dataset shape: {df.shape}")
df.info() 

Dataset shape: (200083, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200083 entries, 0 to 200082
Data columns (total 5 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   url       200083 non-null  object
 1   date      200083 non-null  object
 2   language  200083 non-null  object
 3   title     200083 non-null  object
 4   text      200083 non-null  object
dtypes: object(5)
memory usage: 7.6+ MB


In [3]:
df.head()  

Unnamed: 0,url,date,language,title,text
0,http://businessnewsthisweek.com/business/infog...,2023-05-20,en,Infogain AI Business Solutions Now Available i...,\n\nInfogain AI Business Solutions Now Availab...
1,https://allafrica.com/stories/202504250184.html,2025-04-25,en,Africa: AI Policies in Africa - Lessons From G...,\nAfrica: AI Policies in Africa - Lessons From...
2,https://asiatimes.com/2023/07/yang-lan-intervi...,2023-07-25,en,Yang Lan interviews academics on AI developmen...,\nYang Lan interviews academics on AI developm...
3,https://cdn.meritalk.com/articles/commerce-nom...,2025-02-04,en,Commerce Nominee Promises Increased Domestic A...,\nCommerce Nominee Promises Increased Domestic...
4,https://citylife.capetown/hmn/uncategorized/re...,2023-11-11,en,Revolutionizing the Manufacturing Industry: Th...,Revolutionizing the Manufacturing Industry:...


In [4]:
# Data Cleaning removing html, punctuation, and stop words
def clean_article(text):
    import re
    from bs4 import BeautifulSoup
    from nltk.corpus import stopwords

    # Removing HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Removing punctuation
    text = re.sub(r'[^\w\s]', '', text)

    # Removing the stop words
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])

    return text

df['cleaned_article'] = df['text'].apply(clean_article)

In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])  
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['yearmonth'] = df['date'].dt.strftime('%Y-%m')

In [6]:
# Relevance Filtering checking if the article discusses AI impact on industries or jobs
def is_relevant(text):
    keywords = ['AI', 'artificial intelligence', 'impact', 'industries', 'jobs', 'employment']
    text = text.lower()
    return any(keyword in text for keyword in keywords)

df_relevant = df[df['cleaned_article'].apply(is_relevant)]

In [7]:
# Topic Modeling
from bertopic import BERTopic
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(df_relevant['cleaned_article'].tolist())
#topic_model.visualize_topics()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [8]:
# Identifying Industries
def extract_industries(text):
    industries = ['healthcare', 'finance', 'manufacturing', 'transportation', 'education']
    text = text.lower()
    found_industries = [industry for industry in industries if industry in text]
    return found_industries

df_relevant['industries'] = df_relevant['cleaned_article'].apply(extract_industries)

In [9]:
#%pip install textblob

In [11]:
# Sentiment Analysis for AI adoption in specific industries
def analyze_sentiment(text, industry):
    from textblob import TextBlob
    sentiment = TextBlob(text).sentiment.polarity
    if industry:
        return sentiment * len(industry) 
    return sentiment
df_relevant['sentiment'] = df_relevant.apply(lambda x: analyze_sentiment(x['cleaned_article'], x['industries']), axis=1)

In [12]:
# Timeline Analysis
df_relevant['date'] = pd.to_datetime(df_relevant['date'])

# Exploding the industries column to handle lists
df_exploded = df_relevant.explode('industries')

# Grouping by date and industries to calculate mean sentiment
sentiment_over_time = df_exploded.groupby(['date', 'industries'])['sentiment'].mean().reset_index()

In [13]:
# Identifying AI technologies mentioned
def extract_technologies(text):
    technologies = ['machine learning', 'deep learning', 'natural language processing', 'computer vision']
    text = text.lower()
    found_technologies = [tech for tech in technologies if tech in text]
    return found_technologies

df_relevant['technologies'] = df_relevant['cleaned_article'].apply(extract_technologies)
