In [1]:
# Import the dependencies
import re 
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
# Set the maximum column width to 200 so long news summaries are not cut off early when displayed.
# Use the full option name: pandas resolves abbreviated names by pattern matching,
# which can break if a future pandas option has a similar name.
pd.set_option('display.max_colwidth', 200)

In [2]:
# Read the bbc_news_articles.csv file into a DataFrame.
# The path is relative, so the notebook must be run from the directory that contains Resources/.
news_articles_df = pd.read_csv('Resources/bbc_news_articles.csv')
# Display the first 20 rows.
news_articles_df.head(20)

Unnamed: 0,news_summary
0,"According to the Financial Times, chief operating officer Peter Chernin said that News Corp is ""kicking the tires of pretty much all video games companies"".Video games are ""big business"", the pape..."
1,Asahi is predicting profits to rise 50% in 2005 as it launches a drink based on soybean peptides rather than malt.Japanese brewers are increasingly making money from beer-flavoured drinks rather t...
2,"It could cost £80m to run a UK referendum on the European constitution, ministers have revealed.Mr Leslie said the cost could not be compared with the only previous British referendum, held 30 yea..."
3,"Yukos has said a US bankruptcy court will decide whether to block Russia's impending auction of its main production arm on Thursday.Filing for bankruptcy protection in the US was ""a last resort to..."
4,"Wasps scrum-half Matt Dawson has been recalled to England's training squad ahead of the RBS Six Nations and been reinstated in the Elite Player Squad.Balshaw, Cohen, Cueto, Lewsey, Robinson, Simps..."
5,"Murder sentences should not be reduced automatically simply because of a guilty plea, says a new MPs' report.MPs criticised Home Secretary David Blunkett last year for introducing last-minute rule..."
6,"Labour MP Diane Abbot, who backs Mr Phillips' proposal of shortlists, said she had been elected along with three other ethnic minority MPs - Keith Vaz, Paul Boateng and Bernie Grant - in 1987 but ..."
7,"Flanker Colin Charvis is unlikely to play any part in Wales' final two games of the Six Nations.""He will not figure in the Scotland game and is now thought unlikely to be ready for the final game,..."
8,"Therefore, it may be preferable for parents to contribute to the Child Trust Fund which is tax free, with any gifts from relatives that take the total above the annual £1,200 limit being directed ..."
9,"S Best (Ulster), S Byrne (Leinster), R Corrigan (Leinster), L Cullen (Leinster), S Easterby (Llanelli), A Foley (Munster), J Hayes (Munster), M Horan (Munster), B Jackman (Connacht), D Leamy (Muns..."


## Preprocessing

In [3]:
# Check for null values: info() lists the non-null count of every column next to the total number of entries.
news_articles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 1 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   news_summary  2225 non-null   object
dtypes: object(1)
memory usage: 17.5+ KB


In [4]:
# Keep only ASCII letters and whitespace in the news_summary column; digits, punctuation
# and other symbols are deleted. The column is overwritten with the cleaned text.
# NOTE(review): characters are deleted rather than replaced by a space, so words that were
# separated only by punctuation are joined together (e.g. "companies".Video -> companiesVideo).
news_articles_df['news_summary'] = news_articles_df['news_summary'].map(
    lambda summary: re.sub(
        r'[^a-zA-Z\s ]',
        '',
        str(summary),
    )
)
news_articles_df.head(10)

Unnamed: 0,news_summary
0,According to the Financial Times chief operating officer Peter Chernin said that News Corp is kicking the tires of pretty much all video games companiesVideo games are big business the paper quote...
1,Asahi is predicting profits to rise in as it launches a drink based on soybean peptides rather than maltJapanese brewers are increasingly making money from beerflavoured drinks rather than beer ...
2,It could cost m to run a UK referendum on the European constitution ministers have revealedMr Leslie said the cost could not be compared with the only previous British referendum held years agoIn...
3,Yukos has said a US bankruptcy court will decide whether to block Russias impending auction of its main production arm on ThursdayFiling for bankruptcy protection in the US was a last resort to pr...
4,Wasps scrumhalf Matt Dawson has been recalled to Englands training squad ahead of the RBS Six Nations and been reinstated in the Elite Player SquadBalshaw Cohen Cueto Lewsey Robinson SimpsonDaniel...
5,Murder sentences should not be reduced automatically simply because of a guilty plea says a new MPs reportMPs criticised Home Secretary David Blunkett last year for introducing lastminute rules al...
6,Labour MP Diane Abbot who backs Mr Phillips proposal of shortlists said she had been elected along with three other ethnic minority MPs Keith Vaz Paul Boateng and Bernie Grant in but it took an...
7,Flanker Colin Charvis is unlikely to play any part in Wales final two games of the Six NationsHe will not figure in the Scotland game and is now thought unlikely to be ready for the final game sai...
8,Therefore it may be preferable for parents to contribute to the Child Trust Fund which is tax free with any gifts from relatives that take the total above the annual limit being directed to a dep...
9,S Best Ulster S Byrne Leinster R Corrigan Leinster L Cullen Leinster S Easterby Llanelli A Foley Munster J Hayes Munster M Horan Munster B Jackman Connacht D Leamy Munster E Miller Leinster R McCo...


## Process the Text to Tokens and Counts.

In [5]:
# Create an instance of CountVectorizer that uses the built-in "english" stop-word list.
# max_df=0.95 ignores terms that appear in more than 95% of the documents;
# min_df=5 ignores terms that appear in fewer than 5 documents.
cv = CountVectorizer(max_df=0.95, min_df=5, stop_words='english')
# Display the configured vectorizer.
cv

In [6]:
# Learn the vocabulary from the news_summary column and build the document-term matrix (DTM).
dtm = cv.fit_transform(news_articles_df['news_summary'])
# Get the shape of the DTM: (number_of_documents, vocabulary_size).
print(dtm.shape)

(2225, 5172)


## LDA

In [7]:
# Create an instance of the LatentDirichletAllocation() class with 5 topics.
# random_state is fixed so that the fitted topics are reproducible between runs.
LDA = LatentDirichletAllocation(n_components=5,random_state=42)
# Fit the model with our DTM data.
# NOTE: fit() returns the fitted estimator itself, so LDA_data refers to the same object as LDA.
LDA_data = LDA.fit(dtm)

In [8]:
# Check the length of the vocabulary; it should equal the number of columns in the DTM.
len(cv.get_feature_names_out())

5172

## Get the Top 15 Words Per Topic

In [9]:
# Print the top 15 words for each topic.
# Fetch the vocabulary once up front: every call to get_feature_names_out() builds a new
# array of the whole vocabulary, so calling it once per printed word repeats that work.
feature_names = cv.get_feature_names_out()
for index, topic in enumerate(LDA.components_):
    print(f'The Top 15 Words For Topic #{index+1}')
    # argsort() sorts ascending, so the last 15 indices are the highest-weighted words;
    # within the printed list the most heavily weighted word comes last.
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

The Top 15 Words For Topic #1
['actress', 'years', 'told', 'time', 'star', 'actor', 'new', 'game', 'director', 'games', 'lord', 'mr', 'best', 'film', 'said']


The Top 15 Words For Topic #2
['wales', 'cup', 'play', 'years', 'number', 'second', 'won', 'england', 'new', 'win', 'music', 'world', 'game', 'best', 'said']


The Top 15 Words For Topic #3
['foreign', 'mr', 'new', 'uk', 'oil', 'economic', 'company', 'sales', 'economy', 'market', 'bank', 'growth', 'year', 'bn', 'said']


The Top 15 Words For Topic #4
['says', 'tax', 'howard', 'told', 'minister', 'new', 'brown', 'party', 'election', 'labour', 'blair', 'government', 'people', 'mr', 'said']


The Top 15 Words For Topic #5
['users', 'million', 'net', 'make', 'video', 'use', 'phone', 'digital', 'tv', 'music', 'technology', 'new', 'mobile', 'people', 'said']




### **Question:** What is the label for each topic? 
---
- TOPIC 1: Entertainment
- TOPIC 2: Sports
- TOPIC 3: Business
- TOPIC 4: Politics
- TOPIC 5: Technology

## Assign the Topics and Labels to the News Summaries

In [10]:
# Transform our DTM to get an array of shape (number_of_documents, number_of_topics)
# that holds each document's topic distribution.
topic_results = LDA.transform(dtm)

# Get the shape of the topic results.
topic_results.shape

(2225, 5)

In [11]:
# Read the bbc_news_articles.csv file into a DataFrame again.
# The news_summary column was overwritten with the cleaned text during preprocessing,
# so reloading restores the original text for display. The rows of topic_results
# line up with this DataFrame only because the same file is read in the same row order.
news_articles_df = pd.read_csv('Resources/bbc_news_articles.csv')
# Display the DataFrame.
news_articles_df.head()

Unnamed: 0,news_summary
0,"According to the Financial Times, chief operating officer Peter Chernin said that News Corp is ""kicking the tires of pretty much all video games companies"".Video games are ""big business"", the pape..."
1,Asahi is predicting profits to rise 50% in 2005 as it launches a drink based on soybean peptides rather than malt.Japanese brewers are increasingly making money from beer-flavoured drinks rather t...
2,"It could cost £80m to run a UK referendum on the European constitution, ministers have revealed.Mr Leslie said the cost could not be compared with the only previous British referendum, held 30 yea..."
3,"Yukos has said a US bankruptcy court will decide whether to block Russia's impending auction of its main production arm on Thursday.Filing for bankruptcy protection in the US was ""a last resort to..."
4,"Wasps scrum-half Matt Dawson has been recalled to England's training squad ahead of the RBS Six Nations and been reinstated in the Elite Player Squad.Balshaw, Cohen, Cueto, Lewsey, Robinson, Simps..."


In [12]:
# Map each topic number (starting at 1) to its label; the add_topic_labels function
# uses this dictionary to attach a topic label to each news summary.
topic_labels = dict(enumerate(
    ['Entertainment', 'Sports', 'Business', 'Politics', 'Technology'],
    start=1,
))

# Helper that writes the dominant topic number and its label onto the DataFrame.
def add_topic_labels(df, topic_results, topic_labels):
    """Add 'topic' and 'topic_label' columns to ``df`` in place.

    Args:
        df: DataFrame with one row per document; it is modified in place.
        topic_results: Array with one row per document and one column per topic;
            the column holding the largest value in a row is that row's dominant topic.
        topic_labels: Dictionary mapping the 1-based topic number to its label.

    Returns:
        None. The new columns are written directly onto ``df``.
    """
    # argmax gives a 0-based column position; shift by one to get topic numbers starting at 1.
    dominant_topic = topic_results.argmax(axis=1) + 1
    df['topic'] = dominant_topic
    # Look up the label for each topic number.
    df['topic_label'] = df['topic'].map(topic_labels)


In [13]:
# Call the function to add topic labels to your DataFrame.
# The function adds the 'topic' and 'topic_label' columns to news_articles_df in place and returns nothing.
add_topic_labels(news_articles_df, topic_results, topic_labels)

In [14]:
# Display the first 20 rows of the updated DataFrame, which now includes the topic and topic_label columns.
news_articles_df.head(20)

Unnamed: 0,news_summary,topic,topic_label
0,"According to the Financial Times, chief operating officer Peter Chernin said that News Corp is ""kicking the tires of pretty much all video games companies"".Video games are ""big business"", the pape...",5,Technology
1,Asahi is predicting profits to rise 50% in 2005 as it launches a drink based on soybean peptides rather than malt.Japanese brewers are increasingly making money from beer-flavoured drinks rather t...,3,Business
2,"It could cost £80m to run a UK referendum on the European constitution, ministers have revealed.Mr Leslie said the cost could not be compared with the only previous British referendum, held 30 yea...",3,Business
3,"Yukos has said a US bankruptcy court will decide whether to block Russia's impending auction of its main production arm on Thursday.Filing for bankruptcy protection in the US was ""a last resort to...",4,Politics
4,"Wasps scrum-half Matt Dawson has been recalled to England's training squad ahead of the RBS Six Nations and been reinstated in the Elite Player Squad.Balshaw, Cohen, Cueto, Lewsey, Robinson, Simps...",2,Sports
5,"Murder sentences should not be reduced automatically simply because of a guilty plea, says a new MPs' report.MPs criticised Home Secretary David Blunkett last year for introducing last-minute rule...",4,Politics
6,"Labour MP Diane Abbot, who backs Mr Phillips' proposal of shortlists, said she had been elected along with three other ethnic minority MPs - Keith Vaz, Paul Boateng and Bernie Grant - in 1987 but ...",4,Politics
7,"Flanker Colin Charvis is unlikely to play any part in Wales' final two games of the Six Nations.""He will not figure in the Scotland game and is now thought unlikely to be ready for the final game,...",2,Sports
8,"Therefore, it may be preferable for parents to contribute to the Child Trust Fund which is tax free, with any gifts from relatives that take the total above the annual £1,200 limit being directed ...",3,Business
9,"S Best (Ulster), S Byrne (Leinster), R Corrigan (Leinster), L Cullen (Leinster), S Easterby (Llanelli), A Foley (Munster), J Hayes (Munster), M Horan (Munster), B Jackman (Connacht), D Leamy (Muns...",2,Sports


In [15]:
# Display the last 10 rows of the updated DataFrame.
news_articles_df.tail(10)

Unnamed: 0,news_summary,topic,topic_label
2215,The new book claims Mr Prescott hosted a dinner in November 2003 where the prime minister told Mr Brown he would stand down before the next election because he had lost trust over the Iraq war.Mr ...,4,Politics
2216,"Actor Daniel Day-Lewis is to be presented with an award for his career in film at the Berlin Film Festival.Day-Lewis has competed four times at the Berlin Film Festival, with films In The Name Of ...",1,Entertainment
2217,"With so much time spent in the spotlight, Holmes has increasingly dropped hints that her ambition on the track has begun to wilt.Four months later, Holmes stormed to double Olympic gold and has si...",2,Sports
2218,"US oil prices have fallen by 6%, driven down by forecasts of a mild winter in the densely populated northeast.Statistics released last week showed that stockpiles of oil products in the US had ris...",3,Business
2219,"Instead, said Mr Doctorow, DRM systems were intended to control the group that electronics firms have most hold over - consumers.By setting up the alliance to work on a common control system, the ...",5,Technology
2220,"The Votes at 16 alliance said it was a good thing to ""engage people"" by lowering the candidacy age but argued lowering the voting age would be much more effective.That is because the age of majori...",4,Politics
2221,"West Ham boss Alan Pardew said: ""It's a shame because I thought there was good English banter in the crowd.The FA is to take action after trouble marred Wednesday's Carling Cup tie between Chelsea...",1,Entertainment
2222,"Navratilova, who made a comeback after retiring in 1994, will play doubles and mixed doubles events in 2005.I'm still that good.""Navratilova has won three Grand Slam mixed doubles titles since she...",2,Sports
2223,"""You would think they would have been available in December as it can take two months for the body to respond.""The first chance we will get to do that is at the players meeting on the Saturday bef...",2,Sports
2224,"Other than its warning on possible tax hikes, the NIESR report was optimistic about the state of the UK and global economy.The UK government will have to raise taxes or rein in spending if it want...",3,Business


**Question:** Did LDA do a good job at assigning the appropriate topic to the news summaries? 

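**Answer:** Yes, for the most part. Most of the news summaries look like they have been assigned the correct topic and topic label, although there are a few visible misses: for example, row 2221 (a football story about the Carling Cup) is labeled Entertainment, and row 2 (the cost of a UK referendum) is labeled Business rather than Politics.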