-
Notifications
You must be signed in to change notification settings - Fork 0
/
article_to_audio.py
130 lines (105 loc) · 5.54 KB
/
article_to_audio.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
Refer to https://towardsdatascience.com/easy-text-to-speech-with-python-bfb34250036e
"""
#Imports the following modules used in this function
import os, requests, random
#Imports the Google Text to Speak module, which converts text into an mp3 file
from gtts import gTTS
#Imports a module for web scrapping =
from bs4 import BeautifulSoup
#A function that converts the article into an mp3 file
def main(URL, audio_file_name):
#Goes to the following URL
page = requests.get(URL)
#Gets the website's HTML code
soup = BeautifulSoup(page.content, 'html.parser')
#CTV currently has a current and beta website, the current website headline has a class called "articleHeadline", but if an error occurs while trying to get the headline, it runs the code to get it from the class called "inner-headline"
try:
#Gets the article's heading from CTV's current website
article_title = soup.select('h1.articleHeadline')[0].text.strip(" ")
except:
#Gets the article's heading from CTV's beta website
article_title = soup.select('h1.inner-headline')[0].text.strip(" ")
#Initalizes the name of the author incase it is not specified
author = "CTV"
#Gets all the div, p, a, span, and footer tags from the HTML code
divs = soup.find_all('div')
paragraphs = soup.find_all('p')
a_links = soup.find_all('a')
spans = soup.find_all('span')
footers = soup.find_all('footer')
#Initalizes a list that will contain all the reccomended articles
reccomended_articles = []
#A function that add non empty list items from "unfiltered_reccomended_articles" to "reccomended_articles"
def filter_text():
for reccomended_article in unfiltered_reccomended_articles:
if len(reccomended_article) > 0:
#Removes additional spaces and adds that item to the list called "reccomended_articles"
reccomended_articles.append(reccomended_article.strip(" "))
#For each div element in the HTML code, run the following
for div in divs:
try:
#If the div has a class called "sideItems", remove unnecessary new lines, and split it into a list
if div.has_attr('class') and div['class'][0] == "sideItems":
unfiltered_reccomended_articles = div.text.strip("\n ").split("\n")
#Runs a function called "filter_text"
filter_text()
#Else if the div has a class called "content-secondary", remove unnecessary new lines, and split it into a list
elif div.has_attr('class') and div['class'][0] == "content-secondary":
unfiltered_reccomended_articles = div.text.strip("\n ").split("\n")
filter_text()
#If an error occurs, ignore it
except:
pass
for footer in footers:
#Gets the text in the footer, remove unnecessary new lines, and split it into a list
footer_text = footer.text.strip("\n ").split("\n")
#Loops through all the hyperlinks in the website
for link in a_links:
#If the hyperlink has a class called "bioLink", set the variable "author" to the link text
if link.has_attr('class') and link['class'][0] == "bioLink":
author = link.text
#Loops through all the span tags in the website
for span in spans:
#If the span has a class called "bold", set the variable "author" to the span text
if span.has_attr('class') and span['class'][0] == "bold":
author = span.text
#Initalizes a list that will contain the contents of the article being web scrapped
article = []
#A list that contains p and a tag classes that will be excluded from the audio file (These classes generally contain off-topic content)
excluded_classes = ["title", "ad-below", "bioLink", "back-top", "newWindow", "socialBio", "contactBio", "socialShareLabel"]
#Loops through each paragraph in the article
for paragraph in paragraphs:
#Gets all the hyperlinks in the paragraph
paragraph_a_links = paragraph.find("a")
#If the paragraph text is in the reccomended_articles or footer_text, ignore it
if paragraph.text.strip("\n ") in reccomended_articles or paragraph.text.strip("\n ") in footer_text:
pass
#If a paragraph has a hyperlink, and the hyperlink's class is in the list called "excluded_classes", ignore it
elif paragraph_a_links != None:
if paragraph_a_links.has_attr('class') and paragraph_a_links['class'][0] in excluded_classes:
pass
else:
#Else, add that paragraph to a list called "article"
article.append(paragraph.text)
#If the paragraph has a class in the list called "excluded_classes", ignore it
elif paragraph.has_attr('class') and paragraph['class'][0] in excluded_classes:
pass
#If the word "SHARE" is in a paragraph, ignore it
elif "SHARE" in paragraph.text:
pass
#Else, add that paragraph into a list called "article"
else:
article.append(paragraph.text)
#Adds the article title, author, and combine all the article paragraphs from the list called "article" into a variable called "the_article"
the_article = article_title + " \n\nThis article is by: " + author + "\n\n" + '\n'.join(article)
#Sets the language and accent of the voice reading the mp3 file
language = 'en-ca'
print("Converting text into audio...")
#Convert "the_article" into an mp3 file
speech = gTTS(text = the_article, lang = language, slow = False)
print("\nSaving text as an audio file...")
speech.save("static/audio/" + str(audio_file_name) + ".mp3")
print("\nThe text has been saved as an audio file called " + str(audio_file_name) + ".mp3")
#Returns the name of the mp3 file
return str(audio_file_name)