In [1]:
from bs4 import BeautifulSoup
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import re 
import urllib
import time

In [2]:
#create a webdriver object and set options for headless browsing
options = Options()
# Pass the Chrome flag directly: the `Options.headless` setter is deprecated in
# Selenium 4.8+ and removed in later releases, while `--headless` works across versions.
options.add_argument('--headless')
# NOTE(review): passing the chromedriver path positionally is only accepted by older
# Selenium releases; newer ones expect a Service object -- confirm against the installed version.
driver = webdriver.Chrome('./chromedriver',options=options)

In [3]:
#loads a page through the webdriver so that javascript-generated content is present in the parsed HTML
def get_js_soup(url,driver):
    """Navigate ``driver`` to ``url`` and parse the rendered page body.

    Args:
        url: Address of the page to load.
        driver: Selenium webdriver used to load the page and run javascript.

    Returns:
        A BeautifulSoup object built (with the 'html.parser' backend) from the
        inner HTML of the document body as reported by the browser.
    """
    driver.get(url)
    # Ask the browser for the body markup after scripts have had a chance to run.
    rendered_body = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_body, 'html.parser')


#tidies extracted text
def process_news(news):
    """Normalise a piece of scraped text.

    Steps, applied in this order:
      1. drop every non-ASCII character;
      2. collapse each run of whitespace into a single space;
      3. remove every character that is neither a word character
         (letters, digits, underscore) nor whitespace, i.e. punctuation.

    Leading/trailing spaces are not stripped, and because punctuation is removed
    after whitespace is collapsed, a punctuation mark that stood between two
    spaces leaves both spaces behind.

    Args:
        news: Text to clean (str).

    Returns:
        The cleaned text (str).
    """
    # Encoding with errors='ignore' silently discards anything outside ASCII.
    news = news.encode('ascii', errors='ignore').decode('ascii')
    # Raw strings keep `\s` / `\w` as regex escapes instead of (invalid) string escapes.
    news = re.sub(r'\s+', ' ', news)       # replaces repeated whitespace characters with a single space
    news = re.sub(r'[^\w\s]', '', news)    # removes punctuation
    return news

def remove_script(soup):
    """Strip ``<script>`` and ``<style>`` elements from a parsed page.

    Text extracted from an HTML page can otherwise contain javascript code and
    style rules; removing these tags keeps them out of the extracted text.

    Args:
        soup: Parsed document to clean. It is modified in place.

    Returns:
        The same ``soup`` object, with the matching tags removed.
    """
    unwanted_tags = soup.find_all(["script", "style"])
    for tag in unwanted_tags:
        tag.decompose()
    return soup

In [4]:
def get_urls(url_doc_path):
    """Read the URLs to scrape from a text file containing one URL per line.

    Surrounding whitespace (including the trailing newline) is stripped from
    every line, and lines that are empty after stripping are skipped so that
    blank or trailing lines in the file do not produce empty URLs.

    Args:
        url_doc_path: Path of the text file to read.

    Returns:
        List of URL strings, in file order. The list is also printed.
    """
    with open(url_doc_path) as f:
        stripped_lines = (line.strip() for line in f)
        urls = [line for line in stripped_lines if line]
    print(urls)
    return urls

In [7]:
def scrape_news(url, driver):
    """Scrape one news article and return it as a single cleaned, labelled string.

    The source is chosen by substring match on the URL:
      * 'bbc' -> label '0', text taken from every <p> element of the page;
      * 'cnn' -> label '1', text taken from the <p> elements inside the
        <div class="cnnStoryContent"> container.
    Each paragraph is lower-cased and passed through ``process_news``; the label
    and the paragraphs are then joined with single spaces.

    Args:
        url: Address of the article to scrape.
        driver: Selenium webdriver used to load the page.

    Returns:
        The label followed by the cleaned paragraphs, as one string. A URL that
        matches neither source yields an empty string. A CNN page without the
        expected container yields just the label (a notice is printed).
        The result is also printed.
    """
    soup = get_js_soup(url,driver)

    if 'bbc' in url:
        label, container = '0', soup
    elif 'cnn' in url:
        label, container = '1', soup.find('div', class_ = 'cnnStoryContent')
    else:
        label, container = None, None

    this_news = []
    if label is not None:
        this_news.append(label)
        if container is None:
            # soup.find returns None when the container div is absent; skip the
            # paragraphs instead of failing on None.find_all.
            print('warning: no cnnStoryContent container found for', url)
            cands = []
        else:
            cands = container.find_all('p')
        for cand in cands:
            this_news.append(process_news(cand.get_text(separator=' ').lower()))

    news = ' '.join(this_news)
    print(news)
    return news

In [8]:
# Scrape every URL listed in the input file, collecting one cleaned string per article.
news_list = []
urls = get_urls('news_url.txt')
try:
    for url in urls:
        news = scrape_news(url, driver)
        news_list.append(news)
finally:
    # Always shut the browser down, even if a scrape fails part-way through.
    # quit() ends the whole WebDriver session (browser and driver process),
    # whereas close() only closes the current window.
    driver.quit()

['http://news.bbc.co.uk/2/hi/middle_east/2862273.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3756650.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3722255.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3702710.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3317429.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3329671.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3265931.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3319993.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3238431.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3274613.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3321785.stm', 'http://news.bbc.co.uk/2/hi/middle_east/3324631.stm', 'http://news.bbc.co.uk/2/hi/americas/3254714.stm', 'http://news.bbc.co.uk/2/hi/americas/3326311.stm', 'http://news.bbc.co.uk/2/hi/uk_news/scotland/3230571.stm', 'https://edition.cnn.com/2003/US/12/12/sprj.nirq.west.ruling/', 'https://edition.cnn.com/2003/WORLD/asiapcf/east/12/09/japan.troops/', 'https://edition.cnn.com/2003/ALLPOLITICS/10/28/mission

0  brig gen mark kimmitt said the blast had caused a small release of the substance and two people had been treated for exposure to the agent   the substance was found in a shell inside a bag discovered by a us convoy a few days ago he said   it appears to be the first evidence of nerve gas existing in iraq since the start of the usled war last year   limited effect   the 155mm artillery round had been set up as a roadside bomb and it exploded before the us military were able to defuse it    deadly sarin invented in 1930s germany 20 times deadlier than cyanide attacks nervous system pin headsized drop can kill used in iraq in 1980s and in 1995 japan subway attack antidotes are available qa sarin bomb reopens wmd debate   gen kimmitt said the dispersal of the nerve agent from a device such as the homemade bomb was limited   the former regime had declared all such rounds destroyed before the 1991 gulf war he said   however a senior coalition source has told the bbc the round does not sig

0  abdul aziz alhakim said further discussion was needed to decide what if anything iraq would pay itself   iran claims 100bn in reparations for the brutal eightyear war that claimed about one million lives   mr hakims remarks may augur improving iraniraq relations now saddam hussein is in custody   the prominent iraqi is also the head of the supreme council for the islamic revolution in iraq sciri the most important shia muslim party represented on the governing council   sciri has close ties with tehran where the party was based during saddam husseins years in power   analysts say that with iraqs shia majority likely to dominate in any future democratic government it is logical that baghdad should now develop warmer relations with its shia neighbours in iran   mustard gas   in 1980 after a series of border skirmishes following irans islamic revolution iraq invaded iran   according to the un iran deserves reparations she must be satisfied abdul aziz alhakim on this day iraq bombs iran

0  the iraqi leader had been convinced by french and russian contacts that there would be no us land invasion   exiraqi deputy prime minister tariq aziz made the claims under questioning the post said citing us interrogators   us officials say mr aziz once the international face of the iraqi regime is not a reliable witness     nevertheless the washington post notes his cooperation with his questioners is important as the usled coalition tries to understand saddam husseins behaviour   after meeting russian and french intermediaries saddam hussein was convinced he could avoid a war mr aziz reportedly told interrogators   they assured the iraqi president that they would block a usled war through delays and vetoes at the united nations security council according to mr azizs reported statements  distracted   the french and russians then convinced saddam hussein that washington would wage a long air war first as it had done in previous conflicts mr aziz is said to have told questioners   pr

0  the us military suffered its worst monthly death toll since the end of major combat in iraq losing 79 soldiers in november mostly in enemy attacks   casualties like sharon swartworth are commemorated in the media but the cries to pull troops out have not grown louder and indeed both public and politicians seem prepared to accept the setbacks as part of a longerterm battle which needs fighting   some pundits and media observers seem keen to find similarities between the ongoing operation in iraq and the vietnam war which dragged on amid mounting public opposition until the us decided nothing more could be gained and pulled out   but there simply is not the depth of feeling and interest as there was 30 years ago when newspapers and broadcasts were dominated by the war when some sections of the public were so outraged by what was happening that they spat at returning soldiers and when losses reached the tens of thousands   we can take these kinds of casualties indefinitely as long as p

1  saturday december 13 2003 posted 1116 gmt  716 pm hkt   tikrit iraq cnn  the commanding general of the 4th infantry division on friday accepted a us military investigators recommendation and ordered administrative action against lt col allen west who was accused of using improper methods to force information out of an iraqi detainee   following a military hearing west was fined 5000 over two months according to wests civillian attorney neal puckett   the punishment does not affect wests eligibility for retirement and pension puckett said in a statement   west 42 will be assigned to the rear detachment of the 4th infantry division awaiting the processing of his retirement request the statement said   maj gen raymond odierno the 4th infantrys top general in tikrit could have rejected the recommendation and ordered a court martial if he were to be found guilty at a court martial of the two articles against him west could have faced 11 years in prison a military prosecutor told cnn   th

1  sunday november 16 2003 posted 0734 gmt  334 pm hkt   cnn  seventeen us soldiers were killed five were injured and one was missing when two us black hawk helicopters crashed saturday in a residential neighborhood in the northern iraqi city of mosul   military officials believe one helicopter may have climbed to avoid gunfire and collided with a second black hawk causing them both to crash   authorities had no reports of iraqi casualties but a military official said the crashes set buildings on fire   initial reports indicated that when the first helicopter rose to avoid groundfire it caused a rotor strike with the second helicopter a military source told cnn   the two uh60 black hawks crashed at about 630 pm 1030 am est in the western part of the city col william darley said he said soldiers from the 101st airborne division iraqi police and iraqi firefighters were on the scene   the deaths brought the number of american soldiers killed in the iraq war to 422 of those 283 have died s

1  by alphonso van marsh cnn sunday december 28 2003 posted 1903 gmt  303 am hkt   tikrit iraq cnn  in the aftermath of saddam husseins capture the commander of the dramatic raid us army col james hickey has become a reluctant media celebrity   on a recent return to the farm near tikrit where saddam was found december 13 news photographers fawned over the 42yearold leader of the 4th infantry divisions 1st brigade a steelyeyed colonel from chicago illinois   its a little bit embarrassing hickey says of the attention   images of the military leader congratulating his troops and celebrating moments after the arrest were broadcast around the world   i collected my troops together hickey says i told the soldiers what we had done and the significance of what we had accomplished not only in terms of the mission here but also in army history   the virginia military institute and johns hopkins universityeducated colonel has a reputation for talking tough   i am a little oldfashioned in doing th

1  monday december 1 2003 posted 0939 gmt  539 pm hkt   tikrit iraq cnn  us troops fought off two simultaneous attacks on military convoys sunday in northern iraq killing 54 attackers wounding 18 and capturing eight others military officials said   during the gunfights us tanks crushed makeshift barricades set up by the guerrillas and destroyed three buildings from which the iraqi fighters were launching attacks   some of the attackers appeared to be wearing the black uniforms of the fedayeen saddam a militia loyal to ousted iraqi leader saddam hussein said master sgt robert cargie a spokesman for the armys 4th infantry division   the convoys carrying military supplies and iraqi dinars came under attack in samarra about 75 miles 120 kilometers north of baghdad the armys 4th infantry division said   this was a coordinated simultaneous attack cargie said with one convoy being attacked on the east side of the city and the other coming under fire while in the west side of samarra   guerril

1  monday december 15 2003 posted 0513 gmt  113 pm hkt   baghdad iraq cnn  across the tigris river from his opulent palaces saddam hussein shuttered himself at the bottom of a narrow dark hole beneath a tworoom mud shack on a sheep farm a us military official said sunday   having opted not to travel with security forces or an entourage that might bring attention to him only a styrofoam square dirt and a rug separated the deposed iraqi leader from the us soldiers who routed him from his hiding place saturday night   he was in the bottom of a hole with no way to fight back said maj gen raymond odierno he was caught like a rat   saddams capture was based not on a direct tip but a collection of intelligence gathered from the hostile questioning of saddams former bodyguards and family members us officials said   that intelligence prompted us soldiers to go to adwar about 15 kilometers nine miles from tikrit saddams ancestral home   we realized early on in the summer the people we had to get

1  wednesday november 26 2003 posted 0229 gmt 1029 am hkt   washington cnn  a videotape given to cnn by a french journalist purports to show a damaged cargo plane landing at baghdads airport after being hit by a shoulderfired missile but officials tuesday dismissed the tape as a likely propaganda ploy   the tape which journalist sara daniel obtained in iraq shows a man firing what appears to be a sovietera surfacetoair missile daniel a correspondent for the french newsmagazine le nouvel observateur said she believes the tape came from iraqi guerrillas she met outside baghdad   no aircraft or any impact is visible as the missile streaks into the sky but the videotape cuts to images of a cargo plane landing at the baghdad airport the aircraft appears to be damaged   a dhl cargo flight was struck by a missile shortly after taking off from the airport saturday forcing it to land with a damaged left wing the incident prompted the usled occupation government in iraq to suspend civilian air t

1  thursday january 22 2004 posted 1902 gmt  302 am hkt   baghdad iraq cnn  insurgents in iraqs volatile sunni triangle launched three deadly attacks during a 24hour period killing two us soldiers three iraqi police officers and four civilians   the sunni triangle is iraqs most volatile region an area north and west of baghdad that is a hotbed of opposition to the usled coalition and scene of political instability   in the most recent attack three iraqi police officers and a civilian were killed thursday at a highway checkpoint between the central cities of fallujah and ramadi an iraq police official said   suspected insurgents in two pickups with medium to heavy machine guns opened fire on the police checkpoint said maj walid jalal an official with the iraqi highway patrol force based in fallujah   a grenade was thrown at a police vehicle parked at the checkpoint jalal said another police officer was wounded in the attack he said   in other violence a mortar and rocket attack on a us 

In [9]:
def write_lst(lst,file_):
    """Write the strings in ``lst`` to the file ``file_``, one per line.

    The file is opened in write mode, so any existing content is replaced.
    Every entry is followed by a newline character.

    Args:
        lst: Iterable of strings to write.
        file_: Path of the output file.
    """
    with open(file_,'w') as out:
        out.writelines(entry + '\n' for entry in lst)

In [10]:
# Destination file for the scraped articles.
news_content_file = 'wars_news.txt'
# Persist the scraped articles: each entry of news_list is written on its own line.
write_lst(news_list, news_content_file)