In [223]:
import pandas as pd 
import nltk
import string

from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [224]:
# Load the labelled training set; the first CSV column is used as the index.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
train_csv_path = "C:\\Users\\Eli\\Desktop\\train.csv"
df = pd.read_csv(train_csv_path, index_col=0)

In [225]:
# Discard every row that has a missing value, then renumber the rows from 0
# so the index is contiguous again (the old index is not kept as a column).
df = df.dropna().reset_index(drop=True)

In [226]:
# Sanity check: column dtypes and non-null counts after dropping incomplete rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18285 entries, 0 to 18284
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   18285 non-null  object
 1   author  18285 non-null  object
 2   text    18285 non-null  object
 3   label   18285 non-null  int64 
dtypes: int64(1), object(3)
memory usage: 571.5+ KB


In [227]:
# Swap the numeric class codes for readable names.
label_names = {1: 'Unreliable', 0: 'reliable'}
df['label'] = df['label'].replace(label_names)

In [228]:
def remove_html(text):
    """Strip HTML markup from ``text`` and return only its text content.

    Args:
        text: Document string that may contain HTML tags or entities.

    Returns:
        The string produced by ``BeautifulSoup.get_text()`` for ``text``.
    """
    # Name the parser explicitly. Without it BeautifulSoup picks whichever
    # parser happens to be installed and emits GuessedAtParserWarning, so the
    # extracted text could differ from one machine to the next.
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [200]:
# Strip HTML from every article, then preview the first 20 cleaned rows.
df['text'] = df['text'].apply(remove_html)
df['text'].head(20)

0     House Dem Aide: We Didn’t Even See Comey’s Let...
1     Ever get the feeling your life circles the rou...
2     Why the Truth Might Get You Fired October 29, ...
3     Videos 15 Civilians Killed In Single US Airstr...
4     Print \nAn Iranian woman has been sentenced to...
5     In these trying times, Jackie Mason is the Voi...
6     PARIS  —   France chose an idealistic, traditi...
7     A week before Michael T. Flynn resigned as nat...
8     Organizing for Action, the activist group that...
9     The BBC produced spoof on the “Real Housewives...
10    The mystery surrounding The Third Reich and Na...
11    Clinton Campaign Demands FBI Affirm Trump's Ru...
12    Yes, There Are Paid Government Trolls On Socia...
13    Guillermo Barros Schelotto was not the first A...
14    The scandal engulfing Wells Fargo toppled its ...
15    A Caddo Nation tribal leader has just been fre...
16    FBI Closes In On Hillary! Posted on Home » Hea...
17    Wednesday after   Donald Trump’s press con

In [229]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    Only the characters in ``string.punctuation`` are dropped; anything else
    (letters, digits, whitespace, typographic quotes such as ’) is kept as-is.

    Args:
        text: The string to clean.

    Returns:
        A new string without the characters listed in ``string.punctuation``.
    """
    # A deletion table lets str.translate filter the whole string in one pass
    # instead of testing each character against string.punctuation in Python.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

In [202]:
# Remove ASCII punctuation from every article.
df['text'] = df['text'].apply(remove_punctuation)


In [230]:
# Shared tokenizer: each token is a maximal run of word characters (\w+).
tokenizer = RegexpTokenizer(r'\w+')

In [204]:
# Lower-case each article and split it into a list of word tokens.
df['text'] = df['text'].apply(lambda article: tokenizer.tokenize(article.lower()))


In [231]:
def remove_stopwords(text):
    """Drop English stopwords from a sequence of tokens.

    Args:
        text: Iterable of word tokens (e.g. the output of
            ``tokenizer.tokenize``).

    Returns:
        A list of the tokens from ``text`` that are not in NLTK's English
        stopword list, in their original order.
    """
    # Fetch the stopword list once and keep it in a set. The previous version
    # evaluated stopwords.words('english') for every single token, re-loading
    # the word list and scanning it linearly each time.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

In [206]:
# Filter English stopwords out of every token list.
df['text'] = df['text'].apply(remove_stopwords)


In [207]:
# Preview the column after stopword removal (each row is a list of tokens).
df['text']

0        [house, dem, aide, even, see, comey, letter, j...
1        [ever, get, feeling, life, circles, roundabout...
2        [truth, might, get, fired, october, 29, 2016, ...
3        [videos, 15, civilians, killed, single, us, ai...
4        [print, iranian, woman, sentenced, six, years,...
                               ...                        
18280    [rapper, unloaded, black, celebrities, met, do...
18281    [green, bay, packers, lost, washington, redski...
18282    [macy, today, grew, union, several, great, nam...
18283    [nato, russia, hold, parallel, exercises, balk...
18284    [david, swanson, author, activist, journalist,...
Name: text, Length: 18285, dtype: object

In [232]:
# Shared Porter stemmer instance used by word_stemmer.
stemmer = PorterStemmer()

In [233]:
def word_stemmer(text):
    """Stem every token in ``text`` and join the stems into a single string.

    Args:
        text: Iterable of word tokens.

    Returns:
        The stems produced by the notebook-level ``stemmer``, separated by
        single spaces.
    """
    stems = map(stemmer.stem, text)
    return " ".join(stems)

In [216]:
# Stem every token list and collapse it back into one space-separated string.
df['text'] = df['text'].apply(word_stemmer)


In [217]:
# Preview the column after stemming (each row is a space-joined string of stems).
df['text']

0        hous dem aid even see comey letter jason chaff...
1        ever get feel life circl roundabout rather hea...
2        truth might get fire octob 29 2016 tension int...
3        video 15 civilian kill singl us airstrik ident...
4        print iranian woman sentenc six year prison ir...
                               ...                        
18280    rapper unload black celebr met donald trump el...
18281    green bay packer lost washington redskin week ...
18282    maci today grew union sever great name america...
18283    nato russia hold parallel exercis balkan 11022...
18284    david swanson author activist journalist radio...
Name: text, Length: 18285, dtype: object

In [135]:
# Persist the processed frame to disk.
# NOTE(review): absolute, machine-specific path; to_csv is called with its
# defaults, so the DataFrame index is written as well — confirm that is intended.
df.to_csv("C:\\Users\\Eli\\Desktop\\news_processed.csv")

In [235]:
def preprocess_text(x):
    """Run the complete cleaning pipeline on one document.

    Steps, in order: strip HTML, remove punctuation, lower-case and tokenize,
    drop stopwords, stem and re-join.

    Args:
        x: Document string to clean.

    Returns:
        Whatever ``word_stemmer`` returns for the filtered tokens (a single
        space-separated string of stems).
    """
    cleaned = remove_punctuation(remove_html(x))
    tokens = tokenizer.tokenize(cleaned.lower())
    return word_stemmer(remove_stopwords(tokens))

In [236]:
# Run the full pipeline over every article.
# NOTE(review): preprocess_text starts from HTML stripping, i.e. it expects raw
# text. If the step-by-step cells have already rewritten df['text'], this would
# process the column a second time — confirm the intended execution order.
df['text'] = df['text'].apply(preprocess_text)

In [237]:
# Preview the column after running preprocess_text on it.
df['text']

0        hous dem aid even see comey letter jason chaff...
1        ever get feel life circl roundabout rather hea...
2        truth might get fire octob 29 2016 tension int...
3        video 15 civilian kill singl us airstrik ident...
4        print iranian woman sentenc six year prison ir...
                               ...                        
18280    rapper unload black celebr met donald trump el...
18281    green bay packer lost washington redskin week ...
18282    maci today grew union sever great name america...
18283    nato russia hold parallel exercis balkan 11022...
18284    david swanson author activist journalist radio...
Name: text, Length: 18285, dtype: object

In [213]:
# Smoke test: run the whole pipeline on a pasted news article and print the result.
print(preprocess_text('''US President Donald Trump has said quarantining New York "will not be necessary", after the state's governor said doing so would be "preposterous".

Mr Trump said the latest decision was taken on the recommendation of the White House Coronavirus Task Force.

The president had earlier said he might impose a quarantine on New York, and parts of New Jersey and Connecticut, to slow the spread of Covid-19.

There are more than 52,000 cases in New York.

The state has about half of the total confirmed Covid-19 cases in the entire US.

Mr Trump tweeted that instead of quarantine, a "strong travel advisory" would be issued to New York, New Jersey and Connecticut by the Centers for Disease Control and Prevention (CDC).

Live updates: Trump U-turn and Boris warning to UK
The CDC then published a statement urging residents of those three states to "refrain" from all non-essential domestic travel for 14 days.

The agency said the advisory did not apply to "critical infrastructure" service providers, including healthcare professionals and food suppliers.

Image Copyright @realDonaldTrump@REALDONALDTRUMP
Report
Speaking to reporters earlier on Saturday about the situation in New York, Mr Trump said: "We'd like to see [it] quarantined because it's a hotspot... I'm thinking about that."

He said it would be aimed at slowing the spread of the virus to other parts of the US.

"They're having problems down in Florida. A lot of New Yorkers are going down. We don't want that," he said.

What did New York's governor say?
New York Governor Andrew Cuomo responded by saying that quarantining the state of New York would be "preposterous" and "anti-American".

"If you said we were geographically restricted from leaving, that would be a lockdown."

Image copyrightEPA
Image caption
New York's governor expressed concern at the idea of a quarantine
He said New York had already implemented "quarantine" measures, such as banning major gatherings and ordering people to remain at home, but that he would oppose any "lockdown" efforts.

"Then we would be Wuhan, China, and that wouldn't make any sense," he told CNN, adding that this would cause the stock market to crash in a way that would make it impossible for the US economy to "recover for months, if not years".

"You would paralyse the financial sector," he said.

He told a press briefing earlier on Saturday: "I don't know how that can be legally enforceable. And from a medical point of view, I don't know what you would be accomplishing.

"But I can tell you, I don't even like the sound of it."

Mr Cuomo also said he would sue nearby Rhode Island if the authorities there continued targeting New Yorkers and threatening to punish them for failing to quarantine.

Image Copyright @RyanWelchPhotog@RYANWELCHPHOTOG
Report
On Friday, Rhode Island Governor Gina Raimondo deployed National Guard troops to stop cars with a New York licence plate, to remind them of their state's advice that they quarantine.

Soldiers are going door-to-door in coastal vacation communities to ask if any residents have recently visited New York City.

A SIMPLE GUIDE: What are the symptoms?
AVOIDING CONTACT: Should I self-isolate?
STRESS: How to protect your mental health
MAPS AND CHARTS: Visual guide to the outbreak
The White House has said anyone leaving New York City should self-isolate for 14 days.

What's the latest in the US?
With more than 2,000 virus-related fatalities, the US death toll remains lower than those in Italy and China. But there are virus hotspots in New York, New Orleans, Detroit and Seattle.

Saturday saw the first death in the US of an infant who had tested positive for coronavirus. The baby died in Chicago.

In his press briefing, Mr Cuomo said New York was postponing its presidential primary by almost two months until 23 June as a result of the outbreak.

He also said the apex of the crisis would occur in the next 14 to 21 days.

Mr Cuomo said the state would soon require 30,000 respiratory ventilators, which had increased in price to $45,000 (£36,000) each due to demand.

He added that Mr Trump had approved the construction of four temporary hospitals.

Demand for ventilators has also doubled in the southern state of Louisiana. Governor John Bel Edwards said New Orleans would run out of ventilators by 2 April and possibly run out of hospital beds by 7 April if the number of new infections did not subside.

"It's not some flimsy theory. This is what is going to happen," he said.

President Trump has ordered a car manufacturer in Detroit to produce more ventilators.


Media captionCoronavirus: Millions of Americans unemployed
Hospitals in New York City are rapidly running out of medical equipment and personal protective gear. More widely, the mayors of most US cities have said they expect massive shortages of critical personal safety equipment in the coming weeks.

On Saturday, Mr Trump watched as the USNS Comfort, a navy hospital ship with 1,000 beds aboard, left for New York from Virginia. It will station itself at a Manhattan pier to deal with the overload of patients that New York expects.

It came after Mr Trump signed a $2.2 trillion (£1.8tr) bailout bill passed by Congress on Friday, the largest fiscal stimulus in US history.'''))

us presid donald trump said quarantin new york necessari state governor said would preposter mr trump said latest decis taken recommend white hous coronaviru task forc presid earlier said might impos quarantin new york part new jersey connecticut slow spread covid19 52000 case new york state half total confirm covid19 case entir us mr trump tweet instead quarantin strong travel advisori would issu new york new jersey connecticut center diseas control prevent cdc live updat trump uturn bori warn uk cdc publish statement urg resid three state refrain nonessenti domest travel 14 day agenc said advisori appli critic infrastructur servic provid includ healthcar profession food supplier imag copyright realdonaldtrumprealdonaldtrump report speak report earlier saturday situat new york mr trump said wed like see quarantin hotspot im think said would aim slow spread viru part us theyr problem florida lot new yorker go dont want said new york governor say new york governor andrew cuomo respond s