# Analysis of our chosen dataset
The purpose of this notebook is to provide some analysis of the dataset that we have chosen to use: a collection of news articles, each labelled as either REAL or FAKE.

In [4]:
# Load the raw dataset from the CSV file into a DataFrame,
# then preview its first five rows.
import pandas as pd
df = pd.read_csv(filepath_or_buffer='news.csv')
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [24]:
# Build a single 'news' column holding the title followed by the article text,
# separated by one space.
headline_with_space = df['title'].add(' ')
df['news'] = headline_with_space.add(df['text'])
df

Unnamed: 0.1,Unnamed: 0,title,text,label,news
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,The Battle of New York: Why This Primary Matte...
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,REAL,State Department says it can't find emails fro...
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,FAKE,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,FAKE,Anti-Trump Protesters Are Tools of the Oligarc...
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",REAL,"In Ethiopia, Obama seeks progress on peace, se..."


In [25]:
# Now let's convert the 'REAL' or 'FAKE' labels to a binary classification (REAL -> 1, FAKE -> 0).
convert_to_binary = {'REAL':1,'FAKE':0}
# A plain Series.map(dict) turns every value that is not a key of the dict into NaN,
# so re-running this cell on labels that are already 0/1 would wipe the whole column.
# Falling back to the current value keeps the cell safe to run more than once.
df['label'] = df['label'].map(lambda label: convert_to_binary.get(label, label))
df

Unnamed: 0.1,Unnamed: 0,title,text,label,news
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,1,Kerry to go to Paris in gesture of sympathy U....
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",0,Bernie supporters on Twitter erupt in anger ag...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,1,The Battle of New York: Why This Primary Matte...
...,...,...,...,...,...
6330,4490,State Department says it can't find emails fro...,The State Department told the Republican Natio...,1,State Department says it can't find emails fro...
6331,8062,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...,0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,8622,Anti-Trump Protesters Are Tools of the Oligarc...,Anti-Trump Protesters Are Tools of the Oligar...,0,Anti-Trump Protesters Are Tools of the Oligarc...
6333,4021,"In Ethiopia, Obama seeks progress on peace, se...","ADDIS ABABA, Ethiopia —President Obama convene...",1,"In Ethiopia, Obama seeks progress on peace, se..."


In [29]:
# Here we remove the unnecessary columns, so we just have the text input and binary classification output columns.
# The columns to keep are selected by name rather than dropping the first three by position:
# a positional drop depends on the column order and fails when the cell is re-run, because
# only two columns are left by then. .copy() gives an independent frame that later cells
# can safely write to.
df = df[['label', 'news']].copy()
df

Unnamed: 0,label,news
0,0,You Can Smell Hillary’s Fear Daniel Greenfield...
1,0,Watch The Exact Moment Paul Ryan Committed Pol...
2,1,Kerry to go to Paris in gesture of sympathy U....
3,0,Bernie supporters on Twitter erupt in anger ag...
4,1,The Battle of New York: Why This Primary Matte...
...,...,...
6330,1,State Department says it can't find emails fro...
6331,0,The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332,0,Anti-Trump Protesters Are Tools of the Oligarc...
6333,1,"In Ethiopia, Obama seeks progress on peace, se..."


In [38]:
# It's now time for a bit of preprocessing...
# Assumes the NLTK resources these calls rely on (stopword corpus, tokenizer models,
# WordNet) have already been downloaded.
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
WNL = WordNetLemmatizer()

# Set copy of the stopword list for fast membership tests; `stop_words` itself stays a list.
_stop_word_set = set(stop_words)


def preprocess_text(sentence):
    """Return `sentence` tokenized, lowercased, stripped of stopwords and lemmatized.

    Parameters
    ----------
    sentence : str
        Raw article text (title + body).

    Returns
    -------
    str
        The remaining lemmatized tokens joined by single spaces; an empty string
        when no token survives the filtering.
    """
    # Lowercase BEFORE filtering and lemmatizing: the stopword check is an exact,
    # case-sensitive match against `stop_words`, so tokens such as "The" or "You"
    # would slip through if they were only lowercased at the very end.
    tokens = [token.lower() for token in word_tokenize(sentence)]
    kept_tokens = [token for token in tokens if token not in _stop_word_set]
    return ' '.join(WNL.lemmatize(token) for token in kept_tokens)


# Apply the cleaning to every article in one pass instead of writing row by row.
df['news'] = df['news'].map(preprocess_text)
df.head()

Unnamed: 0,label,news
0,0,"smell hillary ’ fear daniel greenfield , shill..."
1,0,watch exact moment paul ryan committed politic...
2,1,kerry go paris gesture sympathy u.s. secretary...
3,0,bernie supporter twitter erupt anger dnc : 'we...
4,1,battle new york : primary matter 's primary da...


In [64]:
# Tally how often each token occurs so we can see the most common words.
# Only the first 100 articles are scanned here.
word_count = {}
for article in df['news'].head(100):
    for token in nltk.word_tokenize(article):
        # Start from 0 the first time a token is seen.
        word_count[token] = word_count.get(token, 0) + 1
word_count

{'smell': 4,
 'hillary': 112,
 '’': 849,
 'fear': 15,
 'daniel': 1,
 'greenfield': 1,
 ',': 3832,
 'shillman': 1,
 'journalism': 3,
 'fellow': 10,
 'freedom': 20,
 'center': 31,
 'new': 181,
 'york': 54,
 'writer': 5,
 'focusing': 4,
 'radical': 4,
 'islam': 15,
 '.': 3139,
 'final': 15,
 'stretch': 21,
 'election': 63,
 'rodham': 2,
 'clinton': 255,
 'gone': 15,
 'war': 73,
 'fbi': 67,
 'word': 20,
 '“': 487,
 'unprecedented': 8,
 '”': 485,
 'thrown': 2,
 'around': 33,
 'often': 15,
 'ought': 3,
 'retired': 5,
 'still': 67,
 'nominee': 42,
 'major': 19,
 'political': 73,
 'party': 179,
 'go': 54,
 'exactly': 8,
 'people': 152,
 'done': 28,
 'coma': 1,
 'patient': 3,
 'waking': 1,
 'watching': 4,
 'hour': 19,
 'cnn': 25,
 'hospital': 12,
 'bed': 5,
 'would': 145,
 'assume': 2,
 'director': 20,
 'james': 9,
 'comey': 21,
 'opponent': 11,
 'attack': 67,
 'everyone': 24,
 'obama': 241,
 'circulated': 1,
 'letter': 6,
 'attacking': 6,
 'currently': 6,
 'medium': 34,
 'hit': 14,
 'piece': 6

In [80]:
# Turn the token tallies into a two-column table, order it from most to least
# frequent, and renumber the rows from 0 so the index reflects the rank.
word_count_df = (
    pd.DataFrame(word_count.items(), columns=['word', 'count'])
    .sort_values(by='count', ascending=False)
    .reset_index(drop=True)
)
word_count_df.head(20)

Unnamed: 0,word,count
0,",",3832
1,.,3139
2,’,849
3,``,577
4,“,487
5,”,485
6,'s,370
7,:,350
8,trump,328
9,said,312


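The most frequent tokens are dominated by punctuation marks (commas, full stops and quotation marks) rather than words, so we clearly need to remove punctuation.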