In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import re
from sklearn.model_selection import train_test_split

In [27]:
stopwords = stopwords.words('english')

In [121]:
df = pd.read_csv('fake_news_data.csv')
df.head()

Unnamed: 0,title,text,date,fake_or_factual
0,HOLLYWEIRD LIB SUSAN SARANDON Compares Muslim ...,There are two small problems with your analogy...,"Dec 30, 2015",Fake News
1,Elijah Cummings Called Trump Out To His Face ...,Buried in Trump s bonkers interview with New Y...,"April 6, 2017",Fake News
2,Hillary Clinton Says Half Her Cabinet Will Be...,"Women make up over 50 percent of this country,...","April 26, 2016",Fake News
3,Russian bombing of U.S.-backed forces being di...,WASHINGTON (Reuters) - U.S. Defense Secretary ...,"September 18, 2017",Factual News
4,Britain says window to restore Northern Irelan...,BELFAST (Reuters) - Northern Ireland s politic...,"September 4, 2017",Factual News


In [122]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 198 entries, 0 to 197
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   title            198 non-null    object
 1   text             198 non-null    object
 2   date             198 non-null    object
 3   fake_or_factual  198 non-null    object
dtypes: object(4)
memory usage: 6.3+ KB


## Discover target value column:

In [123]:
df['fake_or_factual'].value_counts() * 100 / df.shape[0]

Factual News    50.505051
Fake News       49.494949
Name: fake_or_factual, dtype: float64

We have a nearly balanced target value classes.
50.5% Factual News and 49.5% Fake News.

## Cleaning text column:

### 1. Remove the source of the news if it provided:

We can see there are some news start with the source of the news before the text of the news like "WASHINGTON (Reuters) - U.S. Defense Secretary", So we will extract this part if there is ') -' in the text and add it to another column it can be useful later.

In [124]:
def extract_source(text):
    match = re.match(r"(.*?\))\s*-\s*(.*)", text)
    if match:
        source = match.group(1).strip()
        remaining_text = match.group(2).strip()
        return source, remaining_text
    else:
        return None, text

In [125]:
df[['source', 'no_source_text']] = df['text'].apply(lambda t: pd.Series(extract_source(t)))

In [126]:
df.head()

Unnamed: 0,title,text,date,fake_or_factual,source,no_source_text
0,HOLLYWEIRD LIB SUSAN SARANDON Compares Muslim ...,There are two small problems with your analogy...,"Dec 30, 2015",Fake News,,There are two small problems with your analogy...
1,Elijah Cummings Called Trump Out To His Face ...,Buried in Trump s bonkers interview with New Y...,"April 6, 2017",Fake News,,Buried in Trump s bonkers interview with New Y...
2,Hillary Clinton Says Half Her Cabinet Will Be...,"Women make up over 50 percent of this country,...","April 26, 2016",Fake News,,"Women make up over 50 percent of this country,..."
3,Russian bombing of U.S.-backed forces being di...,WASHINGTON (Reuters) - U.S. Defense Secretary ...,"September 18, 2017",Factual News,WASHINGTON (Reuters),U.S. Defense Secretary Jim Mattis said on Mond...
4,Britain says window to restore Northern Irelan...,BELFAST (Reuters) - Northern Ireland s politic...,"September 4, 2017",Factual News,BELFAST (Reuters),Northern Ireland s political parties are rapid...


### 2. Remove stopwords:

In [127]:
def remove_stopwords(text_series):
    return text_series.apply(lambda s: ' '.join([w for w in s.split() if w not in stopwords]))

In [128]:
df['no_stopwords_text'] = lower_remove_stopwords(df['no_source_text'])

In [129]:
print('Original Text:\n', df['no_source_text'][1][:100])
print('\n---------\nWith no stopwords:\n', df['no_stopwords_text'][1][:100])

Original Text:
 Buried in Trump s bonkers interview with New York Times reporters Maggie Haberman and Glenn Thrush i

---------
With no stopwords:
 Buried Trump bonkers interview New York Times reporters Maggie Haberman Glenn Thrush outrageous exch


### 3. Remove Punctuation:

In [130]:
df['no_stopwords_no_punct_text'] = df['no_stopwords_text'].apply(lambda t: re.sub(r'([^\w\s])', '', t))

In [131]:
print('Original Text:\n', df['no_stopwords_text'][2][:100])
print('\n---------\nWith no punctuation:\n', df['no_stopwords_no_punct_text'][2][:100])

Original Text:
 Women make 50 percent country, grossly underrepresented everything. In United States Congress, 80 pe

---------
With no punctuation:
 Women make 50 percent country grossly underrepresented everything In United States Congress 80 perce


### 4. Lemmatize text:

In [132]:
lemmatizer = WordNetLemmatizer()

In [133]:
df['lemmatized_no_stopwords_no_punct_text'] = df['no_stopwords_no_punct_text'].apply(lambda t: ' '.join(
    [lemmatizer.lemmatize(w) for w in t.split()]))

In [134]:
print('Original Text:\n', df['no_stopwords_no_punct_text'][4][:100])
print('\n---------\nWith no punctuation:\n', df['lemmatized_no_stopwords_no_punct_text'][4][:100])

Original Text:
 Northern Ireland political parties rapidly running time restore devolved powersharing government Bri

---------
With no punctuation:
 Northern Ireland political party rapidly running time restore devolved powersharing government Brita


### 5. Lowering text:

In [135]:
df['lower_lemmatized_no_stopwords_no_punct_text'] = df['lemmatized_no_stopwords_no_punct_text'].str.lower()

### 6. Tokenize text:

In [137]:
df['cleaned_tokens'] = df['lower_lemmatized_no_stopwords_no_punct_text'].apply(lambda t: word_tokenize(t))
df['raw_tokens'] = df['text'].apply(lambda t: word_tokenize(t))

In [139]:
df[['cleaned_tokens', 'raw_tokens']].head()

Unnamed: 0,cleaned_tokens,raw_tokens
0,"[there, two, small, problem, analogy, susan, j...","[There, are, two, small, problems, with, your,..."
1,"[buried, trump, bonkers, interview, new, york,...","[Buried, in, Trump, s, bonkers, interview, wit..."
2,"[women, make, 50, percent, country, grossly, u...","[Women, make, up, over, 50, percent, of, this,..."
3,"[us, defense, secretary, jim, mattis, said, mo...","[WASHINGTON, (, Reuters, ), -, U.S., Defense, ..."
4,"[northern, ireland, political, party, rapidly,...","[BELFAST, (, Reuters, ), -, Northern, Ireland,..."
