In [1]:
import pandas as pd

# Read the FakeNewsNet table; the path is resolved relative to the kernel's working directory.
data = pd.read_csv(filepath_or_buffer='FakeNewsNet.csv')

# Report (rows, columns), then preview the first five records.
print("Shape of the dataset: ", data.shape)
data.head(n=5)

Shape of the dataset:  (23196, 5)


Unnamed: 0,title,news_url,source_domain,tweet_num,real
0,Kandi Burruss Explodes Over Rape Accusation on...,http://toofab.com/2017/05/08/real-housewives-a...,toofab.com,42,1
1,People's Choice Awards 2018: The best red carp...,https://www.today.com/style/see-people-s-choic...,www.today.com,0,1
2,Sophia Bush Sends Sweet Birthday Message to 'O...,https://www.etonline.com/news/220806_sophia_bu...,www.etonline.com,63,1
3,Colombian singer Maluma sparks rumours of inap...,https://www.dailymail.co.uk/news/article-33655...,www.dailymail.co.uk,20,1
4,Gossip Girl 10 Years Later: How Upper East Sid...,https://www.zerchoo.com/entertainment/gossip-g...,www.zerchoo.com,38,1


In [2]:
# Per-column count of missing values (isna is the same check as isnull).
data.isna().sum(axis=0)

title              0
news_url         330
source_domain    330
tweet_num          0
real               0
dtype: int64

In [3]:
# Remove the URL and tweet-count columns, then label missing domains as 'Unknown'.
# Written as one chain so the cell yields a new frame in a single step.
data = (
    data
    .drop(columns=["news_url", "tweet_num"])
    .assign(source_domain=lambda frame: frame['source_domain'].fillna('Unknown'))
)

# Confirm that no missing values remain in the kept columns.
data.isna().sum()

title            0
source_domain    0
real             0
dtype: int64

In [4]:
# Class distribution of the label column (1 = real, 0 = fake, per the later filters on `real`).
data.loc[:, 'real'].value_counts()

real
1    17441
0     5755
Name: count, dtype: int64

In [5]:
# Split the rows by label.
real_data = data.loc[data['real'].eq(1)]
fake_data = data.loc[data['real'].eq(0)]

# Undersample the real rows to the number of fake rows; the fixed seed keeps
# the draw reproducible.
real_sample = real_data.sample(n=fake_data.shape[0], random_state=42)

# Stack the sampled real rows on top of all fake rows (original index labels are kept).
balanced_data = pd.concat([real_sample, fake_data], axis=0)
balanced_data.head(n=5)

Unnamed: 0,title,source_domain,real
4894,Fergie And Josh Duhamel Split After 8 Years Of...,www.huffingtonpost.com,1
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,www.hollywoodreporter.com,1
594,"Prince William Says He Is Still ""Working On"" a...",www.brides.com,1
28,WATCH: Kendall Jenner’s first-ever Adidas ad,www.channel24.co.za,1
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",www.msn.com,1


In [6]:
# Check that both labels now have the same number of rows.
balanced_data.loc[:, 'real'].value_counts()

real
1    5755
0    5755
Name: count, dtype: int64

In [7]:
# Keep only the headline text and its label.
balanced_data = balanced_data.drop(columns=['source_domain'])
balanced_data.head(n=5)

Unnamed: 0,title,real
4894,Fergie And Josh Duhamel Split After 8 Years Of...,1
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,1
594,"Prince William Says He Is Still ""Working On"" a...",1
28,WATCH: Kendall Jenner’s first-ever Adidas ad,1
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",1


In [8]:
import nltk, string

# Fetch the tokenizer resource before the tokenize calls below run.
nltk.download('punkt_tab')

# Length features of each headline: raw characters, NLTK word tokens, NLTK sentences.
balanced_data['num_characters'] = balanced_data['title'].map(len)
balanced_data['num_words'] = balanced_data['title'].map(
    lambda title: len(nltk.word_tokenize(title))
)
balanced_data['num_sentences'] = balanced_data['title'].map(
    lambda title: len(nltk.sent_tokenize(title))
)

balanced_data.head(n=5)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/skakibahammed/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,title,real,num_characters,num_words,num_sentences
4894,Fergie And Josh Duhamel Split After 8 Years Of...,1,55,10,1
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,1,59,10,1
594,"Prince William Says He Is Still ""Working On"" a...",1,117,23,1
28,WATCH: Kendall Jenner’s first-ever Adidas ad,1,44,9,1
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",1,44,10,2


In [9]:
# Cache for the NLTK resources used by transform_text, filled on first call.
_TEXT_RESOURCES = {}


def _text_resources():
  """Return ``(stop_words, stemmer)``, building both on first use.

  The English stop-word list is loaded once and stored as a frozenset, so
  membership tests are constant-time and the corpus is not re-read for every
  token. If the 'stopwords' corpus is not installed, NLTK raises LookupError;
  the corpus is then downloaded and the lookup retried.
  """
  if not _TEXT_RESOURCES:
    try:
      words = nltk.corpus.stopwords.words('english')
    except LookupError:
      # The visible cells only download 'punkt_tab', so a fresh environment
      # may not have the stop-word corpus yet.
      nltk.download('stopwords')
      words = nltk.corpus.stopwords.words('english')
    _TEXT_RESOURCES['stopwords'] = frozenset(words)
    _TEXT_RESOURCES['stemmer'] = nltk.stem.PorterStemmer()
  return _TEXT_RESOURCES['stopwords'], _TEXT_RESOURCES['stemmer']


def transform_text(text):
  """Normalise a headline into a space-joined string of stemmed tokens.

  Steps, in order:
    1. lower-case the text and tokenize it with ``nltk.word_tokenize``;
    2. keep only tokens for which ``str.isalnum()`` is true, which discards
       punctuation and any token containing a non-alphanumeric character
       (for example a hyphen);
    3. remove English stop words;
    4. reduce each remaining token to its Porter stem.

  The former ``string.punctuation`` test was dropped: a token that passes
  ``isalnum()`` can never be a piece of the punctuation string, so that test
  never removed anything.

  Args:
    text: The raw title string.

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no token
    survives the filters.
  """
  stop_words, stemmer = _text_resources()
  tokens = nltk.word_tokenize(text.lower())
  kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]
  return " ".join(stemmer.stem(tok) for tok in kept)

# Store the cleaned, stemmed form of every headline in a new column.
balanced_data['transformed_text'] = balanced_data['title'].map(transform_text)

balanced_data.head(n=5)

Unnamed: 0,title,real,num_characters,num_words,num_sentences,transformed_text
4894,Fergie And Josh Duhamel Split After 8 Years Of...,1,55,10,1,fergi josh duhamel split 8 year marriag
20804,John Dickerson Replacing Charlie Rose on 'CBS ...,1,59,10,1,john dickerson replac charli rose morn
594,"Prince William Says He Is Still ""Working On"" a...",1,117,23,1,princ william say still work name new royal ma...
28,WATCH: Kendall Jenner’s first-ever Adidas ad,1,44,9,1,watch kendal jenner adida ad
17491,"Was Halsey ""Bamboozled"" Into Dating G-Eazy?!",1,44,10,2,halsey bamboozl date


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF bag-of-words over the stemmed headlines, with default vectorizer settings.
tfidf = TfidfVectorizer()

# NOTE(review): the vectorizer is fitted on every row here, before the
# train/test split in the next cell, so its vocabulary and IDF weights also
# see the held-out titles. Consider fitting on the training portion only.
# toarray() turns the sparse result into a dense array of rows x vocabulary size.
tfidf_matrix = tfidf.fit_transform(balanced_data['transformed_text'])
x = tfidf_matrix.toarray()

# Labels as a plain NumPy array, in the same row order as x.
y = balanced_data['real'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)