In [0]:
import re
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from collections import Counter
# !pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import pandas as pd
# !pip install empath
from empath import Empath

In [0]:
# Stop-word set shared by every cleanText call; filled on first use.
_CLEAN_TEXT_STOPWORDS = None

def cleanText(text):
  """Return a cleaned, stemmed version of `text`.

  Steps, in order:
    1. remove dotted-quad IP addresses,
    2. remove URLs (anything starting with ``http`` up to the next whitespace),
    3. tokenize on runs of word characters, which drops punctuation,
    4. drop tokens found in NLTK's stop-word list,
    5. stem the remaining tokens with the English Snowball stemmer.

  Args:
    text: the raw statement/article as a string.

  Returns:
    The surviving stems joined by single spaces (possibly an empty string).
  """
  global _CLEAN_TEXT_STOPWORDS
  if _CLEAN_TEXT_STOPWORDS is None:
    # Build the stop-word collection once: evaluating stopwords.words() inside
    # the filter below would rebuild the list for every single token, and a
    # set gives constant-time membership tests.
    # stopwords.words() is called without a language argument --
    # presumably this covers every installed language; TODO confirm.
    _CLEAN_TEXT_STOPWORDS = frozenset(stopwords.words())

  # removing IP addresses from text
  text = re.sub(r'[0-9]+(?:\.[0-9]+){3}', '', text)

  # removing URLs from text
  text = re.sub(r'http\S+', '', text)

  # remove punctuations and creating text tokens
  text_tokens = RegexpTokenizer(r'\w+').tokenize(text)

  # removing stop words (comparison is case-sensitive)
  tokens_without_sw = [word for word in text_tokens if word not in _CLEAN_TEXT_STOPWORDS]

  # stemming using snowball stemmer, skipping stems that come back empty
  stemmer = SnowballStemmer("english")
  stems = (stemmer.stem(token) for token in tokens_without_sw)

  # joining sentences
  return ' '.join(stem for stem in stems if stem != "")

In [0]:
# Empath demo: score one sample sentence against a handful of categories.
lexicon = Empath()
# The text to analyse must be defined in this cell so that the notebook
# survives "Restart & Run All"; no earlier cell defines it.
# NOTE(review): this sentence is only illustrative -- replace it with the text
# that was originally analysed if the recorded output has to be reproduced.
sample_text = "he hit the other person"
lexicon.analyze(sample_text, categories=["violence","crime","pride","sympathy","deception","war"])

{'crime': 0.0,
 'deception': 0.0,
 'pride': 0.0,
 'sympathy': 0.0,
 'violence': 1.0,
 'war': 0.0}

In [0]:
# Lexical feature extraction

# length of the whole article/statement
def articleLength(text):
  """Return ``len(text)``; for a string this is its number of characters."""
  length = len(text)
  return length

# average word length
def averageWordLength(text):
  """Return the mean length of the whitespace-separated words in `text`.

  Args:
    text: the statement/article as a string.

  Returns:
    The average word length truncated to an int, or 0 when `text` contains
    no words at all (empty or whitespace-only input).
  """
  words = text.split()
  if not words:
    # Nothing to average; dividing by len(words) would raise ZeroDivisionError.
    return 0
  average = sum(len(word) for word in words) / len(words)
  return int(average)

# count of digit characters that make up the numbers in the text
def countOfNumbers(text):
  """Return how many digit characters occur in `text`.

  Every run of digits is found and the lengths of those runs are summed, so
  "a12 b3" yields 3 (not 2): the value is the number of digits, not the
  number of distinct numbers.

  Args:
    text: the statement/article as a string.

  Returns:
    Total count of digit characters as an int (0 when there are none).
  """
  # Raw string: "\d" in a plain string literal is an invalid escape sequence.
  digit_runs = re.findall(r"\d+", text)
  return sum(len(run) for run in digit_runs)

# count of exclamation marks
def countOfExclaimationMarks(text):
  """Return the total number of '!' characters found in `text`."""
  # each match is a run of consecutive marks; add up the run lengths
  return sum(map(len, re.findall("!+", text)))

# get tokens of text
def textTokens(text):
  """Tokenize `text` with ``nltk.word_tokenize`` and return the result."""
  tokens = nltk.word_tokenize(text)
  return tokens

# count of adjectives
def countOfAdjectives(tokens):
  """Return how many of `tokens` NLTK tags as JJ, JJR or JJS.

  Args:
    tokens: a sequence of word tokens, as accepted by ``nltk.pos_tag``.

  Returns:
    The number of tokens carrying one of the three adjective tags.
  """
  adjective_tags = ('JJ', 'JJR', 'JJS')
  return sum(1 for _word, pos in nltk.pos_tag(tokens) if pos in adjective_tags)

# word count
def wordCount(tokens):
  """Return the number of tokens that contain at least one letter or digit.

  Tokens made only of punctuation (for example "," or "...") are not counted.
  """
  kept = 0
  for token in tokens:
    # same pattern as a pre-compiled matcher: must contain a letter or digit
    if re.match('.*[A-Za-z0-9].*', token):
      kept += 1
  return kept

In [0]:
# sentimental features
# Analyzer instance shared by every getSentimentValues call; created on first use.
_SENTIMENT_ANALYZER = None

def getSentimentValues(text):
  """Return the VADER polarity scores for `text`.

  Args:
    text: the statement/article as a string.

  Returns:
    The dictionary produced by ``SentimentIntensityAnalyzer.polarity_scores``;
    the feature extraction reads its 'neg', 'pos' and 'neu' entries.
  """
  global _SENTIMENT_ANALYZER
  if _SENTIMENT_ANALYZER is None:
    # Construct the analyzer only once instead of once per scored text.
    _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
  return _SENTIMENT_ANALYZER.polarity_scores(text)

In [0]:
# get label value for Liar
def getLabelLiar(label):
  """Collapse a LIAR rating into the binary label "FALSE" or "TRUE".

  The comparison is case-insensitive. Ratings outside the six listed below
  give None.
  """
  binary_by_rating = {
    "pants-fire": "FALSE",
    "false": "FALSE",
    "barely-true": "FALSE",
    "half-true": "TRUE",
    "mostly-true": "TRUE",
    "true": "TRUE",
  }
  return binary_by_rating.get(label.lower())

In [0]:
# get label value for Real and Fake
def getLabelRealAndFake(label):
  """Map "fake"/"real" (any letter case) to "FALSE"/"TRUE"; anything else gives None."""
  normalized = label.lower()
  return {"fake": "FALSE", "real": "TRUE"}.get(normalized)

In [0]:
# Feature Extraction
def featureExtraction(dataset,dataset_name):
  """Build the feature table for a dataset, one row per record.

  Args:
    dataset: an iterable of records; each record is indexed with
      'statement' (the text) and 'label' (the raw class label).
    dataset_name: "liar" selects the LIAR label mapping; any other value
      selects the real/fake label mapping.

  Returns:
    A pandas DataFrame holding the original and cleaned statement, the
    lexical features, the three sentiment scores and the binary label.
  """
  column_order = [
    "Statement",
    "Cleaned Statement",
    "Article Length",
    "Average Word Length",
    "Count of Numbers",
    "Count of Exclaimation Marks",
    "Count of Adjectives",
    "Word Count",
    "Negative Sentiment",
    "Positive Sentiment",
    "Neutral Sentiment",
    "Label",
  ]
  # one growing list per output column, kept in the final column order
  columns = {name: [] for name in column_order}

  # the label mapping depends only on the dataset, so pick it up front
  to_label = getLabelLiar if dataset_name == "liar" else getLabelRealAndFake

  for row in dataset:
    statement = row['statement']

    # lexical features computed straight from the raw text
    columns["Statement"].append(statement)
    columns["Cleaned Statement"].append(cleanText(statement))
    columns["Article Length"].append(articleLength(statement))
    columns["Average Word Length"].append(averageWordLength(statement))
    columns["Count of Numbers"].append(countOfNumbers(statement))
    columns["Count of Exclaimation Marks"].append(countOfExclaimationMarks(statement))

    # lexical features that need the token list
    tokens = textTokens(statement)
    columns["Count of Adjectives"].append(countOfAdjectives(tokens))
    columns["Word Count"].append(wordCount(tokens))

    # sentiment features
    sentiment = getSentimentValues(statement)
    columns["Negative Sentiment"].append(sentiment['neg'])
    columns["Positive Sentiment"].append(sentiment['pos'])
    columns["Neutral Sentiment"].append(sentiment['neu'])

    # label of text
    columns["Label"].append(to_label(row['label']))

  # assemble all columns into one dataframe
  return pd.DataFrame(columns)

In [0]:
# write a dataframe out as a CSV file
def createCSV(data,path):
  """Save `data` to `path` in CSV format, leaving out the index column."""
  data.to_csv(path_or_buf=path, index=False)

In [0]:
# Feature extraction of LIAR dataset: each input split is paired with its output file
files = ['train.tsv','test.tsv','valid.tsv']
feature_files = ['liar_train_features.csv','liar_test_features.csv','liar_validation_features.csv']
main_path = '/content/drive/My Drive/Fake news detection/preprocessing/liar/'
for split_file, split_features_file in zip(files, feature_files):
  # read liar file as a list of row dictionaries
  # assumes the first line of each TSV is a header that names columns 1 and 2
  # 'label' and 'statement', which is what featureExtraction reads -- TODO confirm
  dataset = pd.read_csv(
      main_path+split_file,
      delimiter='\t',
      encoding='utf-8',
      usecols=[1,2],
      header=0,
  ).to_dict(orient='records')
  # extract features
  liar_features = featureExtraction(dataset,'liar')
  # create csv of features
  createCSV(liar_features,main_path+split_features_file)
  print("\n "+split_file+" done")
                  


 train.tsv done

 test.tsv done

 valid.tsv done


In [14]:
# read dataset
main_path = '/content/drive/My Drive/Fake news detection/preprocessing/real_and_fake/'
news_raw = pd.read_csv(main_path+'fake_or_real_news.csv', usecols=[2,3], header=0)

# only select stories with lengths gt 1 -- there are some texts with len = 0 and 1.
# Cells that are not strings (for example a missing value read as NaN) are
# dropped as well, because len() cannot be applied to them.
has_story = list(news_raw['text'].apply(lambda story: isinstance(story, str) and len(story) > 1))

# rename column text to statement; rename() returns a new frame, so the raw
# frame stays untouched and the cell can be re-run safely
news_statements = news_raw[has_story].rename(columns={'text':'statement'})
print('Found %s texts.' % len(news_statements))

# change the data to a list of row dictionaries
real_and_fake = news_statements.to_dict(orient='records')

# extract features
features = featureExtraction(real_and_fake,'real_and_fake')
# create csv of features
createCSV(features,main_path+'fake_or_real_news_features.csv')


Found 6299 texts.


In [15]:
# binary label ("FALSE"/"TRUE") of every record, in dataset order
all_labels = [getLabelRealAndFake(record['label']) for record in real_and_fake]


6299


In [0]:
# attach the binary labels to the previously saved feature table and store the result
# NOTE(review): 'fake_or_real_news_features_original.csv' is not written by any
# visible cell -- it presumably has to exist already, with one row per entry of all_labels
df = pd.read_csv(main_path+'fake_or_real_news_features_original.csv').assign(Label=all_labels)
df.to_csv(main_path+'fake_or_real_news_features_original_with_label.csv', index=False)

In [0]:
# Reference notes on timing code in a notebook, taken from:
# https://colab.research.google.com/github/jakevdp/PythonDataScienceHandbook/blob/master/notebooks/01.07-Timing-and-Profiling.ipynb#scrollTo=FxQa7OOGDIH5
# %%timeit is the cell magic: it reports the run time of the running cell.
# NOTE(review): IPython documents cell magics as belonging on the first line
# of a cell -- confirm this placement behaves as intended.
%%timeit
# %timeit is the line magic: it reports the run time of a single command
# written after it on the same line (no command is given here).
%timeit

The slowest run took 23.02 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 10.9 µs per loop
