In [1]:
import sys
import pandas as pd
import numpy as np
import nltk
import statistics as stats
import re
from collections import defaultdict, Counter
import math

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download("wordnet")
nltk.download('omw-1.4')
import sys
pd.options.mode.chained_assignment = None

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jacob\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
df = pd.read_csv("news_sample.csv")#[:10]
df.drop_duplicates(subset='content', inplace=True,ignore_index=True)

In [3]:
def zipfsFiltering(df,quantiles=[0.05,0.95],generateGraph=True):
  #df["content"] = df["content"].lower()
  for y in ["content"]:
    tokens = nltk.tokenize.word_tokenize(df[y])
    allWordsDist = nltk.FreqDist(w.lower() for w in tokens)

    words = [[word,dict(allWordsDist.most_common())[word]] for word in dict(allWordsDist.most_common()) if word.isalpha()]
    words = sorted(words,key=lambda k: k[1],reverse=True)
      #print(words)

    wordCount = [x[1] for x in words]
    lower = int(np.percentile(wordCount,100*(quantiles[0])))
    upper = int(np.percentile(wordCount,100*(quantiles[1])))
    
    for word in words:
      if word[1] >= upper:
        df[y] = df[y].replace(f" {word[0]} "," ")
        words.remove(word)
      elif word[1] <= lower:
        df[y] = df[y].replace(f" {word[0]} "," ")
        words.remove(word)
    
      
  return df

In [4]:
def removeStopwords(df):
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(df["content"])
    df["content"] = ' '.join([word for word in tokens if not word in stop_words])
    return df

In [5]:
def keywordFiltering(df):
    for y in ["content"]:
        try:
            df[y] = str(df[y]).lower() #make lowercase
            df[y] = re.sub(r"\t"," ",str(df[y])) #Remove tab
            df[y] = re.sub(r"\n"," ",str(df[y])) #Remove newline
            
            #df[y] = re.sub(r"\bhttp.*[^ ]","<URL>",str(df[y])) #Remove Url
            #df[y] = re.sub(r"www\..+?","<URL> ",str(df[y]))
            #df[y] = re.sub(r"\b.*\.com.*\b","<URL>",str(df[y]))  
            df[y] = re.sub(r"((http://|https://)*(www\.)*([\w\d\._-]+)(\.[\w]{2,})(\.)*?(/[\w\d#%=&/?\.+_-]+)*(\.[\w]+)*)",
                                "<URL>", str(df[y]))              
            
            df[y] = re.sub(r"\d{4}[-|\/|\\]\d{2}[-|\/|\\]\d{2}\b","<DATE>",str(df[y])) #Remove Date
            df[y] = re.sub(r"\b\d{2}[-|\/|\\]{1}\d{2}[-|\/|\\]{1}\d{2}\b","<DATE>",str(df[y])) 
            df[y] = re.sub(r"\b\d{2}[-|\/|\\]{1}\d{2}[-|\/|\\]{1}\d{4}\b","<DATE>",str(df[y]))
            df[y] = re.sub(r"((jan[uary]*|feb[ruary]*|mar[ch]*|apr[il]*|may|jun[e]*|jul[y]*|aug[ust]*|sep[tember]*|oct[ober]*|nov[ember]*|dec[ember]*) ([\d]+(\w{2})*) ?(rd|st|th+))",
                                "<DATE>", str(df[y]))
            df[y] = re.sub(r"\d{1,2}?(rd|st|th)", "<DATE>", str(df[y])) #match format num(th, rd, st)
                
            df[y] = re.sub(r"\b[\w\.\-]+[\d\w]+?[@][\w]+?[\.][a-z]{2,}\b", "<EMAIL>", str(df[y])) #Remove email 
            
            #df[y] = re.sub(r".+@.+","<Twitter>",str(df[y])) #Can remove twitter 
            
            df[y] = re.sub(r"[0-9]+[\.|,|:|0-9]*","<NUM>",str(df[y])) #Remove num

            df[y] = re.sub(r"[^\s\w\d]", "", str(df[y])) #remove punctuation
            df[y] = re.sub(r" {2,}"," ",str(df[y])) #Remove extra white space

        except:
            if not isinstance(df["content"],str):
                df[y] = ""
    return df

In [6]:
def applyStemming(df):
    tokens = word_tokenize(df["content"])
    tokenTags = nltk.tag.pos_tag(tokens)
    ret = []
    for (x,y) in tokenTags:
        if "VB" in y:
            ret.append(PorterStemmer().stem(x))
        else:
            ret.append(WordNetLemmatizer().lemmatize(x))
    df["content"] = ' '.join(ret)
    return df

In [7]:
def exploringData(df):
    propNounsFake = 0
    fakeTotal = 0

    propNounsElse = 0
    elseTotal = 0


    for y in ["content"]:
        for x in range(0,len(df)):
            try:
                if df["type"][x].lower() == 'fake':
                    fakeTotal +=1
                    sentences = sent_tokenize(df[y][x])
                    words = [word_tokenize(sentence.lower()) for sentence in sentences]
                    words = words[0]
                    print(words)

                    tagged_words = [nltk.pos_tag(sentence) for sentence in words]
                    print(f"tags: {tagged_words}")
                    proper_nouns = []
                    for sentence in tagged_words:
                        for word, tag in sentence:
                            if tag == 'NNP': # NNP denotes proper noun
                                proper_nouns.append(word)
                    print(proper_nouns)
                    propNounsFake += len(set(proper_nouns))


                else:
                    elseTotal +=1
                    sentences = sent_tokenize(df[y][x])
                    words = [word_tokenize(sentence.lower()) for sentence in sentences]
                    words = words[0]
                    print(words)

                    tagged_words = [nltk.pos_tag(sentence) for sentence in words]
                    print(f"tags: {tagged_words}")
                    proper_nouns = []
                    for sentence in tagged_words:
                        for word, tag in sentence:
                            if tag == 'NNP': # NNP denotes proper noun
                                proper_nouns.append(word)
                    print(proper_nouns)
                    propNounsElse += len(set(proper_nouns))
            except:
                pass

    print(f"Prop nouns fake {propNounsFake/fakeTotal} else: {propNounsElse/elseTotal}")

In [8]:
def uniqueGraph(df):
    fakeArticles = []
    reliableArticles = []

    fakeWords = []
    reliableWords = []

    #for y in df["content"]:
    for x in range(0, len(df)):
        if df["type"][x].lower() == "fake":
            fakeWords = Counter(sorted(word_tokenize(df["content"][x])))
            fakeArticles.append(len(fakeWords))
        else:
            reliableWords = Counter(sorted(word_tokenize(df["content"][x])))
            reliableArticles.append(len(reliableWords))

    AvFake = sum(fakeArticles)/len(fakeArticles)
    AvReliable = sum(reliableArticles)/len(reliableArticles)
    dif = (AvFake-AvReliable)/AvFake*100

    print("Unique words in fake articles: " + str(AvFake))
    print("Unique words in reliable articles: " + str(AvReliable))
    print("Difference: {} %".format(math.floor(dif)))

In [9]:
def fakenessFromWord(df, word):

    fakeWord = 0
    reliableWord = 0

    fakeNoWord = 0
    reliableNoWord = 0

    word = word.lower()


    for x in range(0, len(df)):
        result = re.search(rf"\b{word}\b", str(df["content"][x]))
        try:
            if not(result == None): #the word is found
                if df["type"][x].lower() == "fake":
                    fakeWord += 1
                else:
                    reliableWord += 1
            else:
                if df["type"][x].lower() == "fake":
                    fakeNoWord += 1
                else:
                    reliableNoWord += 1
        except:
            pass
    print("fakeword: {}\n reliableword: {}\n fakenoword: {} \n reliableNoword: {}".format(fakeWord, reliableWord, fakeNoWord, reliableNoWord))
    print(df.shape)
    #percentage of fake articles with the word out of all fake articles
    preFake = (1 - (fakeNoWord/(fakeNoWord + fakeWord)))*100
    print("Percentage of fake articles with the word: {}%".format(preFake))

    #percentage of reliable articles with the word out of all reliabel articles
    preReliable = (1 - (reliableNoWord/(reliableNoWord + reliableWord)))*100
    print("Percentage of reliable articles with the word: {}%".format(preReliable))

    #out of all articles with the word X% of them are fake
    fakeWordCorrelation = (fakeWord/(fakeWord + reliableWord))*100
    print(fakeWordCorrelation)

In [10]:
def exclamationFunction(df):

    fakeExclamations = []
    nonFakeExclamations = []

    fakeNoExclamations = 0
    nonFakeNoExclamations = 0
    k = 0

    for x in range(0, len(df)):
        if str(df['type'][x]).lower() == 'fake':
            # counting number of fake articles
            k += 1
        excl = re.findall('!', df['content'][x])
        if len(excl) == 0: 
            if str(df['type'][x]).lower() == 'fake':
                fakeNoExclamations += 1
            else:
                nonFakeNoExclamations += 1
        else:
            if str(df['type'][x]).lower() == 'fake':
                fakeExclamations.append(len(excl))
            else:
                nonFakeExclamations.append(len(excl))

    # mean number of exclamation marks in fake and not fake articles
    fakeExclMean = stats.mean(fakeExclamations)
    nonFakeExclMean = stats.mean(nonFakeExclamations)

    # percentage of fake articles without exclamations
    fNE = (fakeNoExclamations / k)*100

    print("If the article is fake and has exclamation marks, there are on average {} of them".format(fakeExclMean))
    print("If the article isn't fake, and has exclamation marks, there are on average {} of them".format(nonFakeExclMean))

    print("Of the {} total articles, {} of them are fake".format(len(df), k))
    print("Of the {} fake articles, {} don't have exclamation marks in - {}%".format(k, fakeNoExclamations, fNE))

    # if fake, percentage chance of exclamation
    ifFake = ((k/len(df)) * (len(fakeExclamations)/k))/(
                (k/len(df)) * (len(fakeExclamations)/k) + (
                (1 - (k/len(df))) * (1 - (len(fakeExclamations)/k)))) * 100

    # if not fake, perecentage chance of exclamation
    ifNonFake = (((len(df) - k)/len(df)) * (len(nonFakeExclamations)/(
                len(df) - k))) / ((((len(df) - k)/len(df)) * (
                len(nonFakeExclamations)/(len(df) - k))) + ((1 - (
                (len(df) - k)/len(df))) * (1 - (len(nonFakeExclamations)/(
                len(df) - k))))) * 100

    print("If an article is fake, there is a {}% chance that it has exclamation marks".format(ifFake))
    print("If an article isn't fake, there is a {}% chance that it has exclamation marks".format(ifNonFake))


In [11]:
# changing types to a binary classification
# Reliable is either with type reliable, clickbait or political - becomes 1
# Fake is the rest - becomes 0

def classifierRelOrFake(df):
    for x in range(0, len(df)):
        if df['type'][x] == 'reliable' or df['type'][x] == 'clickbait' or df['type'][x] == 'political':
            df['type'][x] = 1 
        else:
            df['type'][x] = 0 
    return df

In [12]:
# for x in range(len(df)):
#     print("Authors:",df['authors'][x])



In [13]:
# for x in range(0,len(df)):
#     df.iloc[x] = zipfsLaw.zipfsFiltering(df.iloc[x])
#     df.iloc[x] = removeStopwords(df.iloc[x])
#     df.iloc[x] = keywordFiltering(df.iloc[x])
#     df.iloc[x] = stemming.applyStemming(df.iloc[x])

    #tilføj funktioner husk kun at give en linje
# fakenessFromWord(df, "trump")
exclamationFunction(df)
classifierRelOrFake(df)
# uniqueGraph(df)

# exploringData(df)
# print(df)



X_train, X_test, y_train, y_test = train_test_split(df['content'], df['type'],
                                                    test_size=0.2,random_state=0)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, train_size=0.5,
                                                random_state=0)
# lr = LinearRegression()
# model = lr.fit(X_train, y_train)
# print(model.score(X_train, y_train))
# y_pred = model.predict(X_test)

df.to_csv("Results.csv")



If the article is fake and has exclamation marks, there are on average 3.114503816793893 of them
If the article isn't fake, and has exclamation marks, there are on average 2.388888888888889 of them
Of the 239 total articles, 146 of them are fake
Of the 146 fake articles, 15 don't have exclamation marks in - 10.273972602739725%
If an article is fake, there is a 93.20208566833975% chance that it has exclamation marks
If an article isn't fake, there is a 13.260456273764259% chance that it has exclamation marks


In [17]:
from sklearn.naive_bayes import MultinomialNB

def naive_bayes(df):

    x = df['content'].to_numpy()
    y = df['type'].to_numpy()

    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=0)
    multiNB = MultinomialNB()
    multiNB.fit(x_train, y_train)

    y_pred = multiNB.predict(x_val)

    nb_accuracy = metrics.accuracy_score(y_val, y_pred)
    return nb_accuracy


test = naive_bayes(df)
print(test)

MultinomialNB()


ValueError: could not convert string to float: 'Shopping: Yule Gifts for Witches and Wizards\r\n\r\n% of readers think this story is Fact. Add your two cents.\r\n\r\nHeadline: Bitcoin & Blockchain Searches Exceed Trump! Blockchain Stocks Are Next!\r\n\r\nWhen I was in Treadwell’s occult bookshop in London earlier this week I photographed the display of things that would really make wonderful gifts for anyone interested in witchcraft and magic.\r\n\r\nI was particularly taken with the shops exclusive gift boxes for witches and magicians. You can see the Witch’s Gift Box to the left and in the picture above, nestled in among a whole range of books that would also be great witchy gifts. (And yes, I was pretty chuffed to see my own book, Pagan Portals – Candle Magic, in the front row.)\r\n\r\nBut here is what Treadwell’s has to say about the Witch’s Gift Box:\r\n\r\nFor that special Witch in your life we have the perfect gift, a box full of delights. We have selected what we think are all the things a Witch needs, perhaps most important of all is a great big mug for cups of tea while they contemplate the mysteries of the universe.\r\n\r\nThe Witch’s Gift Box includes: A pendulum, Rider Waite tarot deck, a small crystal ball, a hare pentacle, a witch doll, a bag of cone incense, a book on herbal healing, 4 quarter candles, a Witch’s scroll and Witch’s brew mug. All carefully packed in a beautiful box. Price: £50.\r\n\r\nMind you, although that really appealed to me, I do already have everything it contains, so I’m not personally dropping hints. However, if anyone was wanting to spend a bit of money on me for Yule, I don’t actually own either of the main items in Treadwell’s Crowley Magick Box: The Book of the Law and the Thoth Tarot.\r\n\r\n\r\n\r\nHere’s the description in full:\r\n\r\nCrowley Magick Box: A box to start you on your way with the mysteries of Magick. The essential Book of the Law, with black and while pillar candles, incense, oil, ink, the beautiful Thoth tarot deck, a Baphomet scroll and mug. All packed in a beautiful box, perfect to store your growing collection of magical tools. Price: £50.\r\n\r\nTreadwell’s Bookshop is at 33 Store Street, London, WC1E 7BS. Tel: 020 7419 8507. Email: info@treadwells-london.com.\r\n\r\nLinks\r\n\r\nhttps://www.treadwells-london.com/\r\n\r\nPagan Portals – Candle Magic\r\n\r\nWitch’s Gift Box\r\n\r\nCrowley Magick Box\r\n\r\nTo read more posts like this visit A Bad Witch’s Blog at www.badwitch.co.uk\r\n\r\nSource: http://www.badwitch.co.uk/2017/12/shopping-yule-gifts-for-witches-and.html'