In [78]:
import spacy
import re
import numpy as np
import pandas as pd
from collections import Counter
import nltk
import gensim
import time
#sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, recall_score, precision_score, classification_report
from sklearn.externals import joblib
from lib.processor import *

from stanfordcorenlp import StanfordCoreNLP
from nltk import Tree

In [16]:
# The two NLP back ends used in this notebook: a spaCy pipeline (`nlp`), called in
# pre_process, and a Stanford CoreNLP client (`nlp2`), whose parse() output is
# turned into trees in feature_extractor.
SPACY_MODEL_NAME = 'en_core_web_sm'
# NOTE(review): machine-specific relative path -- adjust to the local CoreNLP install.
CORENLP_DIR = '../../../Downloads/stanford-corenlp-full-2018-10-05/stanford-corenlp-full-2018-10-05'

nlp = spacy.load(SPACY_MODEL_NAME)
nlp2 = StanfordCoreNLP(CORENLP_DIR)

dataset

In [65]:
# Read the raw dataset; feature_extractor below reads its `text` and `label` columns.
df = pd.read_csv("dataset.csv", sep=",")
# First five rows only, for quick end-to-end checks of the pipeline.
df_small = df.head(5)

# final feature extractor function

models

In [37]:
# Word2vec embeddings in binary word2vec format.  The path is written as a plain
# literal, like the joblib paths below, so this cell does not depend on an `os`
# name that the notebook itself never imports.
wv = gensim.models.KeyedVectors.load_word2vec_format("models/GoogleNews-vectors-negative300.bin.gz", binary=True)
# Must run before word_averaging is used: that function reads wv.vectors_norm.
wv.init_sims(replace=True)

# Pre-trained classifiers whose predict_proba outputs become features in
# feature_extractor.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load model files
# that come from a trusted source.
tfidf_logreg_model = joblib.load("models/tfidf/tfidf_logreg_model.pkl")
tfidf_nb_model = joblib.load("models/tfidf/tfidf_nb_model.pkl")
w2v_logreg_model = joblib.load("models/w2v/w2v_logreg_model.pkl")
w2v_nb_model = joblib.load("models/w2v/w2v_nb_model.pkl")
w2v_rforest_model = joblib.load("models/w2v/w2v_rforest_model.pkl")
w2v_svc_model = joblib.load("models/w2v/w2v_svc_model.pkl")



PCFG functions

In [40]:
# Regex fragments used by split_into_sentences.  They are raw strings so that
# regex escapes such as ``\s`` are not parsed as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"

def split_into_sentences(text):
    """Split ``text`` into sentences with a rule-based heuristic.

    Periods that belong to titles, initials, acronyms, company suffixes, "Ph.D."
    and a few website endings are temporarily replaced by ``<prd>``; every
    remaining ``.``, ``?`` and ``!`` is then treated as a sentence end.

    Parameters
    ----------
    text : str
        Raw text; newlines are treated as spaces.

    Returns
    -------
    numpy.ndarray
        1-D array of stripped sentences.  Trailing text without a terminating
        ``.``/``?``/``!`` is returned as the last sentence (it used to be
        dropped); an empty or whitespace-only input gives an empty array.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")
    # Protect periods that do not end a sentence.
    text = re.sub(prefixes, r"\1<prd>", text)
    text = re.sub(websites, r"<prd>\1", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly followed by a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    # Move a terminator out of a closing quote so the quote stays with its sentence.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")
    # Mark the real sentence ends, then restore the protected periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")
    sentences = [s.strip() for s in text.split("<stop>")]
    # Only the final segment can be empty (padding after the last terminator);
    # drop it in that case, but keep a real unterminated last sentence.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return np.array(sentences)

def _phrase_production_has_tag(production, pos_tag):
    """Shared substring test behind adj() and adv().

    ``production`` is first passed through remove_accented_chars and
    expand_contractions.  The result is True only when it contains ``pos_tag``,
    contains "VP" or "NP", and contains none of "ADVP", "WHADVP", "WHNP".

    All checks are plain substring tests, so e.g. "NNP" also satisfies the "NP"
    check.  "WHADVP" is already covered by the "ADVP" test; it is listed for
    clarity (the previous spelling "WHAVP" is not a Penn Treebank label).
    """
    production = expand_contractions(remove_accented_chars(production))
    if pos_tag not in production:
        return False
    if "VP" not in production and "NP" not in production:
        return False
    excluded_labels = ("ADVP", "WHADVP", "WHNP")
    return not any(label in production for label in excluded_labels)


def adj(x):
    """Return True if production string ``x`` contains "JJ" and passes the phrase checks.

    Always returns a bool; the non-matching case used to fall through to None.
    """
    return _phrase_production_has_tag(x, "JJ")


def adv(x):
    """Return True if production string ``x`` contains "RB" and passes the phrase checks.

    Always returns a bool; the non-matching case used to fall through to None.
    """
    return _phrase_production_has_tag(x, "RB")

POSTag functions

In [14]:
#helper functions
def pre_process(raw_text): #pre-processes the raw text from the dataset
    """Clean ``raw_text`` and return the result of running ``nlp`` on it.

    preprocess_text is called with special-character removal, stemming,
    lemmatisation and stop-word removal all switched off; newline and
    carriage-return characters are then deleted before the text is passed to
    ``nlp``.
    """
    cleaned = preprocess_text(
        raw_text,
        remove_special=False,
        stem=False,
        lemmatize=False,
        remove_stops=False,
    )
    for line_break in ("\n", "\r"):
        cleaned = cleaned.replace(line_break, "")
    return nlp(cleaned)

def get_pos_and_tag(text):
    """Collect the ``pos_`` and ``tag_`` attribute of every item in ``text``.

    Parameters
    ----------
    text : iterable
        Items exposing ``pos_`` and ``tag_`` attributes (get_features passes the
        object returned by pre_process).

    Returns
    -------
    tuple of numpy.ndarray
        ``(pos_arr, tag_arr)``: two 1-D string arrays in input order.  Both are
        empty string arrays when ``text`` has no items.
    """
    # Gather in one pass into Python lists and convert once; np.append inside
    # the loop would copy the whole array on every iteration.
    pos_labels = []
    tag_labels = []
    for token in text:
        pos_labels.append(token.pos_)
        tag_labels.append(token.tag_)
    return (np.array(pos_labels, dtype=str), np.array(tag_labels, dtype=str))

def get_features(raw_text): #raw_text is original text
    """Compute word-level and POS/tag-level features for one document.

    The text is run through pre_process; word statistics use the non-punctuation
    tokens, while the POS/tag ratios are taken over all tokens.

    Returns a list of 12 values, in this order:
    total_word_count, avg_word_length, lexical_diversity, repetition_top,
    repetition_all, NNP_percent, NNPS_percent, noun_percent, verb_percent,
    part_percent, det_percent, unknown_or_foreign_percent.
    Every ratio is 0 when its denominator is 0.
    """
    doc = pre_process(raw_text)

    # Word statistics (punctuation tokens excluded).
    words = [token.text for token in doc if not token.is_punct]
    word_counter = Counter(words)
    counts_desc = sorted(word_counter.values(), reverse=True)
    total_word_count = len(words)

    # POS / tag statistics over all tokens.
    pos_arr, tag_arr = get_pos_and_tag(doc)
    pos_counter = Counter(pos_arr)
    tag_counter = Counter(tag_arr)
    total_count = sum(tag_counter.values())  # same for tag and pos

    if total_word_count:
        avg_word_length = sum(map(len, words)) / total_word_count
        # distinct words over total words
        lexical_diversity = len(word_counter) / total_word_count
        # occurrences of the 20 most frequent words over total words
        repetition_top = sum(counts_desc[:20]) / total_word_count
        # counts weighted by 1/rank (rank 1 = most frequent) over total words
        repetition_all = sum(
            count / rank for rank, count in enumerate(counts_desc, start=1)
        ) / total_word_count
    else:
        avg_word_length = lexical_diversity = repetition_top = repetition_all = 0

    def share(counter, label):
        # Fraction of all tokens carrying `label`; 0 when there are no tokens.
        return counter.get(label, 0) / total_count if total_count else 0

    return [
        total_word_count,
        avg_word_length,
        lexical_diversity,
        repetition_top,
        repetition_all,
        share(tag_counter, "NNP"),
        share(tag_counter, "NNPS"),
        share(pos_counter, "NOUN"),
        share(pos_counter, "VERB"),
        share(pos_counter, "PART"),
        share(pos_counter, "DET"),
        share(pos_counter, "X"),
    ]

w2v functions

In [41]:
def w2v_tokenize_text(text):
    """Tokenize ``text`` into a flat list of word tokens.

    The text is split into sentences and then words with nltk's English
    tokenizers; tokens shorter than two characters are discarded.
    """
    return [
        word
        for sent in nltk.sent_tokenize(text, language='english')
        for word in nltk.word_tokenize(sent, language='english')
        if len(word) >= 2
    ]

def word_averaging(wv, words):
    """Average the word vectors of ``words`` into one document vector.

    Parameters
    ----------
    wv : object exposing ``vocab``, ``vectors_norm`` and ``vector_size``
        The loaded word-vector model.
    words : iterable
        Tokens to average.  An item that is already a numpy array is used as a
        vector directly; other items are looked up in ``wv.vocab`` and skipped
        when absent.

    Returns
    -------
    numpy.ndarray
        The mean vector passed through gensim.matutils.unitvec and cast to
        float32, or ``np.zeros(wv.vector_size)`` when no item contributed a
        vector.
    """
    vectors = []
    for word in words:
        if isinstance(word, np.ndarray):
            vectors.append(word)
        elif word in wv.vocab:
            vectors.append(wv.vectors_norm[wv.vocab[word].index])

    if not vectors:
        # Nothing to average: fall back to an all-zero vector.
        return np.zeros(wv.vector_size,)

    return gensim.matutils.unitvec(np.array(vectors).mean(axis=0)).astype(np.float32)

def word_averaging_list(wv, text_list):
    """Stack the word_averaging vector of every token list in ``text_list``.

    Returns a 2-D array with one row per entry of ``text_list``, in input order.
    """
    averaged_rows = []
    for tokens in text_list:
        averaged_rows.append(word_averaging(wv, tokens))
    return np.vstack(averaged_rows)

# MAIN FUNCTION

In [152]:
def _count_adj_adv_sentences(sentences):
    """Return ``(adj_count, adv_count)`` for the sentences of one document.

    Each sentence is parsed with ``nlp2``; it counts once toward ``adj_count``
    (``adv_count``) if at least one production of its tree satisfies adj() (adv()).
    """
    adj_count = 0
    adv_count = 0
    for sentence in sentences:
        tree = Tree.fromstring(nlp2.parse(sentence))
        productions = [str(production) for production in tree.productions()]
        if any(adj(production) for production in productions):
            adj_count += 1
        if any(adv(production) for production in productions):
            adv_count += 1
    return adj_count, adv_count


def feature_extractor(df): #input is original raw df (with columns labeled as text and label)
                            #output is the df of features and label (no text included in output)
    """Build the feature table for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``text`` and a ``label`` column.  They are accessed by name,
        so column order and additional columns do not matter.

    Returns
    -------
    pandas.DataFrame
        One row per input row with, in order: the three PCFG sentence counts, the
        twelve get_features values, the two tf-idf model probabilities, the four
        w2v model probabilities, and ``is_fake`` (the input label).

    Notes
    -----
    If parsing fails for a document, a RuntimeWarning is issued and both of its
    adjective/adverb sentence counts are left at 0; other rows are unaffected.
    """
    import warnings  # function-scope import keeps this cell self-contained

    x = df.text
    y = df.label

    dct = {} #initialize an empty dictionary

    #PCFG
    # Split every document once; the result feeds both the sentence total and
    # the parse-based counts.
    sentences_per_doc = x.apply(split_into_sentences)
    adj_counts = []
    adv_counts = []
    for row_label, sentences in sentences_per_doc.items():
        try:
            adj_count, adv_count = _count_adj_adv_sentences(sentences)
        except Exception as exc:
            # Best effort: a failing parse must not abort the whole extraction,
            # but it should not go unnoticed either.
            warnings.warn(
                "PCFG parsing failed for row %r; adjective/adverb sentence counts "
                "set to 0 (%s: %s)" % (row_label, type(exc).__name__, exc),
                RuntimeWarning,
            )
            adj_count, adv_count = 0, 0
        adj_counts.append(adj_count)
        adv_counts.append(adv_count)

    dct['total_sentences'] = sentences_per_doc.apply(len)
    dct['num_sentences_with_adj_in_phrase'] = pd.Series(adj_counts, index=df.index, dtype="int64")
    dct['num_sentences_with_adv_in_phrase'] = pd.Series(adv_counts, index=df.index, dtype="int64")

    #POStag
    # Same order as the list returned by get_features.
    pos_feature_names = [
        'total_word_count', 'avg_word_length', 'lexical_diversity', 'repetition_top',
        'repetition_all', 'NNP_percent', 'NNPS_percent', 'noun_percent',
        'verb_percent', 'part_percent', 'det_percent', 'unknown_or_foreign_percent',
    ]
    for feature_name, values in zip(pos_feature_names, zip(*x.apply(get_features))):
        dct[feature_name] = values

    #tfidf
    dct['tfidf_prob_nb'] = tfidf_nb_model.predict_proba(x)[:,1]
    dct['tfidf_prob_logreg'] = tfidf_logreg_model.predict_proba(x)[:,1]

    #w2v
    w2v_text = df.text.map(lambda text: preprocess_text(text, True, True, False, True, True))
    w2v_tokenized = w2v_text.map(w2v_tokenize_text).values
    data_word_average = word_averaging_list(wv, w2v_tokenized)
    x_w2v = pd.DataFrame(data_word_average)
    dct['w2v_prob_nb'] = w2v_nb_model.predict_proba(x_w2v)[:,1]
    dct['w2v_prob_logreg'] = w2v_logreg_model.predict_proba(x_w2v)[:,1]
    dct['w2v_prob_svc'] = w2v_svc_model.predict_proba(x_w2v)[:,1]
    dct['w2v_prob_rforest'] = w2v_rforest_model.predict_proba(x_w2v)[:,1]

    dct['is_fake'] = y

    return pd.DataFrame(dct)

In [155]:
# Smoke test on the 5-row sample (df_small). Takes about 15 seconds, which the
# original author attributes to the PCFG parsing step.
feature_extractor(df_small)

Unnamed: 0,total_sentences,num_sentences_with_adj_in_phrase,num_sentences_with_adv_in_phrase,total_word_count,avg_word_length,lexical_diversity,repetition_top,repetition_all,NNP_percent,NNPS_percent,...,part_percent,det_percent,unknown_or_foreign_percent,tfidf_prob_nb,tfidf_prob_logreg,w2v_prob_nb,w2v_prob_logreg,w2v_prob_svc,w2v_prob_rforest,is_fake
0,76,33,19,1549,4.571336,0.371853,0.339574,0.130808,0.111924,0.007312,...,0.028121,0.113611,0.0,0.003171641,0.2068,0.000593814,0.440473,0.349879,0.5,0
1,27,12,8,488,4.534836,0.561475,0.315574,0.108682,0.099822,0.003565,...,0.030303,0.117647,0.0,0.9980483,0.637621,0.999996,0.825876,0.883786,0.6,1
2,20,13,3,443,4.458239,0.577878,0.34763,0.107576,0.099174,0.004132,...,0.008264,0.128099,0.0,0.9972521,0.865423,1.0,0.977274,0.996848,0.9,1
3,3,1,0,61,4.344262,0.836066,0.491803,0.136849,0.169231,0.015385,...,0.030769,0.138462,0.0,6.997447e-10,0.042518,1.646796e-09,0.021398,0.001886,0.0,0
4,50,20,3,505,4.879208,0.534653,0.328713,0.109228,0.053819,0.003472,...,0.008681,0.064236,0.0,1.0,0.937735,0.9999966,0.823087,0.874765,1.0,1
