In [1]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import nltk
import spacy
from time import time
from tqdm import tqdm
from pprint import pprint
import pandas as pd
import random
import pickle
import os

In [2]:
# Load the spaCy pipeline once for the whole notebook; calling `nlp(text)`
# turns a raw string into a parsed document that the feature functions
# below consume (they read `.text`, `.doc` and per-token attributes).
nlp = spacy.load('en_core_web_sm')

In [3]:
def chargrams(hl, body, size):
    """Count the headline's character n-grams that also occur in the body.

    Every contiguous substring of length ``size`` is taken from ``hl.text``
    (overlapping windows, one per start offset) and counted once for each
    window that appears anywhere in ``body.text``.

    Args:
        hl: Headline object exposing a ``.text`` string.
        body: Body object exposing a ``.text`` string.
        size: Length of each character n-gram.

    Returns:
        int: Number of headline windows found in the body. This is 0 when
        the headline is shorter than ``size``.
    """
    headline_text = hl.text
    body_text = body.text
    hits = 0
    for start in range(len(headline_text) - size + 1):
        # The window must be cut from the *headline*; slicing the body here
        # would make the membership test below trivially true.
        window = headline_text[start:start + size]
        if window in body_text:
            hits += 1
    return hits

def ngrams(hl, body, size):
    """Count the headline's token n-grams whose text occurs in the body.

    Slides a window of ``size`` consecutive tokens over ``hl.doc``. A window
    containing any punctuation token is ignored; every remaining window is
    counted when its ``.text`` is a substring of ``body.text``.

    Args:
        hl: Headline object exposing ``.doc`` (a sliceable token sequence).
        body: Body object exposing a ``.text`` string.
        size: Number of tokens per n-gram.

    Returns:
        int: Number of punctuation-free headline n-grams found in the body.
    """
    tokens = hl.doc
    matches = 0
    for start in range(len(tokens) - size + 1):
        window = tokens[start:start + size]
        # N-grams that span punctuation are not considered at all.
        if any(tok.is_punct for tok in window):
            continue
        if window.text in body.text:
            matches += 1
    return matches

def clean(text):
    """Return the lowercase forms of all tokens glued into one string.

    Each item of ``text`` contributes its ``lower_`` attribute; the pieces
    are concatenated with no separator, so token boundaries are not
    preserved in the result.
    """
    lowered = [token.lower_ for token in text]
    return ''.join(lowered)

def jaccard_sim(hline, body):
    """Token-level Jaccard similarity between a headline and a body.

    Both inputs are reduced to the set of their lowercase token forms
    (``token.lower_``) and the score is ``|H & B| / |H | B|``.

    The sets are built from whole tokens on purpose: turning the texts into
    a single string first and calling ``set()`` on it would compare the sets
    of *characters*, which is close to 1.0 for almost any pair of English
    texts and carries no signal.

    Args:
        hline: Iterable of tokens exposing ``lower_`` (e.g. a spaCy Doc).
        body: Iterable of tokens exposing ``lower_``.

    Returns:
        float: Similarity in ``[0.0, 1.0]``; ``0.0`` when the body has no
        tokens (which also covers the case where both inputs are empty).
    """
    headline_tokens = {token.lower_ for token in hline}
    body_tokens = {token.lower_ for token in body}
    if not body_tokens:
        # Avoids 0/0 when both sides are empty; an empty body shares nothing.
        return 0.0
    shared = headline_tokens & body_tokens
    combined = headline_tokens | body_tokens
    return len(shared) / len(combined)

def make_features(X):
    """Build the feature matrix for a collection of headline/body pairs.

    ``X`` is iterated (wrapped in a tqdm progress bar) and must yield
    3-tuples; the first element is ignored, the second is the headline and
    the third is the body. Each pair produces a row of nine values, in this
    order:

        0      jaccard_sim score
        1-4    ngrams hits for sizes 2, 3, 4, 5
        5-8    chargrams hits for sizes 4, 8, 16, 32

    Returns:
        numpy.ndarray built from the list of rows (one row per item of X).
    """
    token_sizes = (2, 3, 4, 5)
    char_sizes = (4, 8, 16, 32)

    rows = []
    for _, hl_doc, body_doc in tqdm(X):
        row = [jaccard_sim(hl_doc, body_doc)]
        row.extend(ngrams(hl_doc, body_doc, n) for n in token_sizes)
        row.extend(chargrams(hl_doc, body_doc, n) for n in char_sizes)
        # Guard against the size tuples drifting out of sync with the layout.
        assert len(row) == 9
        rows.append(row)
    return np.array(rows)

In [4]:
# Small hand-made sample documents, parsed with the `nlp` pipeline.
# `headline` is a five-word placeholder phrase.
headline = nlp('Lorem ipsum dolor sit amet')
# `body` starts with the headline verbatim and then continues with more text.
body = nlp('Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Aenean commodo ligula eget dolor.')
# `body2` is a shorter, perturbed variant: some words are altered
# ("amidolor", "consetuer", "adiping"), so the full headline no longer
# appears in it verbatim.
body2 = nlp('Lorem ipsum amidolor sit amet, consetuer adiping elit. eget dolor.')