In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import vaderSentiment
import spacy
from spacy import displacy
from textblob import Word
nlp = spacy.load('en_core_web_sm')

from os import listdir
import re
import pickle

from tqdm.notebook import trange, tqdm
from IPython.display import display, clear_output


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [12]:
# get the articles into one dataframe

dfs = []
for x in [1, 2, 3]:
    for chunk in pd.read_csv(f'./news/articles{x}.csv', chunksize = 100):
        dfs.append(chunk)
df = pd.concat(dfs, ignore_index=True)

In [7]:
# lemmatize and lowercase articles

def preprocess(row):
    from string import punctuation
    
    row = row.lower()
    
    punctuation = punctuation + '0123456789'
    # punctuation
    row = ' '.join([word for word in row.split() if word not in punctuation])
    
    # lemmatize
    row = ' '.join([Word(word).lemmatize() for word in row.split()])
    
    # remove any extra whitespace
    row = re.sub('\s{1,}', ' ', row)
    
    return ' '.join([word for word in row.split() if len(word) > 2])

In [13]:
df_processed = df.copy(deep = True)

for i in trange(df.shape[0]):
    df_processed.loc[i, 'content'] = preprocess(df_processed.loc[i, 'content'])

  0%|          | 0/142570 [00:00<?, ?it/s]

In [15]:
df_processed.loc[3, 'content']

'death may the great equalizer, but isn’t necessarily evenhanded. all the field endeavor that suffered mortal loss 2016 consider muhammad ali and arnold palmer sport and the hollywood death carrie fisher and debbie reynolds the pop music world had, hand down, the bleakest year. start with david bowie, whose stage persona androgynous glam rocker, dance pop star, electronic experimentalist his music. the year only day old when the news came that had died cancer 69. had hinted that his time short the lyric his final album, released just two day before his death, but had otherwise gone great length hide his illness from the public, wish for privacy that ensured that his death would appear have come out the blue. then came another shock, about three month later, when prince accidentally overdosed painkiller and collapsed elevator his sprawling home studio near minneapolis. death came him 57, and all indication one, including prince rogers nelson, had seen coming. energetic onstage ever, hol

In [16]:
# save processed dataframe if needed

df_processed.to_csv('processed_articles.csv')

In [18]:
# remove any weird blank articles

df_processed = df_processed[pd.isna(df_processed['title']) == False]
df_processed = df_processed[pd.isna(df_processed['content']) == False]
df = df[pd.isna(df['title']) == False]
df = df[pd.isna(df['content']) == False]

In [22]:
# sklearn tfidf model

content = df_processed['content']
model = TfidfVectorizer(max_df = 0.25, stop_words = 'english', use_idf = True, smooth_idf = True)

In [23]:
# transform content into tf idf matrix

tfidf = model.fit_transform(content)

In [28]:
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [112]:
def search(query, model, tfidf, qty):
    query_transform = model.transform([query])
    similarity = np.dot(query_transform, np.transpose(tfidf))
    x = np.array(similarity.toarray()[0])
    print(np.argsort(x))
    return np.argsort(x)[-qty-1:][::-1][1:]

In [113]:
query = df.loc[1245, 'content']
search(query, model, tfidf, 3)

[ 8185 62629 52757 ...  2447  2398  1245]


array([ 2398,  2447, 50476], dtype=int64)

In [114]:
df.loc[1245, 'content']

'MONTGOMERY, Ala.  —   When Luther Strange ran for attorney general in this   state in 2010, he appeared in an advertisement that spoke darkly of the Alabama capital’s “corruption, grand jury investigations, insider deals, abusing the public trust. ” Mr. Strange won that year’s general election easily, and then another one in 2014. But since ascending to the United States Senate this month, he has found his popularity threatened and his fellow Republicans troubled, largely because he accepted the appointment of Gov. Robert J. Bentley, a subject of an active investigation that the new senator spent months overseeing. A startling number of people in and around the State House openly suspect, but lack evidence to prove, that part of Mr. Bentley’s reason for appointing Mr. Strange to the Senate was to try to undermine the inquiry. Beyond clouding Mr. Strange’s early days in the Senate, the appointment to fill the seat of Jeff Sessions, President Trump’s new attorney general, has exacerbate

In [118]:
df.loc[50476, 'content']

'Governor Robert Bentley on Thursday named Strange to take Sessions’s place in the Senate until a special election is held next year. Normally, the selection of a state’s attorney general to temporarily assume a Senate seat would not generate much controversy. It’s a common stepping stone in politics —  just ask Sessions, the nation’s new attorney general who served in the same office in Alabama before he first won his Senate seat in 1996. But Bentley’s decision has come under immediate scrutiny because Strange, as attorney general, might be investigating the   governor for impropriety related to an affair he had with a top political adviser that became public last year. Strange will neither confirm nor deny whether a probe is taking place. “We have never said in our office that we are investigating the governor,” the attorney    told reporters in Montgomery moments after Bentley, who stood nearby, announced his appointment. “And I think it’s somewhat actually unfair to him and unfair 