In [3]:
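# required pip installs: numpy, pandas, matplotlib, nltk, spacy, textblob, tqdm, scikit-learn
# (pickle ships with the Python standard library; the spaCy model additionally needs
#  `python -m spacy download en_core_web_sm`)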

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy
from textblob import Word
nlp = spacy.load('en_core_web_sm')

from os import listdir
import re
import pickle

from tqdm.notebook import trange, tqdm


from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# nltk.download('stopwords')

In [43]:
import datetime

In [4]:
# The file structure should have the "news" folder containing the 3 .csv's in the same root folder as this noteboo

# Load the data from the disk into a pandas dataframe

dfs = []
for x in [1, 2, 3]:
    for chunk in pd.read_csv(f'./news/articles{x}.csv', chunksize = 100):
        dfs.append(chunk)
df = pd.concat(dfs, ignore_index=True)

del dfs

In [7]:
# Run to define function that converts articles to lowercase and perfom lemmatization

def preprocess(row):
    
    row = row.lower()
    
    # lemmatize
    row = ' '.join([Word(word).lemmatize() for word in row.split()])
    
    # remove any extra whitespace
    row = re.sub('\s{1,}', ' ', row)
    
    return ' '.join([word for word in row.split() if len(word) > 2])

In [13]:
# Run to apply the preprocessing to the articles if 'processed_articles.csv' does not already exist (will take a while)
# If it already exists, skip to the cell below

df_processed = df.copy(deep = True)

for i in trange(df.shape[0]):
    df_processed.loc[i, 'content'] = preprocess(df_processed.loc[i, 'content'])

  0%|          | 0/142570 [00:00<?, ?it/s]

In [5]:
# Run to load the preprocessed articles dataframe directly from the disk. The 'processed_articles.csv' should be
# placed within the same root directory as this notebook file

dfs = []
for chunk in pd.read_csv('processed_articles.csv', chunksize = 100):
    dfs.append(chunk)
df_processed = pd.concat(dfs, ignore_index=True)

del dfs

In [16]:
# Run to save processed dataframe created in the cell above

df_processed.to_csv('processed_articles.csv')

In [21]:
# Run to remove any weird blank articles

df_processed = df_processed[pd.isna(df_processed['title']) == False]
df_processed = df_processed[pd.isna(df_processed['content']) == False]
df = df[pd.isna(df['title']) == False]
df = df[pd.isna(df['content']) == False]

In [84]:
# Run to make date formats uniform

df['date'] = pd.to_datetime(df['date'],format='%Y-%m-%d')
df_processed['date'] = pd.to_datetime(df_processed['date'],format='%Y-%m-%d')

In [36]:
# Run to create the sklearn tfidf model if the 'model.pkl' file does not already exist
# If it does, load it from the cell below

content = df_processed['content']
model = TfidfVectorizer(stop_words = 'english', use_idf = True, smooth_idf = True, min_df=1)

In [38]:
# Load the sklearn tfidf model from the disk. The file should be in the same root directory as this notebook file

model = pickle.load(open('model.pkl', 'rb'))

In [23]:
# Run to create the tf-idf matrix corresponding to the articles content dataframe  if the 'tfidf.pkl' file does not already exist
# If it does, load the tfidf matrix from the cell below

tfidf = model.fit_transform(content)

In [39]:
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [28]:
# Save the model and tf-idf matrix

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [76]:
# Define the function that calculates similarity by applying the dot product between the transformed search query
# and the transpose of the tf-idf matrix

def search(query, model, tfidf, qty, q_type):
    query_transform = model.transform([query])
    similarity = np.dot(query_transform, np.transpose(tfidf))
    x = np.array(similarity.toarray()[0])
    print(np.argsort(x))
    return np.argsort(x)[-qty-1:][::-1][1:] if q_type == 'id' else np.argsort(x)[-qty:][::-1]

In [134]:
# Run to input an article's ID and a number and get the original article and that number of related articles in a
# 'results.txt' file, sorted asceding by date

# examples: 
# 69134 for stories regarding the dispute between Apple and the US Department of Justice over providing a backdoor into a terrorist's iPhone
# 12346 for stories related to a comedian posting an unsettling edited picture of President Trump
# 31471 for stories relating to the Confederate flag
# 134713 for stories relating to the 2017 London Bridge attack
# 127683 for stories regarding a legal fight between Uber and Google

o_id = int(input())
qty = input()
ids = search(df_processed.loc[o_id, 'content'], model, tfidf, int(qty), 'id')
print(ids)
locs = []


with open('results.txt', 'w', encoding='utf-8') as f:
    f.write('ORIGINAL ARTICLE: \n{} - {} \n\n {}\n\n'.format(df.loc[o_id, 'title'], df.loc[o_id, 'date'], df.loc[o_id, 'content']))
    for i in ids:
        loc = i
        max_sim = 0
        for j in [i, i-1, i+1]: # checking similarities here again because of some weird indexing issue, reliable fix
            vectorizer = TfidfVectorizer(stop_words='english', min_df=1)
            temp = vectorizer.fit_transform([df_processed.loc[j, 'content'], df_processed.loc[o_id, 'content']])
            sim = (np.dot(temp, temp.T).A)[0][1]
            if sim > max_sim:
                max_sim = sim
                loc = j
            
            #if any([word in df_processed.loc[j, 'content'] for word in query.split()]):
                #loc = j
                #break
        locs.append(loc)
        
    locs = sorted(locs, key=lambda id: datetime.datetime.strptime(str(df.loc[id,'date']) if str(df.loc[id,'date']) != 'NaT' else '1970-01-01 00:00:00', '%Y-%m-%d %H:%M:%S'))
    print(locs)
    
    for loc in locs:
        f.write('{} - {} - {} \n\n {}\n\n'.format(df.loc[loc, 'title'], df.loc[loc, 'publication'], df.loc[loc, 'date'], df.loc[loc, 'content']))
        
print("Done! Check results.txt")

69134
3
[ 72731 113531  48880 ...  69472  69306  69133]
[69306 69472 58032]
[58032, 69307, 69473]
Done! Check results.txt


In [None]:
df.loc[127683, 'content']

In [110]:
# Run to input a query and a number and get that number articles related to the query in a 'results.txt' file 
# sorted asceding by date

# examples: "israel embassy", "alabama governor"

query = input()
qty = input()
ids = search(query, model, tfidf, int(qty), 'query')
print('ids: ', ids)
locs = []

with open('results.txt', 'w', encoding='utf-8') as f:
    for i in ids:
        loc = i
        max_sim = 0
        for j in [i, i-1, i+1]: # checking similarities here again because of some weird indexing issue, reliable fix
            vectorizer = TfidfVectorizer(stop_words='english', min_df=1)
            temp = vectorizer.fit_transform([df_processed.loc[j, 'content'], query])
            sim = (np.dot(temp, temp.T).A)[0][1]
            if sim > max_sim:
                max_sim = sim
                loc = j
        locs.append(loc)
        
    locs = sorted(locs, key=lambda id: datetime.datetime.strptime(str(df.loc[id,'date']) if str(df.loc[id,'date']) != 'NaT' else '1970-01-01 00:00:00', '%Y-%m-%d %H:%M:%S'))
        
    for loc in locs:
        print(loc)
        f.write('{} - {} - {} \n\n {} {}\n\n'.format(df.loc[loc, 'title'], df.loc[loc, 'publication'], df.loc[loc, 'date'], df.loc[loc, 'content'], loc))
        
print("Done! Check results.txt")

ford super duty pickup
3
[     0  93154  93153 ... 125888  49018  48186]
ids:  [ 48186  49018 125888]
48186
49018
125888
Done! Check results.txt
