In [32]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import string
import re
import os
import nltk
import numpy.linalg as LA
import pickle

# Importing Data and Cleaning

In [2]:
# Load the article dump. The row cap keeps local runs small; set
# ARTICLES_NROWS to None on AWS to read the whole file.
ARTICLES_CSV = "Articles_textv5.csv"
ARTICLES_NROWS = 1000
data = pd.read_csv(ARTICLES_CSV, nrows=ARTICLES_NROWS)

In [3]:
# Peek at the first five raw article bodies.
body_preview = data["body_text"].head(5)
print(body_preview)

0                                                  NaN
1    Variety Print Plus SubscriberVariety Print Plu...
2    Rigorous nonprofit news for Vermont.Today's Bu...
3    VAUXHALL, NJ – A Vauxhall restaurant owner has...
4    Coss MarteHere are two issues Coss Marte start...
Name: body_text, dtype: object


In [21]:
# Drop every row that has a missing value in any column, then renumber the
# remaining rows 0..n-1 (later cells look titles up by integer position).
# drop=True discards the old row labels instead of inserting them as an extra
# "index" column; without it each re-run of this cell adds another leftover
# column and eventually fails with "cannot insert ..., already exists".
data = data.dropna(axis=0).reset_index(drop=True)

In [22]:
# Sanity check: evaluates to True if any cell in the frame is still missing.
data.isna().to_numpy().any()

False

In [23]:
# First five article bodies after the null rows were removed.
data["body_text"].iloc[:5]

0    variety print plus subscribervariety print plu...
1    rigorous nonprofit news for vermont.today's bu...
2    vauxhall, nj – a vauxhall restaurant owner has...
3    coss martehere are two issues coss marte start...
4    gettythe covid-19 global pandemic has upended ...
Name: body_text, dtype: object

In [24]:
# Build one text field per article: select every column whose name matches
# the regex "body_text" and stringify each row's values.
# NOTE(review): str(x.values) is the NumPy array repr, so every result is
# wrapped in brackets and quotes (e.g. "['variety print ...']"), and
# ''.join() over a single string returns it unchanged. The regex cleanup in
# the next cell strips those characters. Confirm this matches the
# preprocessing used when the pickled vectorizer was fitted before changing it.
data["combined_body_text"] = data.filter(regex=("body_text")).apply(lambda x: ''.join(str(x.values)), axis=1)

# Lowercase every string cell in the frame; non-string cells pass through.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases
# in favor of DataFrame.map — confirm the pandas version before switching.
data = data.applymap(lambda s:s.lower() if type(s) == str else s)

# Preview the first five combined texts.
data["combined_body_text"].head(5)

0    ['variety print plus subscribervariety print p...
1    ["rigorous nonprofit news for vermont.today's ...
2    ['vauxhall, nj – a vauxhall restaurant owner h...
3    ["coss martehere are two issues coss marte sta...
4    ['gettythe covid-19 global pandemic has upende...
Name: combined_body_text, dtype: object

In [25]:
# Characters removed outright from the text. Despite the name, matches are
# replaced with the empty string, not a space.
# Raw string so the backslashes before [, ] and | reach the regex engine
# instead of being parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Everything outside this keep-list (digits, lowercase ASCII letters, space,
# '#', '+', '_' and the listed non-ASCII characters) is removed.
# NOTE(review): "â€˜™" looks like mis-decoded curly quotes — confirm intent.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_â€˜™]')
# Apply both removals in one pass over the column, in the same order as before.
data["combined_body_text"] = [
    BAD_SYMBOLS_RE.sub('', REPLACE_BY_SPACE_RE.sub('', row))
    for row in data["combined_body_text"]
]

In [26]:
# First five texts after the regex cleanup.
data["combined_body_text"].iloc[:5]

0    variety print plus subscribervariety print plu...
1    rigorous nonprofit news for vermonttodays busi...
2    vauxhall nj  a vauxhall restaurant owner has b...
3    coss martehere are two issues coss marte start...
4    gettythe covid19 global pandemic has upended t...
Name: combined_body_text, dtype: object

# Preprocessing

In [27]:
# Seed the bag_of_words column from the cleaned text; the preprocessing
# cells that follow overwrite this column step by step.
cleaned_text = data["combined_body_text"]
data["bag_of_words"] = cleaned_text
print(data["bag_of_words"].head())

0    variety print plus subscribervariety print plu...
1    rigorous nonprofit news for vermonttodays busi...
2    vauxhall nj  a vauxhall restaurant owner has b...
3    coss martehere are two issues coss marte start...
4    gettythe covid19 global pandemic has upended t...
Name: bag_of_words, dtype: object


In [28]:
# Remove any remaining ASCII punctuation from each text.
# The three-argument form of str.maketrans builds a table that deletes
# every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)
no_punctuation = []
for text in data["bag_of_words"]:
    no_punctuation.append(text.translate(table))
data["bag_of_words"] = no_punctuation

In [29]:
# Collapse every run of digits in the text into the placeholder token 'num'.
digit_run = re.compile(r'\d+')
data["bag_of_words"] = [digit_run.sub('num', text) for text in data["bag_of_words"]]

In [33]:
# Remove English stopwords and turn each text into a list of lowercase tokens.
# The set gets its own name: rebinding `stopwords` would shadow the imported
# nltk corpus reader, so re-running this cell would fail with
# "AttributeError: 'set' object has no attribute 'words'".
english_stopwords = set(stopwords.words('english'))
data["bag_of_words"] = [
    [token for token in row.lower().split() if token not in english_stopwords]
    for row in data["bag_of_words"]
]
print(data["bag_of_words"].head())

0    [variety, print, plus, subscribervariety, prin...
1    [rigorous, nonprofit, news, vermonttodays, bus...
2    [vauxhall, nj, vauxhall, restaurant, owner, na...
3    [coss, martehere, two, issues, coss, marte, st...
4    [gettythe, covidnum, global, pandemic, upended...
Name: bag_of_words, dtype: object


In [34]:
# Stem every token with the Porter stemmer and join each article's tokens
# back into a single space-separated string.
# (Lemmatizing would be a better option, but it takes far longer.)
stemmer = PorterStemmer()
stemmed_texts = []
for tokens in data["bag_of_words"]:
    stemmed_texts.append(" ".join(stemmer.stem(token) for token in tokens))
data["bag_of_words"] = stemmed_texts

# SVM

In [15]:
# Load the pickled classifier and vectorizer objects used for prediction.
# Context managers make sure both file handles are closed after loading.
# SECURITY: pickle.load can execute arbitrary code — only load files from a
# trusted source.
with open("clf", "rb") as clf_file:
    clf2 = pickle.load(clf_file)
with open("vectorizer", "rb") as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [35]:
# Transform the preprocessed text with the loaded vectorizer, then run the
# loaded classifier on the resulting features.
features = vectorizer.transform(data["bag_of_words"])
result = clf2.predict(features)

In [36]:
# Show the article titles (pandas elides the middle of a long Series).
titles = data['Title']
print(titles)

0      ‘homeland’ producers talk alternate endings, c...
1      new program helps small vermont businesses ada...
2      owner of vauxhall takeout bbq restaurant named...
3      second chance studios: turning ex-offenders in...
4      7 ways to make your online virtual conference ...
                             ...                        
987    cemex-tec recognizes projects that seek to tra...
988    bangladesh threatens to ban these uk brands, e...
989    webb fontaine launches new webinar series to h...
990    5 ways to reduce customer churn for saas in a ...
991    we are rethinking businesses, why can’t we ret...
Name: Title, Length: 992, dtype: object


In [38]:
# Print the row number and title of every article whose prediction is 1.0.
for i, (title, label) in enumerate(zip(data['Title'], result)):
    if label != 1.0:
        continue
    print(f"{i} {title}: {label}")

10 post covid-19, the answer is digital transformation, now what’s the question?: 1.0
30 investing in the end-to-end value that 3d printing creates, not in the technology itself: 1.0
41 best practices for managing remote customer support teams: 1.0
42 real-time customer engagement and omnichannel personalization deliver a superior cx: 1.0
46 lucro deploys small business lending app to speed covid-19 relief loans: 1.0
49 2020 is the tech awakening cre has been waiting for - bisnow: 1.0
53 life after the lockdown: it spending priorities now: 1.0
61 gotcha mobility rolls out fleet of all-new e-bikes: 1.0
65 vmblog expert interview: simon crosby talks continuous intelligence and swim continuum 4.0: 1.0
66 robotic process automation leader uipath raises $225m in late-stage round: 1.0
68 5 ways b2b ecommerce improves organizational efficiency: 1.0
123 gigsmart get gigs aggregates all open work opportunities into one app: 1.0
147 xilinx and spline.ai use ai to enable covid-19 detection at the

# Save DataFrame

In [45]:
# Collect the title and prediction of every article predicted as 1.0 into a
# two-column DataFrame and preview the first rows.
columns = ["Title", "Relevant"]
relevant_rows = [
    [title, label]
    for title, label in zip(data["Title"], result)
    if label == 1.0
]
relevant_articles = pd.DataFrame(relevant_rows, columns=columns)
relevant_articles.head()

Unnamed: 0,Title,Relevant
0,"post covid-19, the answer is digital transform...",1.0
1,investing in the end-to-end value that 3d prin...,1.0
2,best practices for managing remote customer su...,1.0
3,real-time customer engagement and omnichannel ...,1.0
4,lucro deploys small business lending app to sp...,1.0
