In [1]:
import pandas
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import string
import re
import os
import nltk
import numpy.linalg as LA

In [2]:
data = pandas.read_csv("covid_articles_2.csv")

In [3]:
data["body_text"].head(5)

0    \ngofundmeA college senior from Versailles, Wo...
1    Baltimore-based company has developed CDC-alig...
2    The need for innovative solutions to address s...
3    An award-winning team of journalists, designer...
4                                                  NaN
Name: body_text, dtype: object

In [4]:
#drop null values
data = data.dropna(axis=0)

In [5]:
#check for null
data.isnull().values.any()

False

In [6]:
data["body_text"].head(5)

0    \ngofundmeA college senior from Versailles, Wo...
1    Baltimore-based company has developed CDC-alig...
2    The need for innovative solutions to address s...
3    An award-winning team of journalists, designer...
5    Want to discuss? Please read our Commenting Po...
Name: body_text, dtype: object

In [7]:
data["combined_body_text"] = data.filter(regex=("body_text")).apply(lambda x: ''.join(str(x.values)), axis=1)


In [9]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
data["combined_body_text"] = [REPLACE_BY_SPACE_RE.sub('',row) for row in data["combined_body_text"]]
data["combined_body_text"] = [BAD_SYMBOLS_RE.sub('',row) for row in data["combined_body_text"]]

In [10]:
data["combined_body_text"].head(5)

0    ngofundme college senior from ersailles oodfor...
1    altimorebased company has developed aligned so...
2    he need for innovative solutions to address sh...
3    n awardwinning team of journalists designers a...
5    ant to discuss lease read our ommenting olicy ...
Name: combined_body_text, dtype: object

Preprocessing starts here

In [11]:
#Preprocessing
data["bag_of_words"] = data["combined_body_text"]
print(data["bag_of_words"].head())

0    ngofundme college senior from ersailles oodfor...
1    altimorebased company has developed aligned so...
2    he need for innovative solutions to address sh...
3    n awardwinning team of journalists designers a...
5    ant to discuss lease read our ommenting olicy ...
Name: bag_of_words, dtype: object


In [12]:
#strip punctuation
# This uses str.translate to map all punctuation to the empty string
table = str.maketrans('', '', string.punctuation)
data["bag_of_words"] = [row.translate(table) for row in data["bag_of_words"]]
#print(data["bag_of_words"].head())

In [13]:
# Convert all numbers in the article to the word 'num' using regular expressions
data["bag_of_words"] = [re.sub(r'\d+', 'num', row) for row in data["bag_of_words"]]
#print(data["bag_of_words"] )

In [14]:
#stopwords
stopwords = set(stopwords.words('english'))
data["bag_of_words"] = [[word.lower() for word in row.split() if word.lower() not in stopwords] for row in data["bag_of_words"]]
print(data["bag_of_words"].head())

0    [ngofundme, college, senior, ersailles, oodfor...
1    [altimorebased, company, developed, aligned, s...
2    [need, innovative, solutions, address, shortag...
3    [n, awardwinning, team, journalists, designers...
5    [ant, discuss, lease, read, ommenting, olicy, ...
Name: bag_of_words, dtype: object


In [15]:
#stemming (A better option would be to lemmatize, but it takes forever)
stemmer = PorterStemmer()
data["bag_of_words"] = [" ".join([stemmer.stem(word) for word in row]) for row in data["bag_of_words"]]

In [16]:
#Vectorizing dataset
vectorizer = TfidfVectorizer(tokenizer = word_tokenize, analyzer = "word", ngram_range=(1, 2))
X = vectorizer.fit_transform(data["bag_of_words"].values)
#print(data["X"].head())

# SVM

In [17]:
#Set random seed
Y = data["Relevant"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(X_train)
print(Y_train.values)

  (0, 65531)	0.037770432631338356
  (0, 45730)	0.037770432631338356
  (0, 2015)	0.037770432631338356
  (0, 69895)	0.037770432631338356
  (0, 14370)	0.037770432631338356
  (0, 22909)	0.037770432631338356
  (0, 18850)	0.037770432631338356
  (0, 7369)	0.037770432631338356
  (0, 4348)	0.037770432631338356
  (0, 30773)	0.037770432631338356
  (0, 26593)	0.037770432631338356
  (0, 56485)	0.037770432631338356
  (0, 10836)	0.037770432631338356
  (0, 9101)	0.037770432631338356
  (0, 4717)	0.037770432631338356
  (0, 75953)	0.037770432631338356
  (0, 73851)	0.037770432631338356
  (0, 85402)	0.03493147871573923
  (0, 45532)	0.037770432631338356
  (0, 44613)	0.037770432631338356
  (0, 68464)	0.037770432631338356
  (0, 70021)	0.037770432631338356
  (0, 75020)	0.037770432631338356
  (0, 17976)	0.037770432631338356
  (0, 23050)	0.037770432631338356
  :	:
  (127, 11410)	0.030031047964142746
  (127, 55932)	0.028630336683598826
  (127, 51088)	0.017848504829776263
  (127, 66339)	0.019368275853841287
  (127

In [18]:
print(np.unique(Y_train))

[0. 1.]


In [19]:
# train classifier
# DOESNT WORK RN COZ WE ONLY GOT 1 CLASS: RELEVANT SHOULD BE 1 AND 0
clf = SVC(probability=True, kernel='linear')
clf.fit(X_train, Y_train)

SVC(kernel='linear', probability=True)

In [23]:
predictions = clf.predict_proba(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(Y_test, predictions[:,1])))

ROC-AUC yields 0.8897058823529411


In [24]:
from sklearn.model_selection import GridSearchCV
parameters_svm = {"kernel": ["rbf", "poly", "linear"], "gamma": ["scale", "auto"], "degree": [2, 3]}

gs_clf_svm = GridSearchCV(clf, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, Y_train)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

0.7895384615384614
{'degree': 2, 'gamma': 'scale', 'kernel': 'linear'}
