In [1]:
import sys
# Put the shared scripts directory at the front of the module search path so
# that project-local helper modules can be imported by the cells below.
# NOTE: the path is relative, so it resolves against the kernel's working
# directory (presumably this notebook's folder - confirm when relocating).
sys.path.insert(0, '../../scripts/')

In [2]:
# import required packages
import pandas as pd

# preprocess
from sklearn.feature_extraction.text import TfidfVectorizer

# local scripts
from text_utils import preprocess_corpus

In [3]:
# read the training CSV into a dataframe
df = pd.read_csv('../../data/train_data.csv')

# report the dimensions as (rows, columns)
display(df.shape)

# preview the first 5 datapoints
df.head(5)

(16926, 2)

Unnamed: 0,headline,clickbait
0,13 Crucial Money-Saving Charts You Wish You Kn...,1
1,"This Couple Shares Their House With A ""Unicorn...",1
2,Bomb Kills 7 Afghan Civilians at U.S. Base,0
3,19 Reasons Why No One Should Ever Play Video G...,1
4,23 Dance Moves That Changed Our Lives In 2015,1


In [4]:
# normalise the raw headline text with the project helper:
# special characters and stopwords are removed and the remaining
# tokens are lemmatized
clean_headlines = preprocess_corpus(df['headline'])

# peek at the first 5 cleaned headlines
clean_headlines.head(5)

0    crucial money saving chart wish knew sooner
1     couple share house unicorn beyond adorable
2                 bomb kill afghan civilian base
3                reason one ever play video game
4                        dance move changed life
Name: headline, dtype: object

In [5]:
# spawn a tfidf vectorizer (library defaults; no custom arguments are passed)
vectorizer = TfidfVectorizer()

# train and vectorize clean headlines:
# fit_transform learns the vocabulary and IDF weights from the cleaned
# headlines and returns their TF-IDF representation in a single step,
# with one row per headline in the same order as clean_headlines
vectors = vectorizer.fit_transform(clean_headlines)

In [6]:
# extract tfidf vectors as dataframe, one column per vocabulary term
# NOTE: .toarray() materialises the full matrix densely; for ~17k headlines
# x ~14k terms in float64 (the TfidfVectorizer default) that is on the order
# of 2 GB of RAM.
# The rows of `vectors` follow the order of `clean_headlines`, so reuse its
# index: each TF-IDF row then keeps the label of the headline it came from
# instead of receiving a fresh 0..n-1 range that may not match `df`.
df_tfidf = pd.DataFrame(
    vectors.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=clean_headlines.index,
)

# guard against a name clash: if the token 'clickbait' were part of the
# vocabulary, the assignment below would silently replace that feature
# column (in place, at its alphabetical position) with the labels
assert 'clickbait' not in df_tfidf.columns, (
    "vocabulary already contains the token 'clickbait'; "
    "assigning the label would overwrite that TF-IDF feature column"
)

# add clickbait column to TF-IDF vector data
# (pandas aligns the labels on the index shared with the source dataframe)
df_tfidf['clickbait'] = df['clickbait']

# shape
display(df_tfidf.shape)

# first 5 datapoints
df_tfidf.head()

(16926, 13959)

Unnamed: 0,aaevpc,aaron,abandon,abandoned,abandoning,abba,abbas,abbott,abby,abc,...,zoo,zoolander,zoom,zotob,zowie,zuckerberg,zuma,zurawski,zurich,clickbait
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
