In [1]:
#!pip install transformers

Collecting transformers
  Downloading transformers-4.35.2-py3-none-any.whl (7.9 MB)
[K     |████████████████████████████████| 7.9 MB 7.9 MB/s eta 0:00:01
Collecting safetensors>=0.3.1
  Downloading safetensors-0.4.0-cp39-cp39-macosx_10_7_x86_64.whl (439 kB)
[K     |████████████████████████████████| 439 kB 24.8 MB/s eta 0:00:01
[?25hCollecting tokenizers<0.19,>=0.14
  Downloading tokenizers-0.15.0-cp39-cp39-macosx_10_7_x86_64.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 27.3 MB/s eta 0:00:01
[?25hCollecting huggingface-hub<1.0,>=0.16.4
  Downloading huggingface_hub-0.19.4-py3-none-any.whl (311 kB)
[K     |████████████████████████████████| 311 kB 47.9 MB/s eta 0:00:01
Collecting fsspec>=2023.5.0
  Downloading fsspec-2023.10.0-py3-none-any.whl (166 kB)
[K     |████████████████████████████████| 166 kB 62.9 MB/s eta 0:00:01
Installing collected packages: fsspec, huggingface-hub, tokenizers, safetensors, transformers
  Attempting uninstall: fsspec
    Found existing i

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import torch
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from transformers import BertTokenizer, BertModel

#disable annoying warnings
import warnings
warnings.filterwarnings('ignore')



# Fetch the NLTK data packages used by the text-preprocessing cell below
# (stop-word corpus, tokenizer models, WordNet data for the lemmatizer).
# Each call is a no-op when the package is already up to date locally.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/giulia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/giulia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/giulia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/giulia/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
# Show whether PyTorch reports the MPS backend as available in this environment.
print(torch.backends.mps.is_available())  

True


In [5]:
# Pick the torch device string: use MPS when PyTorch reports it available, else CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [2]:
# Relative paths to the news.tsv file of each split (train / test / validation).
news_train = 'MINDlarge_train/news.tsv'
news_test = 'MINDlarge_test/news.tsv'
news_val = 'MINDlarge_val/news.tsv'

#--------------------------------------------

def load_news_df(path):
    """Read a header-less, tab-separated news or behaviour file into a DataFrame.

    The file has no header row, so the column names are chosen from a
    substring of ``path``: ``'news'`` selects the eight news columns and
    ``'behavior'`` the five behaviour-log columns. ``'news'`` is tested
    first, so a path containing both substrings is read as a news file.

    Parameters
    ----------
    path : str or os.PathLike
        Location of the ``.tsv`` file.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the column names assigned here.

    Raises
    ------
    ValueError
        If ``path`` contains neither ``'news'`` nor ``'behavior'``, so the
        schema cannot be inferred.
    """
    # Substring tests need a plain string; this also lets pathlib.Path through.
    path_text = str(path)

    if 'news' in path_text:
        # NOTE: the trailing space in "Abstract Entities " is intentional here;
        # other cells drop the column under exactly this name.
        columns = ['News ID',
                "Category",
                "SubCategory",
                "Title",
                "Abstract",
                "URL",
                "Title Entities",
                "Abstract Entities "]

    elif 'behavior' in path_text:
        columns = ['Impression ID',
                "User ID",
                "Time",
                "History",
                "Impressions"]

    else:
        # Previously `columns` was left unbound here and read_csv failed with
        # an UnboundLocalError that said nothing about the real cause.
        raise ValueError(
            f"Cannot infer column names for {path_text!r}: "
            "expected 'news' or 'behavior' in the path"
        )

    df = pd.read_csv(path, sep='\t', header=None, names=columns)

    return df

#--------------------------------------------

# Load the training-split news file; the bare expression below displays (rows, columns).
news_df = load_news_df(news_train)
news_df.shape

(101527, 8)

In [3]:
#remove url and rows w/ missing values
# This cell rebinds `news_df`, so on a second run the 'URL' column is already
# gone; errors='ignore' makes the drop a no-op then instead of a KeyError.
news_df = news_df.drop(columns=['URL'], errors='ignore').dropna()
news_df.shape

(96106, 7)

baseline

In [4]:
# Baseline frame: the news table without the two entity columns.
# (`drop` returns a new frame, so `news_df` itself is left untouched.)
start_df = news_df.drop(columns=["Title Entities", "Abstract Entities "])

# Separate copy for BERT: raw title and abstract joined by a single space.
bert_df = start_df.assign(
    combined_text=start_df['Title'] + " " + start_df['Abstract']
)


In [6]:
# Keep only the combined text.
# NOTE: from here on `bert_df` is a Series rather than a DataFrame, so re-running
# this cell, or indexing bert_df['combined_text'] afterwards, raises KeyError.
bert_df = bert_df["combined_text"]
bert_df.head()

0    The Brands Queen Elizabeth, Prince Charles, an...
1    Walmart Slashes Prices on Last-Generation iPad...
2    50 Worst Habits For Belly Fat These seemingly ...
4    The Cost of Trump's Aid Freeze in the Trenches...
5    I Was An NBA Wife. Here's How It Affected My M...
Name: combined_text, dtype: object

In [23]:
#cleaning & preprocessing
lemmatizer = WordNetLemmatizer()

# Built once here instead of on every call: preprocess_text is applied to every
# title and abstract, and the stop-word list does not change between calls.
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one piece of text into a space-joined string of lemmas.

    Steps: lower-case, delete every character that is not an ASCII letter or
    whitespace, tokenize with ``word_tokenize``, drop English stop words, and
    lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text (for example a news title or abstract).

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; empty if none survive.
    """
    text = text.lower()
    # Flags must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so the old call capped removal at 258 matches
    # (re.I | re.A == 258) and applied no flags at all.
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I | re.A)
    tokens = word_tokenize(text)
    filtered_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in _STOP_WORDS]

    #string format
    return ' '.join(filtered_tokens)


# Clean both free-text columns of the baseline frame in place.
for text_column in ('Title', 'Abstract'):
    start_df[text_column] = start_df[text_column].apply(preprocess_text)
#start_df.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N88753,lifestyle,lifestyleroyals,brand queen elizabeth prince charles prince ph...,shop notebook jacket royal cant live without
1,N45436,news,newsscienceandtechnology,walmart slash price lastgeneration ipads,apple new ipad release bring big deal last yea...
2,N23144,health,weightloss,worst habit belly fat,seemingly harmless habit holding back keeping ...
4,N93187,news,newsworld,cost trump aid freeze trench ukraine war,lt ivan molchanets peeked parapet sand bag fro...
5,N75236,health,voices,nba wife here affected mental health,felt like fraud nba wife didnt help fact nearl...


In [9]:
""" 
#validation set
val_df = load_news_df(news_val)
val_df = val_df.drop(columns=['URL', 'Title Entities', 'Abstract Entities '])
val_df = val_df.dropna()
val_df['Title'] = val_df['Title'].apply(preprocess_text)
val_df['Abstract'] = val_df['Abstract'].apply(preprocess_text)
#show number of missing values
#missing_values = val_df.isnull().sum()  
#missing_values #0 everywhere
"""


News ID        0
Category       0
SubCategory    0
Title          0
Abstract       0
dtype: int64

In [10]:
#val_df.head()

Unnamed: 0,News ID,Category,SubCategory,Title,Abstract
0,N88753,lifestyle,lifestyleroyals,brand queen elizabeth prince charles prince ph...,shop notebook jacket royal cant live without
1,N23144,health,weightloss,worst habit belly fat,seemingly harmless habit holding back keeping ...
3,N93187,news,newsworld,cost trump aid freeze trench ukraine war,lt ivan molchanets peeked parapet sand bag fro...
4,N75236,health,voices,nba wife here affected mental health,felt like fraud nba wife didnt help fact nearl...
5,N99744,health,medical,get rid skin tag according dermatologist,seem harmless there good reason shouldnt ignor...


# feature extraction: 
tf-idf<br>
NOTE: look into SF-IDF, i.e. using WordNet synonyms to replace tf-idf. Open question: are the extracted tokens semantically meaningful enough?

In [2]:
import csv
num_tokens = 15 #to print

# Read only the header row of the saved tf-idf matrix.
# newline='' is what the csv module requires for files handed to csv.reader;
# the encoding is pinned because pandas writes CSV as UTF-8 by default.
with open('tfidf_matrix.csv', 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    first_row = next(reader)

# NOTE(review): the slice starts at 1 and so skips the first header cell.
# That is right if the file has a leading index column, but the save cell
# writes with index=False, so this may be dropping a real token -- confirm.
first_few_tokens = first_row[1:num_tokens + 1]
print(first_few_tokens)

['aac', 'aaron', 'aaron boone', 'aaron hick', 'aaron rodgers', 'ab', 'abandoned', 'abbott', 'abc', 'abc news', 'ability', 'able', 'able get', 'aboard', 'abortion']


In [24]:
# TF-IDF over unigrams and bigrams, capped at 10,000 features.
# NOTE(review): fit this vectorizer on the training text only and call
# .transform() for other splits, so every split shares one vocabulary.
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

In [25]:
# tf-idf train set
start_df['combined_text'] = start_df['Title'] + " " + start_df['Abstract']
tfidf_matrix = tfidf.fit_transform(start_df['combined_text'])

# `get_feature_names` was removed in scikit-learn 1.2. Use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# tf-idf save matrix as .csv
# NOTE: .toarray() materialises the full dense matrix (one row per document,
# up to 10,000 columns) before writing, which is the slow and memory-hungry step.
save_path = 'tfidf_matrix.csv'
pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names).to_csv(save_path, index=False)
# takes 7m 54s


In [13]:
""" 
# validation set
val_df['combined_text'] = val_df['Title'] + " " + val_df['Abstract']
val_tfidf_matrix = tfidf.fit_transform(val_df['combined_text'])
pd.DataFrame(val_tfidf_matrix.toarray(), columns=tfidf.get_feature_names()).to_csv('val_tfidf_matrix.csv', index=False)
#takes 5m 37s
"""

word2vec

In [26]:
# Fit a Word2Vec model on the whitespace-split training text.
tokenized_text = list(map(str.split, start_df['combined_text']))
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=300,
    window=5,
    min_count=2,
    workers=4,
) #how to chose best hyperparameters?

# Write the learned vectors to .csv, one row per vocabulary entry.
# The words themselves are not written.
pd.DataFrame(word2vec_model.wv.vectors).to_csv(
    'word2vec_model.csv',
    index=False,
)
#25sec ?????

In [27]:
#validation word2vec model
# Requires `val_df` with preprocessed 'Title' and 'Abstract' columns. The cell
# that builds it is currently disabled, so re-enable that cell before this one.
# The text is combined here rather than read from val_df['combined_text'],
# because that column is only created inside another disabled cell.
val_combined_text = val_df['Title'] + " " + val_df['Abstract']
val_tokenized_text = [text.split() for text in val_combined_text]

# Stored under its own name so the training `word2vec_model` is not overwritten.
# NOTE(review): a model trained separately on the validation text has its own
# vocabulary and vector space, not comparable with the training model -- confirm
# that this is what is wanted.
val_word2vec_model = Word2Vec(val_tokenized_text, vector_size=300, window=5, min_count=2, workers=4)
pd.DataFrame(val_word2vec_model.wv.vectors).to_csv('val_word2vec_model.csv', index=False)
#20sec ?????

BERT

In [7]:
# Load the tokenizer for the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Same MPS availability check as near the top of the notebook, repeated before loading the model.
print(torch.backends.mps.is_available())  

In [6]:
# Load the pretrained 'bert-base-uncased' model and move it to `device`
# (`device` is assigned in an earlier cell, which must have run first).
model = BertModel.from_pretrained('bert-base-uncased').to(device)

: 

: 

In [None]:
# An earlier cell rebinds `bert_df` to the bare 'combined_text' Series, so
# indexing it by column name here raised KeyError. Accept either shape.
bert_texts = bert_df['combined_text'] if isinstance(bert_df, pd.DataFrame) else bert_df

# One encoding per text. Each text is tokenized on its own, so padding=True has
# nothing to pad against; truncation=True caps over-long inputs.
tokenized = [tokenizer(text, padding=True, truncation=True, return_tensors='pt') for text in bert_texts] #return pytorch tensors
