In [20]:
!pip install transformers



In [0]:
import numpy as np
import pandas as pd
import torch
import json
import transformers as t
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
eco_news_pd = pd.read_json('./drive/My Drive/datasetEconomyNews_PN.json')

In [24]:
eco_news_pd

Unnamed: 0,classification,headlineText,headlineTitle,idx,hour,siteAndDate
0,1,"President Trump said there was a “very, very g...","U.S. and China Extend Talks, but Final Deal Re...",0.0,,
1,1,Despite talk that the dollar was losing domina...,The Dollar Is Still King. How (in the World) D...,1.0,,
2,-1,The Fed released minutes of its January meetin...,Fed Explains Pause as Officials Debate Future ...,6.0,,
3,-1,A proposal to tax wealth finds support across ...,Democrats Want to Tax the Wealthy. Many Voters...,7.0,,
4,1,A decline in residential real estate has led s...,Housing Is Already in a Slump. So It (Probably...,8.0,,
...,...,...,...,...,...,...
557,-1,Closely followed strategist Jim Paulsen told C...,Strategist Jim Paulsen: The economy may soon l...,548.0,,CNBC-16 de out de 2017
558,-1,President Trump gave U.S. steel companies a sh...,Steel stocks jump on Trump's tariff threat,550.0,,CNNMoney-13 de jul de 2017
559,1,"Earlier this year, Juul Labs quietly spun out ...",E-cigarette maker Juul is raising $150 million...,558.0,,CNBC-19 de dez de 2017
560,1,Here's why FANG stocks will get stronger even ...,Here's why FANG stocks will get stronger even ...,560.0,,CNBC-7 de jun de 2017


In [0]:
#train_valid, test = train_test_split(eco_news_pd, test_size=0.2)

In [0]:
#train, valid = train_test_split(train_valid, test_size=0.2)

In [0]:
# Importing pre-trained DistilBERT model and tokenizer
model_class, tokenizer_class, pretrained_weights = (t.BertModel, t.BertTokenizer, 'bert-base-uncased')

In [0]:
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

In [0]:
tokenized_htext = eco_news_pd['headlineText'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [43]:
tokenized_htext

0      [101, 2343, 8398, 2056, 2045, 2001, 1037, 1523...
1      [101, 2750, 2831, 2008, 1996, 7922, 2001, 3974...
2      [101, 1996, 7349, 2207, 2781, 1997, 2049, 2254...
3      [101, 1037, 6378, 2000, 4171, 7177, 4858, 2490...
4      [101, 1037, 6689, 1999, 5647, 2613, 3776, 2038...
                             ...                        
557    [101, 4876, 2628, 2358, 11657, 24063, 3958, 27...
558    [101, 2343, 8398, 2435, 1057, 1012, 1055, 1012...
559    [101, 3041, 2023, 2095, 1010, 18414, 5313, 136...
560    [101, 2182, 1005, 1055, 2339, 15197, 15768, 20...
561    [101, 2021, 1010, 1999, 3522, 2086, 1010, 1996...
Name: headlineText, Length: 562, dtype: object

In [44]:
type(tokenized_htext)

pandas.core.series.Series

In [0]:
max_len = 0
for sent in tokenized_htext.values:
    if len(sent) > max_len:
        max_len = len(sent)

padded_tokenized_htext = np.array([sent + [0]*(max_len - len(sent)) for sent in tokenized_htext.values])

In [46]:
padded_tokenized_htext.shape

(562, 80)

In [47]:
attention_mask = np.where(padded_tokenized_htext != 0, 1, 0)
attention_mask.shape

(562, 80)

In [0]:
input = torch.tensor(padded_tokenized_htext)
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    last_hidden_state = model(input, attention_mask=attention_mask)

In [0]:
features = last_hidden_state[0][:,0,:]

In [0]:
labels = eco_news_pd['classification']

In [56]:
labels.head()

0    1
1    1
2   -1
3   -1
4    1
Name: classification, dtype: int64

In [0]:
train_features, test_features, train_labels, test_labels = train_test_split(features, labels)

In [61]:
lr_clf = LogisticRegression(max_iter=300)
lr_clf.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=300,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [62]:
lr_clf.score(test_features, test_labels)

0.7375886524822695

In [0]:
predict_test = lr_clf.predict(test_features)

In [71]:
f1_score(test_labels, predict_test)

0.6336633663366336

In [63]:

from sklearn.dummy import DummyClassifier
dm_clf = DummyClassifier()

scores = cross_val_score(dm_clf, train_features, train_labels)
print("Dummy classifier score: %0.3f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Dummy classifier score: 0.560 (+/- 0.15)


