In [2]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import XLNetTokenizer, XLNetModel, XLNetForSequenceClassification, AdamW
from sklearn.metrics import accuracy_score

In [3]:
# df = pd.read_csv('data/TBG_unique_raw.csv')

In [4]:
df = pd.read_csv('data/sample_1000rows.csv')

In [5]:
df.head()

Unnamed: 0,pub_type,position_section,position_subsection,hl1,hl2,author,lede,body,language,word_count,copyright,content-id,volume,issue_number,edition,pub_name,pub_date,licensor_indexing_terms,indexing_terms,year
0,"Newspaper, Newspapers",SPORTS,,Team up in air on Canada's pot issue,,,"calgary, alberta — some bruins rely on plant-b...",body while now legal in 13 nhl cities — seven ...,['ENGLISH'],912.0,Copyright 2018 Globe Newspaper Company All Rig...,BGLOBE-1a045630-d24e-11e8-83d5-dee7c961b652,,,,The Boston Globe,"{'day': '18', 'month': '10', 'year': '2018'}",,"{'subject': [{'score': '92', 'classCode': 'STX...",2018
1,"Newspaper, Newspapers",NEWS,National,Treasury worker is charged with leaks,,,washington — a senior treasury department empl...,body the charges reflect the latest move in th...,['ENGLISH'],273.0,Copyright 2018 Globe Newspaper Company All Rig...,BGLOBE-68f23a02-d238-11e8-8fdb-5c3af1605444,,,,The Boston Globe,"{'day': '18', 'month': '10', 'year': '2018'}",,"{'legal': [{'className': 'Banking Law', 'class...",2018
2,"Newspaper, Newspapers",LIVING ARTS,,"Funny Women luncheon raises record $950,000 fo...",,,edwidge danticat spoke at the luncheon to bene...,body at tuesday's funny women . . . serious bu...,['ENGLISH'],247.0,Copyright 2018 Globe Newspaper Company All Rig...,BGLOBE-0799bdb4-d23a-11e8-8fdb-5c3af1605444,,,,The Boston Globe,"{'day': '18', 'month': '10', 'year': '2018'}",,"{'subject': [{'score': '90', 'classCode': 'STX...",2018
3,"Newspaper, Newspapers",NEWS,Foreign,Turkish officials say audio reveals torture,,,istanbul — his killers were waiting when jamal...,"body khashoggi was dead within minutes, and wi...",['ENGLISH'],474.0,Copyright 2018 Globe Newspaper Company All Rig...,BGLOBE-d9f9ad0c-d238-11e8-8fdb-5c3af1605444,,,,The Boston Globe,"{'day': '18', 'month': '10', 'year': '2018'}",,"{'subject': [{'score': '92', 'classCode': 'N92...",2018
4,"Newspaper, Newspapers",NEWS,Metro,N.E. Journal pulls stem cell researcher's paper,Concern espressed about 2 other articles,,the new england journal of medicine on wednesd...,body anversa directed a lab at the brigham fro...,['ENGLISH'],635.0,Copyright 2018 Globe Newspaper Company All Rig...,BGLOBE-178e505e-d262-11e8-8bbe-65aa870fef5a,,,,The Boston Globe,"{'day': '18', 'month': '10', 'year': '2018'}",,"{'subject': [{'score': '90', 'classCode': 'ST0...",2018


In [6]:
# preprocess hl1 so that no empty headlines causing error
df['hl1'].fillna("No title", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['hl1'].fillna("No title", inplace=True)


In [7]:
headlines = df['hl1'].tolist()

In [8]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')

In [9]:
# positive, negative, neutral
model = XLNetForSequenceClassification.from_pretrained("xlnet/xlnet-base-cased", num_labels=3)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet/xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Tokenize headlines
encoded_input = tokenizer(headlines, padding=True, truncation=True, return_tensors='pt', max_length=512)

In [11]:
with torch.no_grad():
    outputs = model(**encoded_input)
    predictions = torch.argmax(outputs.logits, dim=-1)

In [12]:
sentiment_labels = ['Negative', 'Neutral', 'Positive']
predicted_sentiments = [sentiment_labels[pred] for pred in predictions]
df['xlnet_sentiment'] = predicted_sentiments

In [13]:
df.to_csv('data/TBG_unique_raw_XLnet.csv', index=False)