In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/goodreads-books-reviews-290312/goodreads_test.csv
/kaggle/input/goodreads-books-reviews-290312/goodreads_sample_submission.csv
/kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv


In [2]:
df = pd.read_csv('/kaggle/input/goodreads-books-reviews-290312/goodreads_train.csv')

In [3]:
import string
import re

PUNCT_TO_REMOVE = string.punctuation

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

def remove_spoiler_alert(text):
    spoiler = re.compile(r'(\(view spoiler\).*?\(hide spoiler\))') 
    return spoiler.sub(r' ', text)

def process_text(text):
    text = remove_spoiler_alert(text)
    text = remove_urls(text)
    text = remove_html(text)
    text = remove_punctuation(text)
    return text

In [4]:
df['review_text'] = df['review_text'].apply(process_text)

In [5]:
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [6]:
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

Downloading:   0%|          | 0.00/747 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/476M [00:00<?, ?B/s]

In [7]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

def polarity_scores_roberta(example):
    encoded_text = tokenizer(example, return_tensors='pt', truncation=True, padding='max_length', max_length=512).to(device)
    output = model(**encoded_text)
    scores = output[0][0].detach().cpu().numpy()
    scores = softmax(scores)
    scores_dict = {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }
    return scores_dict

In [8]:
from tqdm import tqdm

In [10]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    if i>100: break
    try:
        text = row['review_text']
#         print(text)
        myid = row['review_id']
        res[myid] = polarity_scores_roberta(text)
    except RuntimeError:
        print(f'Broke for id {myid}')

  0%|          | 101/900000 [00:02<6:33:49, 38.08it/s]


This is a special book It started slow for about the first third then in the middle third it started to get interesting then the last third blew my mind This is what I love about good science fiction  it pushes your thinking about where things can go 
 It is a 2015 Hugo winner and translated from its original Chinese which made it interesting in just a different way from most things Ive read For instance the intermixing of Chinese revolutionary history  how they kept accusing people of being reactionaries etc 
 It is a book about science and aliens The science described in the book is impressive  its a book grounded in physics and pretty accurate as far as I could tell view spoilerThough when it got to folding protons into 8 dimensions I think he was just making stuff up  interesting to think about though 
 But what would happen if our SETI stations received a message  if we found someone was out there  and the person monitoring and answering the signal on our side was disillusioned Th

{'dfdbb7b0eb5a7e4c26d59a937e2e5feb': {'roberta_neg': 0.073313296,
  'roberta_neu': 0.32138458,
  'roberta_pos': 0.6053021},
 'a5d2c3628987712d0e05c4f90798eb67': {'roberta_neg': 0.0058836434,
  'roberta_neu': 0.7433713,
  'roberta_pos': 0.25074515},
 '2ede853b14dc4583f96cf5d120af636f': {'roberta_neg': 0.22637199,
  'roberta_neu': 0.49619523,
  'roberta_pos': 0.27743277},
 'ced5675e55cd9d38a524743f5c40996e': {'roberta_neg': 0.2667239,
  'roberta_neu': 0.65349305,
  'roberta_pos': 0.07978306},
 '332732725863131279a8e345b63ac33e': {'roberta_neg': 0.018251916,
  'roberta_neu': 0.19206282,
  'roberta_pos': 0.7896852},
 'ea4a220b10e6b5c796dae0e3b970aff1': {'roberta_neg': 0.11384473,
  'roberta_neu': 0.41757876,
  'roberta_pos': 0.46857646},
 '5fe9882bfe4b0520a322820c4c55747d': {'roberta_neg': 0.0043576285,
  'roberta_neu': 0.037338797,
  'roberta_pos': 0.9583036},
 'dbc01e2438df7a87ee3dc16ee23a53e5': {'roberta_neg': 0.015462321,
  'roberta_neu': 0.5386442,
  'roberta_pos': 0.4458935},
 '75471