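The data set (the IMDb movie-review sentiment archive, which unpacks to an `aclImdb/` folder) can be downloaded here: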

https://ai.stanford.edu/~amaas/data/sentiment/

Extract the archive in the current directory, so that `./aclImdb/train/pos/` and `./aclImdb/train/neg/` exist next to this notebook.

In [1]:
import os

import multiprocess as mp
import nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from p_tqdm import p_map
from tqdm import tqdm

# Build the VADER analyzer once; later cells use this single instance, and
# get_score captures it as its default argument.
try:
    sid = SentimentIntensityAnalyzer()
except LookupError:
    # The analyzer loads the 'vader_lexicon' resource and raises LookupError
    # when it has not been downloaded yet. Fetch it once and retry so the
    # notebook also runs on a fresh environment.
    nltk.download('vader_lexicon')
    sid = SentimentIntensityAnalyzer()



In [2]:
%%time

def read_review(rfile):
    """Return the complete text of the review file at ``rfile``.

    The file is opened in text mode, decoded as UTF-8, and closed again
    before the function returns.
    """
    with open(rfile, mode='r', encoding='utf8') as handle:
        return handle.read()

def list_of_files(path):
    """Return the paths of all entries directly inside the directory ``path``.

    Args:
        path: Directory to list. A trailing path separator is optional.

    Returns:
        A list of strings, each one ``path`` joined with an entry name,
        sorted by entry name so the order is the same on every run.
    """
    # os.path.join adds the separator only when it is missing, so './dir' and
    # './dir/' both work. os.listdir gives names in arbitrary order, hence
    # the explicit sort.
    return [os.path.join(path, name) for name in sorted(os.listdir(path))]

# Positive training reviews: list the files, then read each one with p_map
# (which reports progress while it works).
pos_files = list_of_files('./aclImdb/train/pos/')
pos_reviews = p_map(read_review, pos_files)

# Negative training reviews: same two steps.
neg_files = list_of_files('./aclImdb/train/neg/')
neg_reviews = p_map(read_review, neg_files)

  0%|          | 0/12500 [00:00<?, ?it/s]

  0%|          | 0/12500 [00:00<?, ?it/s]

Wall time: 11 s


In [3]:
# Fraction of the reviews kept for the timing experiments below, and the seed
# that makes the random draw repeatable after a kernel restart.
SAMPLE_FRAC = 0.1
SAMPLE_SEED = 42

# One labelled frame per sentiment class: a constant 'label' column next to
# the review texts.
df_pos = pd.DataFrame(data={'label': ['pos'] * len(pos_reviews),
                            'review': pos_reviews})
df_neg = pd.DataFrame(data={'label': ['neg'] * len(neg_reviews),
                            'review': neg_reviews})

# ignore_index=True renumbers the combined rows from 0. The full frame keeps
# its own name so that re-running the sampling never loses it.
df_all = pd.concat([df_pos, df_neg], ignore_index=True)

# Without random_state every run would draw a different subset, so timings
# and displayed rows could not be reproduced.
df = df_all.sample(frac=SAMPLE_FRAC, random_state=SAMPLE_SEED)

In [4]:
def get_score(review, sid=sid):
    """Return the polarity scores of a single review text.

    ``sid`` defaults to the module-level analyzer as it was when this
    function was defined; any object with a ``polarity_scores`` method can be
    passed instead.
    """
    return sid.polarity_scores(review)

# Plain Python list of the sampled review texts, in the same order as df's
# rows; the parallel runs below take this list as input.
review_list = df['review'].to_list()

In [5]:
%%time

# Baseline: score every review one after another in this process. The bound
# method is handed to apply directly and is called once per review.
df['scores'] = df['review'].apply(sid.polarity_scores)
df.head()

Wall time: 5.26 s


Unnamed: 0,label,review,scores
6318,pos,This police procedural is no worse than many o...,"{'neg': 0.081, 'neu': 0.762, 'pos': 0.157, 'co..."
1208,pos,The villian in this movie is one mean sob and ...,"{'neg': 0.025, 'neu': 0.814, 'pos': 0.162, 'co..."
12090,pos,The concept of having Laurel & Hardy this time...,"{'neg': 0.035, 'neu': 0.663, 'pos': 0.302, 'co..."
15503,neg,I'm not sure it was the language or the poor a...,"{'neg': 0.175, 'neu': 0.743, 'pos': 0.082, 'co..."
21141,neg,Very bad but watchable science fiction film th...,"{'neg': 0.122, 'neu': 0.758, 'pos': 0.12, 'com..."


In [6]:
%%time

# Same scoring as the serial baseline, but dispatched through p_tqdm's p_map
# (prints a progress bar while running).
# The list is assigned to the column by position; this relies on p_map
# returning results in input order -- TODO confirm against the p_tqdm docs.
score_list = p_map(get_score, review_list)
df['scores'] = score_list
df.head()

  0%|          | 0/2500 [00:00<?, ?it/s]

Wall time: 1min 19s


Unnamed: 0,label,review,scores
6318,pos,This police procedural is no worse than many o...,"{'neg': 0.081, 'neu': 0.762, 'pos': 0.157, 'co..."
1208,pos,The villian in this movie is one mean sob and ...,"{'neg': 0.025, 'neu': 0.814, 'pos': 0.162, 'co..."
12090,pos,The concept of having Laurel & Hardy this time...,"{'neg': 0.035, 'neu': 0.663, 'pos': 0.302, 'co..."
15503,neg,I'm not sure it was the language or the poor a...,"{'neg': 0.175, 'neu': 0.743, 'pos': 0.082, 'co..."
21141,neg,Very bad but watchable science fiction film th...,"{'neg': 0.122, 'neu': 0.758, 'pos': 0.12, 'com..."


In [7]:
%%time

# Same scoring once more, this time with a plain multiprocess Pool instead
# of p_map.
# Author's observation: about 2 seconds of the wall time go into setting up
# the pool / pool.map(); the scoring itself is faster (presumably compared
# with the serial apply() baseline -- confirm).
with mp.Pool() as pool:
    score_list2 = pool.map(get_score, review_list)
df['scores'] = score_list2
df.head()

Wall time: 3.65 s


Unnamed: 0,label,review,scores
6318,pos,This police procedural is no worse than many o...,"{'neg': 0.081, 'neu': 0.762, 'pos': 0.157, 'co..."
1208,pos,The villian in this movie is one mean sob and ...,"{'neg': 0.025, 'neu': 0.814, 'pos': 0.162, 'co..."
12090,pos,The concept of having Laurel & Hardy this time...,"{'neg': 0.035, 'neu': 0.663, 'pos': 0.302, 'co..."
15503,neg,I'm not sure it was the language or the poor a...,"{'neg': 0.175, 'neu': 0.743, 'pos': 0.082, 'co..."
21141,neg,Very bad but watchable science fiction film th...,"{'neg': 0.122, 'neu': 0.758, 'pos': 0.12, 'com..."
