# IMDB Scraper

Script to scrape reviews and ratings of a movie / tvshow from IMDB.

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from bs4 import BeautifulSoup
import requests
import json
from matplotlib import pyplot as plt

Configuration

In [2]:
DATA_PATH = 'data.tsv'

Read tsv

In [3]:
df = pd.read_csv(DATA_PATH, sep = '\t')

In [4]:
df.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1924
1,tt0000002,5.8,260
2,tt0000003,6.5,1736
3,tt0000004,5.6,175
4,tt0000005,6.2,2551


Row count

In [5]:
len(df.index)

1252779

Randomly sample data

In [6]:
sample_idxs = np.random.permutation(len(df.index))

Exclude previously mined data

In [7]:
# load all previously trained data
prev_data = {}
for i in range(28):
    with open(f'./data/ratings_{i}.pkl', 'rb') as f:
        prev_data = prev_data | pickle.load(f)
# extract existing keys
mined_tids = set(prev_data.keys())
print(len(mined_tids))

TypeError: unsupported operand type(s) for |: 'dict' and 'dict'

## Data Mining

Mine data from the web and save to pickle file

In [None]:
SAMPLE_SIZE = 20000
SAVE_INTERVAL = 1000
REVIEWS_PER_MOVIE = 5
SAVE_LOCATION = './data/'
START_ID = 28

In [None]:
data = {}
count = 0
save_counter = START_ID
with tqdm(total=SAMPLE_SIZE) as pbar:
    for idx in sample_idxs:
        # check if exit loop
        if count >= SAMPLE_SIZE:
            break
        # get metadata
        metadata = df.iloc[idx]
        tid, rating = metadata[0], metadata[1]
        # exclude already mined data
        if tid in mined_tids:
            continue
        # scrape reviews
        raw = None
        try:
            raw = requests.get(f'https://www.imdb.com/title/{tid}/reviews?ref_=tt_urv')
        except:
            print(f'Error while mining {tid} ...')
            continue
        soup = BeautifulSoup(raw.text, 'html.parser')
        r_text = []
        for i, r_div in enumerate(soup.findAll('div', {'class': ['content']})):
            review = r_div.find(class_ = 'text')
            if i >= REVIEWS_PER_MOVIE:
                break
            r_text.append(review.text)
        if len(r_text) == 0:
            # no review found, continue searching
            continue
        # store data
        data[tid] = (rating, ' '.join(r_text))
        count += 1
        pbar.update(1)
        # save data if interval
        if count % SAVE_INTERVAL == 0:
            # dump data
            print('saving checkpoint...')
            with open(f'{SAVE_LOCATION}ratings_{save_counter}.pkl', 'wb') as f:
                pickle.dump(data, f)
            save_counter += 1
            # clear data
            data.clear()
# save data
print('done')

Train/Dev/Test Split

In [None]:
ratio = [24,2,2]
FILE_COUNT = 28

ratio = np.cumsum(np.array(ratio))
names = ['train', 'dev', 'test']
ratings = {}
mode = 0
for i in range(FILE_COUNT):
    with open(f'{SAVE_LOCATION}ratings_{i}.pkl', 'rb') as handle:
        temp = pickle.load(handle)
        # merge
        ratings = ratings | temp
    # dump file
    if i == ratio[mode] - 1:
        with open(f'{SAVE_LOCATION}processed/data_{names[mode]}.pkl', 'wb') as f:
            pickle.dump(ratings, f)
        mode += 1
        ratings.clear()

## Data visualization

Load pickled data

In [None]:
train_data = None
dev_data = None
test_data = None
with open(f'{SAVE_LOCATION}processed/data_train.pkl', 'rb') as handle:
    train_data = pickle.load(handle)
with open(f'{SAVE_LOCATION}processed/data_dev.pkl', 'rb') as handle:
    dev_data = pickle.load(handle)
with open(f'{SAVE_LOCATION}processed/data_test.pkl', 'rb') as handle:
    test_data = pickle.load(handle)

Size of dataset

In [None]:
print(f'Dataset Size: train = {len(train_data)}, dev = {len(dev_data)}, test = {len(test_data)}')

Ratings distribution

In [None]:
train_scores = []
for k, v in train_data.items():
    train_scores.append(v[0])
dev_scores = []
for k, v in dev_data.items():
    dev_scores.append(v[0])
test_scores = []
for k, v in test_data.items():
    test_scores.append(v[0])
dist = [np.array(train_scores), np.array(dev_scores), np.array(test_scores)]
plt.hist(dist, density = True, label=['Train', 'Dev', 'Test'])
plt.title('Distribution of Ratings for each Dataset')
plt.xlabel('Ratings')
plt.ylabel('Density')
plt.legend()
plt.show()

# Twitter Scraper

Script to scrape twitter posts with a specific movie hashtag

In [None]:
SAVE_LOCATION = './twitter/'
RESULTS = 50

In [None]:
hashtags = [
    (7.3, '#blackpanther2'),
    (8.4, '#TopGun'),
    (6.9, '#NopeMovie'),
    (6.3, '#ThorLoveAndThunder')
]

In [None]:
twitter_data = []
for score, hashtag in hashtags:
    bearer_token = 'TOKEN'
    def bearer_oauth(r):
        r.headers["Authorization"] = f"Bearer {bearer_token}"
        r.headers["User-Agent"] = "v2RecentSearchPython"
        return r
    params = {'query': f'{hashtag} lang:en', 'max_results': RESULTS}
    url = 'https://api.twitter.com/2/tweets/search/recent'
    res = requests.get(url, auth=bearer_oauth, params=params).json()
    data = [r['text'] for r in res['data']]
    twitter_data.append((hashtag, score, data))
with open(f'{SAVE_LOCATION}/data.pkl', 'wb') as f:
    pickle.dump(twitter_data, f)