# Exploratory Data Analysis for IMDB Movie Review Data

In [27]:
import pandas as pd
from fastai.data.external import fastai_cfg, URLs, untar_data

# Fetch (or reuse an already-extracted copy of) the full IMDB archive and the
# small IMDB sample archive; `untar_data` hands back the local path for each URL.
imdb_path, imdb_sample_path = map(untar_data, (URLs.IMDB, URLs.IMDB_SAMPLE))

pathlib.PosixPath

In [49]:
from itertools import islice
# Print every top-level entry of the IMDB folder. For directories, additionally
# preview at most the first three children, prefixed with ".. ".
for entry in imdb_path.ls():
    print(entry)
    if not entry.is_dir():
        continue  # plain files have nothing to preview
    for child in islice(entry.ls(), 3):
        print(f".. {child}")

/home/yzhao/.fastai/data/imdb/README
/home/yzhao/.fastai/data/imdb/test
.. /home/yzhao/.fastai/data/imdb/test/pos
.. /home/yzhao/.fastai/data/imdb/test/neg
.. /home/yzhao/.fastai/data/imdb/test/labeledBow.feat
/home/yzhao/.fastai/data/imdb/unsup
.. /home/yzhao/.fastai/data/imdb/unsup/35167_0.txt
.. /home/yzhao/.fastai/data/imdb/unsup/45392_0.txt
.. /home/yzhao/.fastai/data/imdb/unsup/10591_0.txt
/home/yzhao/.fastai/data/imdb/imdb.vocab
/home/yzhao/.fastai/data/imdb/tmp_lm
.. /home/yzhao/.fastai/data/imdb/tmp_lm/itos.pkl
.. /home/yzhao/.fastai/data/imdb/tmp_lm/train_ids.npy
.. /home/yzhao/.fastai/data/imdb/tmp_lm/train_lbl.npy
/home/yzhao/.fastai/data/imdb/train
.. /home/yzhao/.fastai/data/imdb/train/pos
.. /home/yzhao/.fastai/data/imdb/train/unsupBow.feat
.. /home/yzhao/.fastai/data/imdb/train/neg
/home/yzhao/.fastai/data/imdb/tmp_clas
.. /home/yzhao/.fastai/data/imdb/tmp_clas/itos.pkl
.. /home/yzhao/.fastai/data/imdb/tmp_clas/train_ids.npy
.. /home/yzhao/.fastai/data/imdb/tmp_clas/tra

In [45]:
# List every entry contained in the IMDB sample folder, one per line.
for sample_entry in imdb_sample_path.ls():
    print(sample_entry)

/home/yzhao/.fastai/data/imdb_sample/texts.csv


In [7]:
# Read the sample reviews into a DataFrame and preview its first five rows.
texts_csv = imdb_sample_path / "texts.csv"
imdb_sample = pd.read_csv(texts_csv)
imdb_sample.head(n=5)

Unnamed: 0,label,text,is_valid
0,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
1,positive,"This is a extremely well-made film. The acting, script and camera-work are all first-rate. The music is good, too, though it is mostly early in the film, when things are still relatively cheery. There are no really superstars in the cast, though several faces will be familiar. The entire cast does an excellent job with the script.<br /><br />But it is hard to watch, because there is no good end to a situation like the one presented. It is now fashionable to blame the British for setting Hindus and Muslims against each other, and then cruelly separating them into two countries. There is som...",False
2,negative,"Every once in a long while a movie will come along that will be so awful that I feel compelled to warn people. If I labor all my days and I can save but one soul from watching this movie, how great will be my joy.<br /><br />Where to begin my discussion of pain. For starters, there was a musical montage every five minutes. There was no character development. Every character was a stereotype. We had swearing guy, fat guy who eats donuts, goofy foreign guy, etc. The script felt as if it were being written as the movie was being shot. The production value was so incredibly low that it felt li...",False
3,positive,"Name just says it all. I watched this movie with my dad when it came out and having served in Korea he had great admiration for the man. The disappointing thing about this film is that it only concentrate on a short period of the man's life - interestingly enough the man's entire life would have made such an epic bio-pic that it is staggering to imagine the cost for production.<br /><br />Some posters elude to the flawed characteristics about the man, which are cheap shots. The theme of the movie ""Duty, Honor, Country"" are not just mere words blathered from the lips of a high-brassed offic...",False
4,negative,"This movie succeeds at being one of the most unique movies you've seen. However this comes from the fact that you can't make heads or tails of this mess. It almost seems as a series of challenges set up to determine whether or not you are willing to walk out of the movie and give up the money you just paid. If you don't want to feel slighted you'll sit through this horrible film and develop a real sense of pity for the actors involved, they've all seen better days, but then you realize they actually got paid quite a bit of money to do this and you'll lose pity for them just like you've alr...",False


In [8]:
from fastai.data.load import DataLoader
# Wrap the sample DataFrame in a fastai DataLoader with a batch size of 8.
# NOTE(review): `dl` is not used by any later cell visible here — confirm it is still needed.
dl = DataLoader(imdb_sample, bs=8)

# Summarise the DataFrame itself (count / unique / top / freq per column).
# The earlier `dl.describe()` call showed this same table only because the
# DataLoader forwarded the unknown attribute to the DataFrame it wraps; calling
# `describe()` on the frame makes it obvious what is being summarised.
imdb_sample.describe()

Unnamed: 0,label,text,is_valid
count,1000,1000,1000
unique,2,1000,2
top,negative,"Un-bleeping-believable! Meg Ryan doesn't even look her usual pert lovable self in this, which normally makes me forgive her shallow ticky acting schtick. Hard to believe she was the producer on this dog. Plus Kevin Kline: what kind of suicide trip has his career been on? Whoosh... Banzai!!! Finally this was directed by the guy who did Big Chill? Must be a replay of Jonestown - hollywood style. Wooofff!",False
freq,524,1,800


#### Tokenization

We are going to use a tokenizer provided by the Hugging Face `transformers` library.

In [9]:
from datasets import Dataset, DatasetDict
# Convert the pandas DataFrame into a Hugging Face `Dataset` and display its summary.
ds = Dataset.from_pandas(df=imdb_sample)
ds

Dataset({
    features: ['label', 'text', 'is_valid'],
    num_rows: 1000
})

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
# Sanity-check the tokenizer on a sentence that mixes common words with
# made-up, all-caps names to see how unfamiliar strings are split into pieces.
sample_sentence = "Good morning everybody! My name is FAKEFIRSTNAME OIHSDFLIHA"
tokz.tokenize(sample_sentence)

['▁Good',
 '▁morning',
 '▁everybody',
 '!',
 '▁My',
 '▁name',
 '▁is',
 '▁FAKE',
 'FIRST',
 'NAME',
 '▁O',
 'IHS',
 'DF',
 'LI',
 'HA']

In [14]:
def tok_func(batch):
    """Tokenize the "text" field of a batch of rows with `tokz`.

    Called by `Dataset.map` with `batched=True`, so `batch["text"]` holds the
    texts of several rows at once; the tokenizer output is returned unchanged.
    """
    return tokz(batch["text"])


# Add the tokenizer's outputs as new columns next to the original ones.
tok_ds = ds.map(tok_func, batched=True)
tok_ds

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset({
    features: ['label', 'text', 'is_valid', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [15]:
# Hold out 25% of the tokenized rows as the test split.
# A fixed seed keeps the shuffle — and therefore which rows land in each
# split — identical across kernel restarts, so downstream results are reproducible.
dds = tok_ds.train_test_split(test_size=0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'is_valid', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 750
    })
    test: Dataset({
        features: ['label', 'text', 'is_valid', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 250
    })
})