# IMDB Movie Reviews - Data Exploration

Deep analysis to identify cinematic visualization opportunities.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import json

# Load dataset
df = pd.read_csv('../data/IMDB Dataset.csv')
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## 1. Basic Statistics

In [2]:
print('\nColumn info:')
df.info()

print('\nSentiment distribution:')
print(df['sentiment'].value_counts())

print('\nMissing values:')
print(df.isnull().sum())


Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB

Sentiment distribution:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64

Missing values:
review       0
sentiment    0
dtype: int64


## 2. Review Length Analysis

In [3]:
df['review_length'] = df['review'].str.len()
df['word_count'] = df['review'].str.split().str.len()

print('Review length statistics:')
print(df[['review_length', 'word_count']].describe())

print('\nLength by sentiment:')
print(df.groupby('sentiment')[['review_length', 'word_count']].mean())

Review length statistics:
       review_length    word_count
count   50000.000000  50000.000000
mean     1309.431020    231.156940
std       989.728014    171.343997
min        32.000000      4.000000
25%       699.000000    126.000000
50%       970.000000    173.000000
75%      1590.250000    280.000000
max     13704.000000   2470.000000

Length by sentiment:
           review_length  word_count
sentiment                           
negative      1294.06436   229.46456
positive      1324.79768   232.84932


## 3. Word Frequency Analysis (ULTRATHINK)

In [4]:
# Get top words by sentiment
def get_top_words(reviews, n=20):
    words = ' '.join(reviews).lower().split()
    # Remove common stopwords
    stopwords = {'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'is', 'it', 'this', 'that', 'was', 'as', 'with', 'by', 'from'}
    words = [w for w in words if w not in stopwords and len(w) > 3]
    return Counter(words).most_common(n)

pos_words = get_top_words(df[df['sentiment']=='positive']['review'])
neg_words = get_top_words(df[df['sentiment']=='negative']['review'])

print('Top 10 positive words:')
for word, count in pos_words[:10]:
    print(f'  {word}: {count}')

print('\nTop 10 negative words:')
for word, count in neg_words[:10]:
    print(f'  {word}: {count}')

Top 10 positive words:
  /><br: 48976
  film: 29367
  movie: 26681
  have: 24169
  they: 17669
  like: 16331
  very: 15791
  it's: 15574
  about: 15557
  some: 13978

Top 10 negative words:
  /><br: 51998
  movie: 34811
  have: 30254
  film: 25719
  they: 23191
  like: 20950
  just: 20544
  about: 16841
  some: 16187
  it's: 15638


## 4. Visualization Planning (ULTRATHINK)

Based on data exploration, here are 5 compelling visual stories:

In [5]:
# Save findings for visualization agent
findings = {
    'sentiment_counts': df['sentiment'].value_counts().to_dict(),
    'avg_length_by_sentiment': df.groupby('sentiment')['review_length'].mean().to_dict(),
    'top_positive_words': pos_words[:10],
    'top_negative_words': neg_words[:10],
    'visualizations': [
        {
            'id': 1,
            'style': 'Film Noir',
            'title': 'Sentiment Distribution: Light vs Dark',
            'description': 'High contrast visualization showing positive/negative review split',
            'data': 'sentiment_counts'
        },
        {
            'id': 2,
            'style': 'Studio Ghibli',
            'title': 'Review Length Distribution',
            'description': 'Soft, flowing visualization of review character counts',
            'data': 'review_length'
        },
        {
            'id': 3,
            'style': 'Wes Anderson',
            'title': 'Top Words: Positive vs Negative',
            'description': 'Symmetrical comparison of most frequent words',
            'data': 'top_words'
        },
        {
            'id': 4,
            'style': 'Blade Runner',
            'title': 'Word Count vs Sentiment',
            'description': 'Neon scatter plot showing review length patterns',
            'data': 'word_count'
        },
        {
            'id': 5,
            'style': 'Star Wars',
            'title': 'Epic Keyword Universe',
            'description': 'Bold visualization of top 20 keywords across all reviews',
            'data': 'all_top_words'
        }
    ]
}

# Add all top words for Star Wars viz
all_words = get_top_words(df['review'], n=20)
findings['all_top_words'] = all_words

# Save findings
with open('../notebooks/findings.json', 'w') as f:
    json.dump(findings, f, indent=2)

print('✅ Findings saved to findings.json')
print('\nVisualization Plan:')
for viz in findings['visualizations']:
    print(f"  {viz['id']}. [{viz['style']}] {viz['title']}")

✅ Findings saved to findings.json

Visualization Plan:
  1. [Film Noir] Sentiment Distribution: Light vs Dark
  2. [Studio Ghibli] Review Length Distribution
  3. [Wes Anderson] Top Words: Positive vs Negative
  4. [Blade Runner] Word Count vs Sentiment
  5. [Star Wars] Epic Keyword Universe
