In [1]:
import polars as pl
import plotly.express as px

In [3]:
# Load the processed movie table and derive two helper columns in one pass:
#   release_year - release_date snapped to 1 January of its year (a Date), so
#                  yearly aggregates can be plotted on a real date axis
#   redo         - True when a film is flagged as a sequel or as a remake
# pl.date() builds the Date straight from the year component, which avoids the
# int -> string -> strptime round-trip and the extra with_columns passes.
movies = pl.read_parquet('data/processed/movie_data.parquet').with_columns(
    release_year = pl.date(pl.col('release_date').dt.year(), 1, 1),
    redo = pl.col('sequel') | pl.col('remake'),
)

In [4]:
# Number of films per release month, drawn as a line chart.
# NOTE(review): `release_month` is not created in the cells visible here;
# presumably it is already a column of the parquet file. Confirm.
monthly_counts = (
    movies
    .group_by('release_month')
    .len()                      # produces a count column named 'len'
    .sort('release_month')
)
px.line(monthly_counts, x='release_month', y='len').update_layout(
    xaxis_title='Release Date',
    yaxis_title='Number of Movies',
)

In [5]:
# Number of films per release year, drawn as a line chart.
yearly_counts = (
    movies
    .group_by('release_year')
    .len()                      # produces a count column named 'len'
    .sort('release_year')
)
px.line(yearly_counts, x='release_year', y='len').update_layout(
    xaxis_title='Release Date',
    yaxis_title='Number of Movies',
)

In [6]:
# Per release year: how many films are sequels, remakes, or either ("redo"),
# together with the total number of films released that year.
counts_by_year = movies.group_by('release_year').agg(
    sequels = pl.col('sequel').sum(),
    remakes = pl.col('remake').sum(),
    redos = pl.col('redo').sum(),
    total = pl.len(),
)
# The *_pct columns are fractions of the yearly total (0-1), not 0-100 values.
yearly = counts_by_year.with_columns(
    **{
        f'{kind}_pct': pl.col(f'{kind}s') / pl.col('total')
        for kind in ('sequel', 'remake', 'redo')
    }
).sort('release_year')
# Share of each year's releases that are a sequel or a remake.
fig = px.line(yearly, x='release_year', y='redo_pct')
fig

In [7]:
# Long-format view of the yearly sequel and remake shares so that each measure
# is drawn as its own coloured line. unpivot() puts the former column names in
# 'variable' and their numbers in 'value'.
share_long = yearly.select(
    'release_year', 'sequel_pct', 'remake_pct'
).unpivot(index='release_year')
px.line(share_long, x='release_year', y='value', color='variable')

In [8]:
# One row per production company: number of films plus remake / sequel / "redo"
# counts and their shares of that company's films.
# explode() yields one row per (film, company) pair, so a co-produced film is
# counted once for every company attached to it.
production_companies = movies.explode('production_company').group_by('production_company').agg(
    count = pl.len(),
    remakes = pl.sum('remake'),
    sequels = pl.sum('sequel'),
    redos = pl.sum('redo')
).with_columns(
    remake_pct = pl.col('remakes') / pl.col('count'),
    sequel_pct = pl.col('sequels') / pl.col('count'),
    redo_pct = pl.col('redos') / pl.col('count'),
    size = pl.col('count').log()
)
# Remake share vs sequel share, one bubble per company.
# The bubbles are sized by log1p(count) instead of the `size` column above:
# log(1) == 0 would give every single-film company a zero-size marker, which
# is not drawn. The extra column lives only in the frame handed to plotly, so
# `production_companies` itself keeps the same columns for later cells.
px.scatter(
    production_companies.with_columns(marker_size = pl.col('count').log1p()),
    x='remake_pct',
    y='sequel_pct',
    size='marker_size',
    hover_data=['remake_pct','sequel_pct','count'],
)

In [9]:
# Ten highest remake shares among companies with at least 10 films, reshaped
# to long format (one row per company and measure).
(
    production_companies
    .sort('remake_pct', descending=True)
    .filter(pl.col('count') >= 10)
    .head(10)
    .select('remake_pct', 'sequel_pct', 'production_company')
    .unpivot(index='production_company')
)

production_company,variable,value
str,str,f64
"""Dark Castle Entertainment""","""remake_pct""",0.333333
"""Millennium Films""","""remake_pct""",0.230769
"""Peacock""","""remake_pct""",0.230769
"""Constantin Film""","""remake_pct""",0.230769
"""Bad Robot""","""remake_pct""",0.230769
…,…,…
"""Platinum Dunes""","""sequel_pct""",0.333333
"""Ghost House Pictures""","""sequel_pct""",0.142857
"""Alphaville Films""","""sequel_pct""",0.2
"""GK Films""","""sequel_pct""",0.1


In [10]:
# Bar chart of the ten companies (at least 10 films each) with the largest
# combined sequel-or-remake share.
top_redo = (
    production_companies
    .sort('redo_pct', descending=True)
    .filter(pl.col('count') >= 10)
    .head(10)
    .select('remake_pct', 'sequel_pct', 'production_company', 'redo_pct')
)
px.bar(top_redo, x='production_company', y='redo_pct')

In [14]:
# Stacked bars of remake share and sequel share for the ten companies (at
# least 10 films each) with the highest remake share.
# Caveat: a film may be flagged as both a sequel and a remake, in which case it
# contributes to both segments, so the stacked height can exceed redo_pct.
top_remake_long = (
    production_companies
    .sort('remake_pct', descending=True)
    .filter(pl.col('count') >= 10)
    .head(10)
    .select('remake_pct', 'sequel_pct', 'production_company')
    .unpivot(index='production_company')
)
px.bar(
    top_remake_long,
    x='production_company',
    y='value',
    color='variable',
    barmode='stack',
)

In [12]:
# Spot-check the aggregated figures for a single company.
production_companies.filter(pl.col('production_company').eq('Bad Robot'))

production_company,count,remakes,sequels,redos,remake_pct,sequel_pct,redo_pct,size
str,u32,u32,u32,u32,f64,f64,f64,f64
"""Bad Robot""",13,3,5,7,0.230769,0.384615,0.538462,2.564949


In [13]:
# Sequel count against total output per company on log-log axes, coloured by
# sequel share.
# `labels` takes a dict mapping column names to display text, not a column
# name; the company name is shown through `hover_name` instead.
# Heads-up: a company with zero sequels has no position on a log-scaled y-axis.
px.scatter(
    production_companies,
    x='count',
    y='sequels',
    color='sequel_pct',
    hover_name='production_company',
    labels={
        'count': 'Number of Movies',
        'sequels': 'Number of Sequels',
        'sequel_pct': 'Sequel Share',
    },
    log_x=True,
    log_y=True,
)