In [1]:
import os
import polars as pl
from tqdm import tqdm

Load the descriptions into a DataFrame alongside the movie list. Consider saving the result as Parquet to reduce file size.

Then parse out whether each film is a remake, and save the result as a clean dataset.

Then let the analysis begin.

In [2]:
# Directory holding the downloaded description files (one .txt per page,
# presumably fetched from the Wikipedia summary endpoint below).
filepath = 'data/raw/descriptions/'
# Sorted so the load order is reproducible across machines, and restricted to
# .txt so stray files (e.g. OS metadata) are never read as descriptions. Only
# names ending in '.txt' can match the `filename` key built below anyway.
descriptions = sorted(f for f in os.listdir(filepath) if f.endswith('.txt'))
# Not used by the cells visible here; kept for any later cell that needs it.
prefix = 'https://en.wikipedia.org/api/rest_v1/page/summary/'
# Build the join key: "/wiki/Some_Page" -> "Some_Page" -> "Some_Page.txt".
movies = pl.read_csv('data/raw/movies_list_with_links.csv').with_columns(
    link = pl.col('title_href').str.replace('/wiki/','')
).with_columns(
    # concat_str yields null when `link` is null, so rows without a link are
    # dropped by the filter below.
    filename= pl.concat_str(
        pl.col('link'),
        pl.lit('.txt')
    )
).filter(pl.col('filename').is_not_null()).unique(
    # Deterministic de-duplication: keep the first CSV row for each file name
    # and preserve CSV order, instead of an arbitrary row in arbitrary order.
    'filename', keep='first', maintain_order=True
)

In [3]:
# Read every description into memory, keyed by its file name.
files = {}
for f in tqdm(descriptions):
    # Explicit encoding: without it open() falls back to the platform locale,
    # which can mis-decode or fail on non-ASCII text on some systems.
    # NOTE(review): assumes the files were written as UTF-8 - confirm against
    # the download step.
    with open(f'{filepath}{f}', encoding='utf-8') as txt_file:
        files[f] = txt_file.read()

  0%|          | 0/11824 [00:00<?, ?it/s]

100%|██████████| 11824/11824 [00:02<00:00, 4406.99it/s]


In [4]:
# One row per description file: the file name (join key) and its full text.
# Dict keys are already distinct and non-null, so the unique/filter steps act
# only as a guard that mirrors the preparation of `movies`.
descriptions_df = (
    pl.DataFrame({
        'filename': list(files),
        'description': list(files.values()),
    })
    .unique('filename')
    .filter(pl.col('filename').is_not_null())
)

In [5]:
# Row count of the description table (should equal the number of files read).
descriptions_df.height

11824

In [6]:
# Sanity check: `filename` must be a unique key on the movies side of the join.
movies['filename'].n_unique() == movies.height

True

In [7]:
# Sanity check: `filename` must be a unique key on the descriptions side too.
descriptions_df['filename'].n_unique() == descriptions_df.height

True

In [12]:
# Attach each film's description text. The inner join drops films without a
# description file as well as description files that match no film.
joined = movies.join(descriptions_df, how='inner', on='filename')

In [54]:
# Derive keyword flags from the description text and a real release date from
# the separate month / day / year columns.
lowered = pl.col('description_lower')

with_context = (
    joined
    # Lower-case once so the keyword checks below are case-insensitive.
    .with_columns(description_lower=pl.col('description').str.to_lowercase())
    .with_columns(
        # Plain substring checks: true when either keyword appears anywhere
        # in the description.
        sequel=lowered.str.contains('sequel') | lowered.str.contains('prequel'),
        remake=lowered.str.contains('remake') | lowered.str.contains('reboot'),
        # Space-joined text such as "MARCH 28 2003", parsed in the next step.
        date=pl.concat_str(['month', 'day', 'year'], separator=' '),
    )
    .with_columns(date_parsed=pl.col('date').str.to_date(format='%B %d %Y'))
)
with_context

year,month,day,title,production_company,cast_and_crew,title_text,title_href,production_company_text,production_company_href,link,filename,description,description_lower,sequel,remake,date,date_parsed
i64,str,i64,str,str,str,str,str,str,str,str,str,str,str,bool,bool,str,date
2003,"""MARCH""",28,"""('Assassination Tango', '/wiki…","""('United Artists / American Zo…","""('Robert Duvall (director/scre…","""Assassination Tango""","""/wiki/Assassination_Tango""","""United Artists / American Zoet…","""/wiki/United_Artists""","""Assassination_Tango""","""Assassination_Tango.txt""","""Assassination Tango is a 2002 …","""assassination tango is a 2002 …",false,false,"""MARCH 28 2003""",2003-03-28
1977,"""MARCH""",11,"""('Black Sunday', '/wiki/Black_…","""('Paramount Pictures', '/wiki/…","""('John Frankenheimer (director…","""Black Sunday""","""/wiki/Black_Sunday_(1977_film)""","""Paramount Pictures""","""/wiki/Paramount_Pictures""","""Black_Sunday_(1977_film)""","""Black_Sunday_(1977_film).txt""","""Black Sunday is a 1977 America…","""black sunday is a 1977 america…",false,false,"""MARCH 11 1977""",1977-03-11
1995,"""FEBRUARY""",24,"""('The Walking Dead', '/wiki/Th…","""('Savoy Pictures', '/wiki/Savo…","""('Preston A. Whitmore II (dire…","""The Walking Dead""","""/wiki/The_Walking_Dead_(1995_f…","""Savoy Pictures""","""/wiki/Savoy_Pictures""","""The_Walking_Dead_(1995_film)""","""The_Walking_Dead_(1995_film).t…","""The Walking Dead is a 1995 war…","""the walking dead is a 1995 war…",false,false,"""FEBRUARY 24 1995""",1995-02-24
2000,"""OCTOBER""",3,"""('Scooby-Doo and the Alien Inv…","""('Warner Home Video', '/wiki/W…","""('Jim Stenstrum (director); Da…","""Scooby-Doo and the Alien Invad…","""/wiki/Scooby-Doo_and_the_Alien…","""Warner Home Video""","""/wiki/Warner_Bros._Home_Entert…","""Scooby-Doo_and_the_Alien_Invad…","""Scooby-Doo_and_the_Alien_Invad…","""Scooby-Doo and the Alien Invad…","""scooby-doo and the alien invad…",false,false,"""OCTOBER 3 2000""",2000-10-03
2007,"""MARCH""",16,"""('I Think I Love My Wife', '/w…","""('Fox Searchlight Pictures', '…","""('Chris Rock (director/screenp…","""I Think I Love My Wife""","""/wiki/I_Think_I_Love_My_Wife""","""Fox Searchlight Pictures""","""/wiki/Searchlight_Pictures""","""I_Think_I_Love_My_Wife""","""I_Think_I_Love_My_Wife.txt""","""I Think I Love My Wife is a 20…","""i think i love my wife is a 20…",false,true,"""MARCH 16 2007""",2007-03-16
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
1997,"""DECEMBER""",31,"""('Oscar and Lucinda', '/wiki/O…","""('Fox Searchlight Pictures', '…","""('Gillian Armstrong (director)…","""Oscar and Lucinda""","""/wiki/Oscar_and_Lucinda_(film)""","""Fox Searchlight Pictures""","""/wiki/Searchlight_Pictures""","""Oscar_and_Lucinda_(film)""","""Oscar_and_Lucinda_(film).txt""","""Oscar and Lucinda is a 1997 ro…","""oscar and lucinda is a 1997 ro…",false,false,"""DECEMBER 31 1997""",1997-12-31
1987,"""JUNE""",5,"""('Cyclone', '/wiki/Cyclone_(19…","""('CineTel Films', '/wiki/CineT…","""('Fred Olen Ray (director/scre…","""Cyclone""","""/wiki/Cyclone_(1987_film)""","""CineTel Films""","""/wiki/CineTel_Films""","""Cyclone_(1987_film)""","""Cyclone_(1987_film).txt""","""Cyclone is a 1987 science fict…","""cyclone is a 1987 science fict…",false,false,"""JUNE 5 1987""",1987-06-05
2020,"""OCTOBER""",6,"""('The Lie', '/wiki/The_Lie_(20…","""('Amazon Studios / Blumhouse P…","""('Veena Sud (director/screenpl…","""The Lie""","""/wiki/The_Lie_(2018_film)""","""Amazon Studios / Blumhouse Pro…","""/wiki/Amazon_Studios""","""The_Lie_(2018_film)""","""The_Lie_(2018_film).txt""","""The Lie is a 2018 psychologica…","""the lie is a 2018 psychologica…",false,true,"""OCTOBER 6 2020""",2020-10-06
2003,"""JULY""",2,"""('Sinbad: Legend of the Seven …","""('DreamWorks', '/wiki/DreamWor…","""('Tim Johnson, Patrick Gilmore…","""Sinbad: Legend of the Seven Se…","""/wiki/Sinbad:_Legend_of_the_Se…","""DreamWorks""","""/wiki/DreamWorks_Pictures""","""Sinbad:_Legend_of_the_Seven_Se…","""Sinbad:_Legend_of_the_Seven_Se…","""Sinbad: Legend of the Seven Se…","""sinbad: legend of the seven se…",false,false,"""JULY 2 2003""",2003-07-02
