In [1]:
import os
import polars as pl
from tqdm import tqdm

Load the descriptions into a DataFrame alongside the movie list. Possibly save the result as Parquet (or a similarly compact format) to reduce file size.

Then parse each description to determine whether the movie is a remake, and save the result as a clean dataset.

Then let the analysis begin.

In [2]:
# Raw inputs: a directory of per-movie description text files and the movie list CSV.
filepath = 'data/raw/descriptions/'
# sorted() because os.listdir returns entries in arbitrary order; a stable
# order keeps every downstream step reproducible across runs and machines.
descriptions = sorted(os.listdir(filepath))
# NOTE(review): not used by the cells visible here; presumably the endpoint the
# descriptions were fetched from -- confirm before removing.
prefix = 'https://en.wikipedia.org/api/rest_v1/page/summary/'

# Derive, for each movie, the name of the description file it should match:
# the href with its '/wiki/' part removed, plus a '.txt' extension.
# NOTE(review): the href text is used verbatim as the file stem -- confirm the
# download step saved files under exactly these names.
movies = (
    pl.read_csv('data/raw/movies_list_with_links.csv')
    .with_columns(link=pl.col('title_href').str.replace('/wiki/', ''))
    .with_columns(filename=pl.concat_str(pl.col('link'), pl.lit('.txt')))
    # concat_str yields null when the href is null; such rows cannot be matched.
    .filter(pl.col('filename').is_not_null())
    # keep='first' + maintain_order make the de-duplication deterministic:
    # the first CSV row for a given filename always wins and row order is stable.
    .unique('filename', keep='first', maintain_order=True)
)

In [3]:
# Read every description file into memory, keyed by its file name.
files = {}
for f in tqdm(descriptions):
    # Explicit encoding: without it open() uses the platform's locale encoding,
    # which can fail or garble non-ASCII characters on some systems.
    # Assumes the files were saved as UTF-8 -- confirm against the download step.
    with open(os.path.join(filepath, f), encoding='utf-8') as txt_file:
        files[f] = txt_file.read()

  0%|          | 0/11824 [00:00<?, ?it/s]

100%|██████████| 11824/11824 [00:02<00:00, 4406.99it/s]


In [4]:
# Tabulate the loaded texts: one row per description file, holding the file's
# name and its full contents, de-duplicated on name with null names dropped.
descriptions_df = (
    pl.DataFrame({
        'filename': list(files),
        'description': list(files.values()),
    })
    .unique('filename')
    .filter(pl.col('filename').is_not_null())
)

In [6]:
# Sanity check: True when no filename occurs more than once in the movie list.
movies['filename'].n_unique() == movies.height

True

In [7]:
# Sanity check: True when no filename occurs more than once among the descriptions.
descriptions_df['filename'].n_unique() == descriptions_df.height

True

In [12]:
# Attach each movie's description text, matching on filename. The inner join
# drops movies without a description file and description files without a movie.
joined = movies.join(descriptions_df, how='inner', on='filename')

In [63]:
# Build the cleaned, analysis-ready table: keyword flags for sequels/remakes
# plus a parsed release date, then persist it as both Parquet and CSV.
with_context = (
    joined
    .with_columns(
        # Lower-case once so the keyword searches below are case-insensitive.
        description_lower=pl.col('description').str.to_lowercase(),
        # Join the separate date parts into "<month name> <day> <year>" and
        # parse that string into a proper Date.
        release_date=pl.concat_str(
            'month',
            pl.lit(' '),
            'day',
            pl.lit(' '),
            'year',
        ).str.to_date(format='%B %d %Y'),
    )
    .with_columns(
        # str.contains treats its pattern as a regex, so '|' means "either word".
        # These are plain substring hits: any description mentioning the word is
        # flagged, whatever the surrounding context.
        sequel=pl.col('description_lower').str.contains('sequel|prequel'),
        remake=pl.col('description_lower').str.contains('remake|reboot'),
        # First day of the release month, for month-level grouping.
        release_month=pl.col('release_date').dt.month_start(),
    )
    .select(
        'title_text',
        'description',
        'description_lower',
        'sequel',
        'remake',
        'release_date',
        'release_month',
    )
)

# Create the output directory if needed so the writes succeed on a fresh checkout.
os.makedirs('data/processed', exist_ok=True)
with_context.write_parquet('data/processed/movie_data.parquet')
with_context.write_csv('data/processed/movie_data.csv')