In [None]:
import pandas as pd
import numpy as np
import re

In [4]:
# Read the scraped Filmweb ranking into a DataFrame; the first CSV column
# (the ranking "position") is used as the index.
filmweb_df = pd.read_csv(
    filepath_or_buffer='filmweb_top500.csv',
    index_col=0,
)

Creating a new column with the duration converted to minutes to allow calculations on the duration data.

In [8]:
def duration_to_minutes(duration):
    """Convert a duration string such as ``'2h 22m'`` into whole minutes.

    The text may contain an hours part (``<digits>h``), a minutes part
    (``<digits>m``), or both, e.g. ``'2h 22m'``, ``'3h'`` or ``'45m'``.

    Parameters
    ----------
    duration : str
        Raw duration text. Non-string values (``None``, float ``NaN``) are
        treated as missing.

    Returns
    -------
    int or None
        Total number of minutes, or ``None`` when the value is missing or
        contains neither an hours nor a minutes component.
    """
    # Missing cells are not strings and have no .strip(); treat them as
    # "no duration" instead of raising.
    if not isinstance(duration, str):
        return None

    match = re.match(r'(?:(\d+)h)?\s*(?:(\d+)m)?', duration.strip())
    hours_part, minutes_part = match.groups()

    # Both groups are optional, so the pattern matches *any* string (possibly
    # as an empty match). Require at least one captured component so that
    # empty or unrecognised text is reported as None rather than 0 minutes.
    if hours_part is None and minutes_part is None:
        return None

    hours = int(hours_part) if hours_part else 0
    minutes = int(minutes_part) if minutes_part else 0
    return hours * 60 + minutes

# Add a numeric companion column next to the raw "duration" text.
filmweb_df['duration_minutes'] = (
    filmweb_df.loc[:, 'duration'].apply(duration_to_minutes)
)

In [11]:
# Spot-check: raw duration text side by side with the converted minutes.
filmweb_df.loc[:, ['duration', 'duration_minutes']].head()

Unnamed: 0_level_0,duration,duration_minutes
position,Unnamed: 1_level_1,Unnamed: 2_level_1
1,2h 22m,142
2,3h 8m,188
3,1h 52m,112
4,2h 55m,175
5,1h 36m,96


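Parsing the premiere date columns into proper datetime values. Full dates are given as `DD.MM.YYYY`; entries that contain only a Polish month name and a year (e.g. `listopada 1975`) are mapped to the first day of that month, and anything that cannot be parsed becomes a missing value.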

In [None]:
# Polish month names, in the form they appear in month-only dates such as
# "listopada 1975", mapped to zero-padded month numbers.
months = {
    'stycznia': '01', 'lutego': '02', 'marca': '03', 'kwietnia': '04', 'maja': '05', 'czerwca': '06',
    'lipca': '07', 'sierpnia': '08', 'września': '09', 'października': '10', 'listopada': '11', 'grudnia': '12'
}

def parse_date(date_str):
    """Parse a premiere date string into a ``pandas.Timestamp``.

    Two input shapes are recognised at the start of the (stripped) text:

    * a full date ``D.M.YYYY`` / ``DD.MM.YYYY`` (e.g. ``'10.09.1994'``);
    * a month name from ``months`` followed by a four-digit year
      (e.g. ``'listopada 1975'``), which is mapped to the first day of that
      month.

    Any text following the recognised date is ignored in both cases.

    Parameters
    ----------
    date_str : str or missing value
        Raw date text; ``None`` / ``NaN`` are treated as missing.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``NaT`` when the value is missing, has an
        unrecognised shape, or is not a valid calendar date.
    """
    if pd.isna(date_str):
        # NaT (not float NaN) keeps the missing marker consistent with the
        # value produced by errors='coerce' below.
        return pd.NaT
    date_str = str(date_str).strip()

    # Full numeric date. Only the captured components are parsed, so trailing
    # text after the date no longer turns a valid date into NaT; (?!\d)
    # prevents accepting the first four digits of a longer number as the year.
    full_date = re.match(r'(\d{1,2})\.(\d{1,2})\.(\d{4})(?!\d)', date_str)
    if full_date:
        day, month, year = full_date.groups()
        return pd.to_datetime(
            f'{int(day):02d}.{int(month):02d}.{year}',
            format='%d.%m.%Y',
            errors='coerce',  # impossible dates such as 31.02 become NaT
        )

    # Month name + year: approximate with the first day of that month.
    month_year = re.match(r'([a-ząćęłńóśźż]+)\s+(\d{4})', date_str, re.IGNORECASE)
    if month_year:
        month = months.get(month_year.group(1).lower())
        year = month_year.group(2)
        if month:
            return pd.to_datetime(f'01.{month}.{year}', format='%d.%m.%Y', errors='coerce')

    return pd.NaT


# Parse each raw premiere column into a "<name>_parsed" datetime column.
premiere_columns = {
    'world_premiere': 'world_premiere_parsed',
    'polish_premiere': 'polish_premiere_parsed',
}
for raw_col, parsed_col in premiere_columns.items():
    filmweb_df[parsed_col] = filmweb_df[raw_col].apply(parse_date)

In [40]:
# Spot-check: raw premiere strings next to their parsed counterparts.
filmweb_df.loc[
    :,
    ['world_premiere', 'world_premiere_parsed', 'polish_premiere', 'polish_premiere_parsed'],
].head(10)

Unnamed: 0_level_0,world_premiere,world_premiere_parsed,polish_premiere,polish_premiere_parsed
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,10.09.1994,1994-09-10,16.04.1995,1995-04-16
2,06.12.1999,1999-12-06,24.03.2000,2000-03-24
3,23.09.2011,2011-09-23,13.04.2012,2012-04-13
4,14.03.1972,1972-03-14,31.12.1972,1972-12-31
5,10.04.1957,1957-04-10,listopada 1959,1959-11-01
6,23.06.1994,1994-06-23,04.11.1994,1994-11-04
7,listopada 1975,1975-11-01,31.12.1975,1975-12-31
8,12.12.1974,1974-12-12,,NaT
9,01.12.2003,2003-12-01,01.01.2004,2004-01-01
10,30.11.1993,1993-11-30,31.12.1993,1993-12-31


Removing currency symbols and whitespace, then converting data to numeric format to allow arithmetic operations on financial data like box office and budget.

In [34]:
def money_to_float(money_str):
    """Convert a formatted dollar amount such as ``'$28 884 504'`` to a float.

    Dollar signs, comma separators and all whitespace are removed before the
    conversion. Whitespace includes non-breaking and thin spaces, which are
    frequently used as thousands separators in scraped, locale-formatted
    numbers and would otherwise make ``float()`` fail.

    Parameters
    ----------
    money_str : str or missing value
        Raw amount text; ``None`` / ``NaN`` are treated as missing.

    Returns
    -------
    float
        The numeric amount, or ``numpy.nan`` when the value is missing or
        cannot be interpreted as a number.
    """
    if pd.isna(money_str):
        return np.nan
    # str.split() with no argument splits on any Unicode whitespace, so
    # re-joining the pieces drops regular, non-breaking and thin spaces alike.
    without_whitespace = ''.join(str(money_str).split())
    cleaned = without_whitespace.replace('$', '').replace(',', '')
    try:
        return float(cleaned)
    except ValueError:
        # Deliberate best effort: unparseable text is reported as missing.
        return np.nan

# Convert every formatted money column into a numeric "<name>_in$" column.
money_columns = {
    'boxoffice': 'boxoffice_in$',
    'boxoffice_usa': 'boxoffice_usa_in$',
    'boxoffice_outside_usa': 'boxoffice_outside_usa_in$',
    'budget': 'budget_in$',
}
for raw_col, numeric_col in money_columns.items():
    filmweb_df[numeric_col] = filmweb_df[raw_col].apply(money_to_float)


In [39]:
# Spot-check: each raw money column next to its numeric conversion.
filmweb_df.loc[
    :,
    [
        'boxoffice', 'boxoffice_in$',
        'boxoffice_usa', 'boxoffice_usa_in$',
        'boxoffice_outside_usa', 'boxoffice_outside_usa_in$',
        'budget', 'budget_in$',
    ],
].head()

Unnamed: 0_level_0,boxoffice,boxoffice_in$,boxoffice_usa,boxoffice_usa_in$,boxoffice_outside_usa,boxoffice_outside_usa_in$,budget,budget_in$
position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,$28 884 504,28884504.0,$28 767 189,28767189.0,$117 315,117315.0,$25 000 000,25000000.0
2,$286 801 374,286801374.0,$136 801 374,136801374.0,$150 000 000,150000000.0,$60 000 000,60000000.0
3,$426 588 510,426588510.0,$10 198 820,10198820.0,$416 389 690,416389690.0,$10 800 000,10800000.0
4,$250 341 816,250341816.0,$136 381 073,136381073.0,$113 960 743,113960743.0,$6 000 000,6000000.0
5,$337 000,337000.0,,,,,,


In [42]:
# Persist the frame, including all derived columns and the index, as a new CSV.
filmweb_df.to_csv(path_or_buf='top500_transformed.csv')