In [1]:
import numpy as np
import pandas as pd

# 1980 - 2025 데이터 합치기 & 전처리

In [2]:
def transform_duration_minutes(df):
    """
    1. 'Duration' 컬럼의 문자열로 되어있는 상영시간을 분석하여,
       'Duration_minute'라는 새 컬럼에 int타입으로 전체 상영시간 분(minute)으로 변환
    2. 'Duration' 컬럼 제거
    3. 결측치(NaN)는 중앙값(median)으로 대체
    """

    # helper function
    def parse_duration(duration):
        if not isinstance(duration, str) or duration.strip() == "":
            return np.nan
        hours, minutes = 0, 0

        if 'h' in duration:
            h_split = duration.split('h')
            hours = int(h_split[0].strip())
            if 'm' in h_split[1]:
                minutes = int(h_split[1].replace('m', '').strip())
        elif 'm' in duration:
            minutes = int(duration.replace('m', '').strip())
        
        return hours * 60 + minutes

    # 1. 변환
    df['Duration_minute'] = df['Duration'].apply(parse_duration)

    # 2. Drop original column
    df.drop(columns=['Duration'], inplace=True)

    # 3. 결측값 -> 중앙값으로 대체
    median_duration = df['Duration_minute'].median()
    df['Duration_minute'] = df['Duration_minute'].fillna(median_duration)

    return df


In [3]:
## 1980 - 2025 데이터 합치기

# 계속 합칠(업데이트 할) DataFrame 틀 만들기
movie_df = pd.DataFrame(columns=['Title', 'Year', 'MPA', 'budget', 'countries_origin', 'genres', 'Duration_minute'])
#display(movie_df)

years = list(range(1980, 2026))
for year in years:
    path = f'./movie_data/{year}/merged_movies_data_{year}.csv'
    temp_df = pd.read_csv(path)
    
    # 필요 없는 column 제거 (['Rating', 'Votes', 'méta_score', 'description', 'Movie Link', 'writers', 'directors', 'stars', 'opening_weekend_Gross', 'grossWorldWWide', 'gross_US_Canada', 'filming_locations', 'production_company', 'awards_content', 'Languages', 'release_date'])
    temp_df = temp_df.drop(['Rating','Votes', 'méta_score', 'description', 'Movie Link', 'writers', 'directors', 'stars', 'opening_weekend_Gross', 'grossWorldWWide', 'gross_US_Canada', 'filming_locations', 'production_company', 'awards_content', 'Languages', 'release_date'], axis=1)
    
    # Duration 변환: e.g. 2h 4m -> 124
    temp_df = transform_duration_minutes(temp_df)
    
    # MPA(상영등급) 전처리: 'Not Rated' 및 결측 상영등급 'Unrated'로 변환
    temp_df['MPA'] = temp_df['MPA'].fillna('Unrated')
    temp_df.loc[temp_df['MPA'] == 'Not Rated', 'MPA'] = 'Unrated'
    
    # countries_origin(상영국가) 결측치 처리: 'Unknown'으로 바꿈
    temp_df['countries_origin'] = temp_df['countries_origin'].fillna("['Unknown']")
    
    # 기존 dataframe이랑 병합
    movie_df = pd.concat([movie_df,temp_df], axis=0) # 행 기준 병합
    

  movie_df = pd.concat([movie_df,temp_df], axis=0) # 행 기준 병합


In [4]:
movie_df.to_csv('movie_1980_2025.csv', index=False)

In [5]:
display(movie_df[movie_df['countries_origin']=="['Unknown']"].head(2))
print(type(movie_df['countries_origin'].iloc[0]))


Unnamed: 0,Title,Year,MPA,budget,countries_origin,genres,Duration_minute
67,68. How to Become a Detective,1980,Unrated,,['Unknown'],[],97.0
338,339. Black Hawk Down,1997,Unrated,,['Unknown'],['Documentary'],57.0


<class 'str'>
