In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Data/merged_movies_data.csv')

In [3]:
# 이상치 및 결측치 확인
# 상영시간이 잘못 들어간 값 확인
# df[df['Duration'].str.contains('h|m') == False]
# 값이 모두 0인 컬럼 확인
# df['wins'].value_counts()

In [5]:
# Duration 중에 h와 m이 존재하지 않는 row 확인
# Duration 에 MPA 데이터가 들어있는 row 확인 -> MPA로 변경필요 및 기존 Druation은 NaN으로 변경
for i in df[df['Duration'].str.contains('h|m') == False].index:
    df.loc[i, 'MPA'] = df.loc[i, 'Duration']
    df.loc[i, 'Duration'] = np.nan

In [6]:
# 데이터 형식 변경
# Duration 컬럼의 데이터 형식을 변경
def convert_duration_to_minutes(duration):
    if pd.isna(duration):  # NaN 체크
        return duration  # NaN 값 pass
    hours = 0
    minutes = 0

    # 시간 값 추출 (예: '2h')
    if 'h' in duration:
        hours = int(duration.split('h')[0].strip())

    # 분 값 추출 (예: '22m')
    if 'm' in duration:
        minutes_part = duration.split('h')[-1].strip()  # '22m' 추출
        minutes = int(minutes_part.replace('m', '').strip())

    # 총 분으로 변환
    return hours * 60 + minutes

# Duration 컬럼 변환
df['Duration'] = df['Duration'].apply(convert_duration_to_minutes)

In [7]:
# Votes 데이터 변환
# votes 값을 숫자로 변환하는 함수 (1.4k -> 1400)
def convert_votes(votes_value):
    if pd.isna(votes_value):  # NaN 값 처리
        return np.nan
    elif 'K' in votes_value:  # 'K'가 있는 경우
        return float(votes_value.replace('K', '')) * 1000
    elif 'M' in votes_value:  # 'M'이 있는 경우
        return float(votes_value.replace('M', '')) * 1000000
    else:  # 기타 값
        return float(votes_value)

# 함수 적용 
df['Votes'] = df['Votes'].apply(convert_votes)


In [18]:
# 불필요 컬럼 삭제
df = df.drop('Movie Link', axis=1) # Movie Link 컬럼 삭제 (분석에 불필요)
df = df.drop('release_date', axis=1) # release_date 컬럼 삭제 (year 컬럼과 중복
df = df.drop('wins', axis=1) # 값이 모두 0인 컬럼 삭제
df.info()

KeyError: "['Movie Link'] not found in axis"

In [9]:
# 결측치 대체
# 수익 관련 데이터 결측치 처리
df['grossWorldWide'] = df['grossWorldWide'].fillna(0)
df['gross_US_Canada'] = df['gross_US_Canada'].fillna(0)
df['opening_weekend_Gross'] = df['opening_weekend_Gross'].fillna(0)

# 등급 관련 데이터 결측치 처리
df['MPA'] = df['MPA'].fillna("Not Rated")

# 평점 관련 데이터 결측치 처리
df['Rating'] = df['Rating'].fillna(0)
df['Votes'] = df['Votes'].fillna(0)

# 상영시간 관련 데이터 결측치 처리
df['Duration'] = df['Duration'].fillna(df['Duration'].mean())

# 빈 배열인 값 (결측치) 처리
df['directors'] = df['directors'].replace('[]', "['Unknown Director']")
df['writers'] = df['writers'].replace('[]', "['Unknown Writer']")
df['stars'] = df['stars'].replace('[]', "['Unknown Cast']")
df['countries_origin'] = df['countries_origin'].replace('[]', "['Unknown Country']")
df['filming_locations'] = df['filming_locations'].replace('[]', "['Unknown Location']")
df['production_companies'] = df['production_companies'].replace('[]', "['Unknown Studio']")
df['Languages'] = df['Languages'].replace('[]', "['Unknown Language']")

In [11]:
df.budget.isnull().sum()

np.int64(5951)

In [12]:
# 장르별 예산이 없는 데이터의 경우 동일한 장르의 평균 예산으로 대체
df['budget'] = df['budget'].fillna(df.groupby('genres')['budget'].transform('mean'))
df.budget.isnull().sum()

np.int64(1918)

In [13]:
# 예산이 없는 데이터의 인덱스 추출
# indices = df[df['budget'].isnull()].index
# display(df[df.budget.isnull()])

In [14]:
genre_groups = {
    "Epic": ["Action Epic", "Fantasy Epic", "Adventure Epic", "War Epic", "Historical Epic", "Western Epic", "Romantic Epic"],
    "Fantasy": ["Fantasy", "Supernatural Fantasy", "Dark Fantasy", "Sword & Sorcery"],
    "Action": ["Action", "Car Action", "One-Person Army Action", "Martial Arts", "Gun Fu"],
    "Comedy": ["Comedy", "High-Concept Comedy", "Dark Comedy", "Slapstick", "Quirky Comedy", "Teen Comedy", "Raunchy Comedy", "Parody", "Screwball Comedy", "Stoner Comedy", "Body Swap Comedy"],
    "Horror": ["Horror", "Splatter Horror", "Slasher Horror", "Zombie Horror", "Monster Horror", "Vampire Horror", "Werewolf Horror", "Witch Horror", "Psychological Horror", "Body Horror", "Found Footage Horror", "Folk Horror", "B-Horror", "Kaiju"],
    "Romance": ["Romance", "Feel-Good Romance", "Holiday Romance", "Steamy Romance", "Tragic Romance", "Teen Romance", "Holiday Family", "Romantic Comedy"],
    "Adventure": ["Adventure", "Jungle Adventure", "Mountain Adventure", "Sea Adventure", "Desert Adventure", "Urban Adventure", "Globetrotting Adventure", "Teen Adventure"],
    "Drama": ["Drama", "Period Drama", "Legal Drama", "Crime Drama", "Medical Drama", "Political Drama", "Workplace Drama", "Teen Drama", "Psychological Drama", "Historical Drama", "Showbiz Drama", "Family Drama"]
}

# 장르를 카테고리로 묶는 함수 정의
def categorize_genre(genre: str):
    for category, genres in genre_groups.items():
        for g in genres:
            if g in genre:
                return category
        if category in genre:
            return category
    return "Other"

# 카테고리 컬럼 추가 및 장르별 카테고리 설정
df['Category'] = df['genres'].apply(categorize_genre)

# 카테고리별 예산이 없는 데이터의 확인
# display(df[df.budget.isnull()])
# df.budget.isnull().sum()

In [15]:
## 카테고리별 예산 평균값을 구하고 null 값에 대입하기
df['budget'] = df['budget'].fillna(df['Category'].map(df.groupby('Category')['budget'].mean()))

In [16]:
# indices 값이 잘 들어갔는지 확인
# df.loc[indices]

In [17]:
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10700 entries, 0 to 10699
Data columns (total 21 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Title                  10700 non-null  object 
 1   Year                   10700 non-null  int64  
 2   Duration               10700 non-null  float64
 3   MPA                    10700 non-null  object 
 4   Rating                 10700 non-null  float64
 5   Votes                  10700 non-null  float64
 6   budget                 10700 non-null  float64
 7   grossWorldWide         10700 non-null  float64
 8   gross_US_Canada        10700 non-null  float64
 9   opening_weekend_Gross  10700 non-null  float64
 10  directors              10700 non-null  object 
 11  writers                10700 non-null  object 
 12  stars                  10700 non-null  object 
 13  genres                 10700 non-null  object 
 14  countries_origin       10700 non-null  object 
 15  fi

Unnamed: 0,Year,Duration,Rating,Votes,budget,grossWorldWide,gross_US_Canada,opening_weekend_Gross,nominations,oscars
count,10700.0,10700.0,10700.0,10700.0,10700.0,10700.0,10700.0,10700.0,10700.0,10700.0
mean,2014.654766,106.090278,6.347729,54264.78,130307700.0,42863240.0,16369460.0,4950719.0,10.948598,0.099252
std,5.828472,20.382064,1.129014,137012.1,3037201000.0,141310200.0,53187280.0,16772510.0,28.736221,0.500172
min,2005.0,44.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2009.0,92.0,5.8,1400.0,3670882.0,70064.5,26645.5,4458.25,0.0,0.0
50%,2015.0,102.0,6.5,7900.0,14822670.0,1200102.0,204235.0,29400.0,0.0,0.0
75%,2020.0,116.0,7.1,42000.0,40000000.0,18303320.0,4197936.0,680274.2,10.0,0.0
max,2024.0,325.0,9.6,3000000.0,300000000000.0,2923706000.0,936662200.0,357115000.0,433.0,10.0
