In [1]:
from ast import literal_eval

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the raw movie metadata. low_memory=False asks pandas to infer column
# types from the whole file instead of chunk by chunk.
data_frame = pd.read_csv(
    "../ProjectResources/movies/movies_metadata.csv",
    low_memory=False,
)

# Build the working frame without modifying data_frame: assign() returns a new
# DataFrame. Budget entries that cannot be parsed as numbers become NaN.
work_copy = data_frame.assign(
    budget=pd.to_numeric(data_frame['budget'], errors='coerce')
)

# Clean the data: keep only the columns used downstream and require a value in
# every one of them.
model_columns = ['budget', 'genres', 'vote_count', 'vote_average', 'revenue']
work_copy = work_copy.dropna(subset=model_columns)[model_columns]

# Drop data with bad values: a zero in revenue, budget or vote_count is treated
# as bad. Collect the labels of all such rows and remove them in a single pass.
has_bad_value = (
    (work_copy['revenue'] == 0)
    | (work_copy['budget'] == 0)
    | (work_copy['vote_count'] == 0)
)
work_copy = work_copy.drop(index=work_copy.index[has_bad_value.to_numpy()])

# The genres column stores, as text, a list of {id: #, name: "genre_name"}
# entries, and one movie can have several. Parse the text, then expand the frame
# so each row carries a single genre; the rows created for one movie all keep
# that movie's index label.
parsed_genres = work_copy['genres'].apply(literal_eval)
exploded = work_copy.assign(genres=parsed_genres).explode("genres")

# A movie whose genre list is empty explodes to a missing value; remove those
# rows, then replace each remaining entry with just its genre name.
exploded = exploded.dropna(subset=['genres'])
work_copy = exploded.assign(
    genres=[entry.get('name') for entry in exploded['genres']]
)

# Encode the genre names as integer codes.
# The fitted encoder is kept in `genre_encoder` (instead of being discarded) so
# the codes can be translated back to names with
# genre_encoder.inverse_transform(...) or looked up in genre_encoder.classes_,
# and so the same mapping can be reused on new data with transform(...).
# NOTE(review): the integer codes carry no meaningful order; if genre_int is fed
# to a model as a numeric feature, consider one-hot encoding instead.
genre_encoder = LabelEncoder()
work_copy['genre_int'] = genre_encoder.fit_transform(work_copy['genres'])

# Create training and test sets.
# After the explode step a movie with several genres occupies several rows that
# share the same index label and the same budget / vote / revenue values. A plain
# row-wise split would scatter those near-duplicate rows across both sets and
# leak test information into training. Grouping on the index label keeps every
# movie entirely in one set.
# test_size is therefore the fraction of movies (not rows) held out, so the row
# counts are only approximately 80/20.
movie_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=27)
train_rows, test_rows = next(
    movie_splitter.split(work_copy, groups=work_copy.index)
)
train_set = work_copy.iloc[train_rows]
test_set = work_copy.iloc[test_rows]


In [4]:
# Summarise the raw frame exactly as read from the CSV (dtype and non-null count
# per column); the cleaning steps operate on work_copy, not on data_frame.
data_frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [2]:
# Summarise the cleaned working frame, which holds one row per (movie, genre)
# pair after the explode step.
work_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14000 entries, 0 to 45422
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   budget        14000 non-null  float64
 1   genres        14000 non-null  object 
 2   vote_count    14000 non-null  float64
 3   vote_average  14000 non-null  float64
 4   revenue       14000 non-null  float64
 5   genre_int     14000 non-null  int32  
dtypes: float64(4), int32(1), object(1)
memory usage: 710.9+ KB


In [3]:
# Class-balance check: number of rows per encoded genre. A movie is counted once
# for each genre it belongs to.
work_copy["genre_int"].value_counts()

genre_int
6     2584
3     1850
17    1504
0     1414
14    1015
1      960
4      862
15     635
11     586
7      532
8      511
13     443
2      293
10     235
18     204
12     192
19      89
5       58
9       32
16       1
Name: count, dtype: int64