# 1. ENVIRONMENT & DATA LOADING

In [1]:
import pandas as pd
import numpy as np
import ast
import matplotlib.pyplot as plt
import plotly
import plotly.offline as py
import seaborn as sns
import datetime

import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', 50)

In [2]:
df = pd.read_csv('../data/movies_metadata.csv')
df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [3]:
df.shape

(45466, 24)

In [4]:
credit_df = pd.read_csv('../data/credits.csv')
credit_df.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


# 2. DATA CLEANING & WRANGLING

# 2.1. Chuyển id từ string sang integer. Sau đó join movies_metadata với credits dựa trên id.

In [5]:
def convert_int(x):
    try:
        return int(x)
    except:
        return np.nan

In [6]:
df['id'] = df['id'].apply(convert_int)

In [7]:
df = df.merge(credit_df, on='id')

In [8]:
df.head(10)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,cast,crew
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862.0,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de..."
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844.0,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de..."
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602.0,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de..."
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357.0,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de..."
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862.0,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de..."
5,False,,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",,949.0,tt0113277,en,Heat,"Obsessive master thief, Neil McCauley leads a ...",...,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,A Los Angeles Crime Saga,Heat,False,7.7,1886.0,"[{'cast_id': 25, 'character': 'Lt. Vincent Han...","[{'credit_id': '52fe4292c3a36847f802916d', 'de..."
6,False,,58000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,11860.0,tt0114319,en,Sabrina,An ugly duckling having undergone a remarkable...,...,127.0,"[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Released,You are cordially invited to the most surprisi...,Sabrina,False,6.2,141.0,"[{'cast_id': 1, 'character': 'Linus Larrabee',...","[{'credit_id': '52fe44959251416c75039da9', 'de..."
7,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,45325.0,tt0112302,en,Tom and Huck,"A mischievous young boy, Tom Sawyer, witnesses...",...,97.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The Original Bad Boys.,Tom and Huck,False,5.4,45.0,"[{'cast_id': 2, 'character': 'Tom Sawyer', 'cr...","[{'credit_id': '52fe46bdc3a36847f810f797', 'de..."
8,False,,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9091.0,tt0114576,en,Sudden Death,International action superstar Jean Claude Van...,...,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Terror goes into overtime.,Sudden Death,False,5.5,174.0,"[{'cast_id': 1, 'character': 'Darren Francis T...","[{'credit_id': '52fe44dbc3a36847f80ae0f1', 'de..."
9,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710.0,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0,"[{'cast_id': 1, 'character': 'James Bond', 'cr...","[{'credit_id': '52fe426ec3a36847f801e14b', 'de..."


In [9]:
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'cast', 'crew'],
      dtype='object')

**Mô tả cột dữ liệu sau khi join**
- adult: Phim dành cho người lớn
- belongs_to_collection: Bộ sưu tập phim
- budget: Ngân sách
- genres: Thể loại phim
- homepage: Trang chủ
- id: Mã định danh
- imdb_id: Mã định danh trên IMDB
- original_language: Ngôn ngữ gốc
- original_title: Tựa đề gốc
- overview: Tổng quan
- popularity: Độ phổ biến
- poster_path: Đường dẫn ảnh bìa
- production_companies: Công ty sản xuất
- production_countries: Quốc gia sản xuất
- release_date: Ngày phát hành
- revenue: Doanh thu
- runtime: Thời lượng
- spoken_languages: Ngôn ngữ sử dụng
- status: Trạng thái
- tagline: Khẩu hiệu
- title: Tựa đề
- video: Video
- vote_average: Điểm đánh giá trung bình
- vote_count: Số lượt đánh giá
- cast: Diễn viên
- crew: Đội ngũ sản xuất

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45538 entries, 0 to 45537
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45538 non-null  object 
 1   belongs_to_collection  4500 non-null   object 
 2   budget                 45538 non-null  object 
 3   genres                 45538 non-null  object 
 4   homepage               7792 non-null   object 
 5   id                     45538 non-null  float64
 6   imdb_id                45521 non-null  object 
 7   original_language      45527 non-null  object 
 8   original_title         45538 non-null  object 
 9   overview               44584 non-null  object 
 10  popularity             45535 non-null  object 
 11  poster_path            45152 non-null  object 
 12  production_companies   45535 non-null  object 
 13  production_countries   45535 non-null  object 
 14  release_date           45451 non-null  object 
 15  re

# 2.2. Bỏ các cột không cần thiết để phân tích và xây dựng mô hình đề xuất phim.

**Bỏ đi các cột không cần thiết:
imdb_id, original_title**

In [11]:
df.drop(['imdb_id', 'original_title'], axis=1, inplace=True)

**Bỏ đi các phim adult vì không phù hợp với phân tích**

In [12]:
index_to_drop = df[df['adult']=='True'].index
index_to_drop

Index([19525, 28763, 31998, 32177, 39969, 39970, 40644, 41079, 43159], dtype='int64')

In [13]:
df.drop(index_to_drop, inplace=True)

# 2.3. Xử lý các giá trị thiếu trong các cột quan trọng: budget, revenue, runtime, vote_average

**Kiểm tra và xử lý cột doanh thu (revenue)**

In [14]:
df['revenue'] = pd.to_numeric(df['revenue'], errors='coerce')

In [15]:
df['revenue'].isna().sum()

np.int64(3)

In [16]:
(df['revenue'] == 0 ).sum()

np.int64(38098)

**Cột doanh thu (revenue) có 38107 giá trị là 0 và có 3 giá trị là NaN. Ta sẽ thay thế giá trị 0 bằng NaN.**

In [17]:
df['revenue'].replace(0, np.nan, inplace=True)
df[df['revenue'].isnull()].shape

(38101, 24)

**Thực hiện chuyển đổi budget sang numeric và thay thế các dòng giá trị 0 bằng NaN trong cột kinh phí (budget) tương tự với revenue.**

In [18]:
# adding the error='coerce' as there are are some unparsed data
df['budget'] = pd.to_numeric(df['budget'], errors='coerce')

# replace all the zeros to NaN
df['budget'].replace(0, np.nan, inplace= True)
df[df['budget'].isnull()].shape

(36618, 24)

*Lấp đầy các giá trị thiếu trong cột runtime và vote_average bằng mean của cột này.*

In [19]:
df['runtime'].isna().sum()

np.int64(260)

In [20]:
df['vote_average'].isnull().sum()

np.int64(3)

In [21]:
df['runtime'] = df['runtime'].fillna(df['runtime'].mean())
df['vote_average'] = df['vote_average'].fillna(df['vote_average'].mean())

# 2.4.Tạo các cột mới để hỗ trợ phân tích dữ liệu

**Tạo cột mới tên là return được tính bằng tỉ lệ giữa revenue và budget, nếu return > 1 là có lãi, còn < 1 là lỗ.**

In [22]:
df['return'] = df['revenue'] / df['budget']
# check how many of them are null
df[df['return'].isnull()].shape

(40136, 25)

**Tạo các cột ngày, tháng, năm tách ra từ relase date**

In [23]:
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
day_order = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']

In [24]:
# function to get the month and the day
def get_month(x):
    try:
        return month_order[int(str(x).split('-')[1]) - 1]
    except:
        return np.nan
    
def get_day(x):
    try:
        year, month, day = (int(i) for i in x.split('-'))    
        answer = datetime.date(year, month, day).weekday()
        return day_order[answer]
    except:
        return np.nan

In [25]:
# create features to get months and days
df['day'] = df['release_date'].apply(get_day)
df['month'] = df['release_date'].apply(get_month)

In [26]:
df['year'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: 
                                                                       str(x).split('-')[0] 
                                                                       if x != np.nan else np.nan)

# 3. Parsing dữ liệu từ các cột dạng JSON

In [27]:
df['production_companies'] = df['production_companies'].fillna('[]').apply(ast.literal_eval)
df['production_companies'] = df['production_companies'].apply(lambda x: [i['name'] for i in x] 
                                                              if isinstance(x, list) else [])
df['production_companies']

0                                [Pixar Animation Studios]
1        [TriStar Pictures, Teitler Film, Interscope Co...
2                           [Warner Bros., Lancaster Gate]
3                 [Twentieth Century Fox Film Corporation]
4             [Sandollar Productions, Touchstone Pictures]
                               ...                        
45533                                                   []
45534                                        [Sine Olivia]
45535                            [American World Pictures]
45536                                          [Yermoliev]
45537                                                   []
Name: production_companies, Length: 45529, dtype: object

In [28]:
# parse the dictionary data and create list of production countries 
df['production_countries'] = df['production_countries'].fillna('[]').apply(ast.literal_eval)
df['production_countries'] = df['production_countries'].apply(lambda x: [i['name'] for i in x] 
                                                              if isinstance(x, list) else [])
df['production_countries']

0        [United States of America]
1        [United States of America]
2        [United States of America]
3        [United States of America]
4        [United States of America]
                    ...            
45533                        [Iran]
45534                 [Philippines]
45535    [United States of America]
45536                      [Russia]
45537              [United Kingdom]
Name: production_countries, Length: 45529, dtype: object

In [29]:
df['cast'] = df['cast'].apply(ast.literal_eval)
df['cast']

0        [{'cast_id': 14, 'character': 'Woody (voice)',...
1        [{'cast_id': 1, 'character': 'Alan Parrish', '...
2        [{'cast_id': 2, 'character': 'Max Goldman', 'c...
3        [{'cast_id': 1, 'character': 'Savannah 'Vannah...
4        [{'cast_id': 1, 'character': 'George Banks', '...
                               ...                        
45533    [{'cast_id': 0, 'character': '', 'credit_id': ...
45534    [{'cast_id': 1002, 'character': 'Sister Angela...
45535    [{'cast_id': 6, 'character': 'Emily Shaw', 'cr...
45536    [{'cast_id': 2, 'character': '', 'credit_id': ...
45537                                                   []
Name: cast, Length: 45529, dtype: object

In [30]:
df['crew'] = df['crew'].apply(ast.literal_eval)
df['crew']

0        [{'credit_id': '52fe4284c3a36847f8024f49', 'de...
1        [{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...
2        [{'credit_id': '52fe466a9251416c75077a89', 'de...
3        [{'credit_id': '52fe44779251416c91011acb', 'de...
4        [{'credit_id': '52fe44959251416c75039ed7', 'de...
                               ...                        
45533    [{'credit_id': '5894a97d925141426c00818c', 'de...
45534    [{'credit_id': '52fe4af1c3a36847f81e9b15', 'de...
45535    [{'credit_id': '52fe4776c3a368484e0c8387', 'de...
45536    [{'credit_id': '533bccebc3a36844cf0011a7', 'de...
45537    [{'credit_id': '593e676c92514105b702e68e', 'de...
Name: crew, Length: 45529, dtype: object

In [31]:
df['cast_size'] = df['cast'].apply(lambda x: len(x))
df['crew_size'] = df['crew'].apply(lambda x: len(x))

In [32]:
df['cast'] = df['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
df['cast'].head()

0    [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...
1    [Robin Williams, Jonathan Hyde, Kirsten Dunst,...
2    [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...
3    [Whitney Houston, Angela Bassett, Loretta Devi...
4    [Steve Martin, Diane Keaton, Martin Short, Kim...
Name: cast, dtype: object

In [33]:
# now lets get the director name out of the crew column and create a new column as director
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
        return np.nan

In [34]:
df['director'] = df['crew'].apply(get_director)
df['director'].head()

0      John Lasseter
1                NaN
2      Howard Deutch
3    Forest Whitaker
4                NaN
Name: director, dtype: object

# 4. Khám phá dữ liệu (Exploratory Data Analysis - EDA)

# 5. Feature Engineering

In [35]:
def clean_numeric(x):
    try:
        return float(x)
    except:
        return np.nan

In [36]:
df['popularity'] = df['popularity'].apply(clean_numeric).astype('float')
df['vote_count'] = df['vote_count'].apply(clean_numeric).astype('float')
df['vote_average'] = df['vote_average'].apply(clean_numeric).astype('float')
df['year'] = df['year'].apply(clean_numeric).astype('float')

In [37]:
df['genres'] = df['genres'].fillna('[]').apply(ast.literal_eval).apply(lambda x: 
                                                                       [i['name'] for i in x] 
                                                                       if isinstance(x, list) else [])

# convert the dictionary form of spoken language to number of language it was released in
df['spoken_languages'] = df['spoken_languages'].fillna('[]').apply(ast.literal_eval).apply(lambda x: 
                                                                                           len(x) 
                                                                                           if isinstance(x, list)
                                                                                           else np.nan) 
     

In [38]:
df['genres']

0         [Animation, Comedy, Family]
1        [Adventure, Fantasy, Family]
2                   [Romance, Comedy]
3            [Comedy, Drama, Romance]
4                            [Comedy]
                     ...             
45533                 [Drama, Family]
45534                         [Drama]
45535       [Action, Drama, Thriller]
45536                              []
45537                              []
Name: genres, Length: 45529, dtype: object

In [39]:
df['spoken_languages']

0        1
1        2
2        1
3        1
4        1
        ..
45533    1
45534    1
45535    1
45536    0
45537    1
Name: spoken_languages, Length: 45529, dtype: int64

In [40]:
# we will remove some unwanted features
final_data = df.copy()
final_data = final_data.drop(['status', 'video', 'crew'], axis=1)

# create the genres dictionary to series for independent feature 
s = final_data.apply(lambda x: pd.Series(x['genres']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'genre'
gen_final = final_data.drop('genres', axis=1).join(s)
gen_final = gen_final[~gen_final['genre'].isna()]
genres_final = gen_final['genre'].drop_duplicates()

In [41]:
genres_final.head(genres_final.shape[0])

0            Animation
0               Comedy
0               Family
1            Adventure
1              Fantasy
2              Romance
3                Drama
5               Action
5                Crime
5             Thriller
11              Horror
13             History
23     Science Fiction
31             Mystery
40                 War
52             Foreign
61               Music
76         Documentary
207            Western
306           TV Movie
Name: genre, dtype: object

In [42]:
def feature_engineering(data):
    data['belongs_to_collection'] = data['belongs_to_collection'].apply(lambda x: 0 if pd.isna(x) else 1)
    
    for genre in genres_final:
        data['is_' + str(genre)] = data['genres'].apply(lambda x: 1 if genre in x else 0)
    
    data['genres'] = data['genres'].apply(lambda x: len(x))
    
    data['homepage'] = data['homepage'].apply(lambda x: 0 if x == np.nan else 1)
    
    # 1 for english and 0 for all
    data['is_english'] = data['original_language'].apply(lambda x: 1 if x=='en' else 0)
    data = data.drop('original_language', axis=1)
    
    # populate the numbers for production companies and countries
    data['production_companies'] = data['production_companies'].apply(lambda x: len(x))
    data['production_countries'] = data['production_countries'].apply(lambda x: len(x))
    
    # set 1 for friday else 0
    data['is_Friday'] = data['day'].apply(lambda x: 1 if x=='Fri' else 0)
    data = data.drop('day', axis=1)
    
    # set 1 for holiday else 0
    data['is_Holiday'] = data['month'].apply(lambda x: 1 if x in ['Apr', 'May', 'Jun', 'Nov'] else 0)
    data = data.drop('month', axis=1)
    #data = data.drop(['cast', 'director'], axis=1)
    
    # create dummies
    #data = pd.get_dummies(data, prefix='is')
    
    # fill the na's with mean
    data['runtime'] = data['runtime'].fillna(data['runtime'].mean())
    data['vote_average'] = data['vote_average'].fillna(data['vote_average'].mean())
    
    # convert the id to integer
    data['id'] = data['id'].apply(convert_int)
    
    # drop adult column
    data = data.drop(['adult'], axis =1)
    
    return data

In [43]:
ft_data = feature_engineering(final_data)

In [44]:
ft_data.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,id,overview,popularity,poster_path,production_companies,production_countries,...,is_Mystery,is_War,is_Foreign,is_Music,is_Documentary,is_Western,is_TV Movie,is_english,is_Friday,is_Holiday
0,1,30000000.0,3,1,862,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,1,1,...,0,0,0,0,0,0,0,1,0,0
1,0,65000000.0,3,1,8844,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,3,1,...,0,0,0,0,0,0,0,1,1,0
2,1,,2,1,15602,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,2,1,...,0,0,0,0,0,0,0,1,1,0
3,0,16000000.0,3,1,31357,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,1,1,...,0,0,0,0,0,0,0,1,1,0
4,1,,1,1,11862,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,2,1,...,0,0,0,0,0,0,0,1,1,0


In [45]:
ft_data.columns

Index(['belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'tagline', 'title', 'vote_average', 'vote_count',
       'cast', 'return', 'year', 'cast_size', 'crew_size', 'director',
       'is_Animation', 'is_Comedy', 'is_Family', 'is_Adventure', 'is_Fantasy',
       'is_Romance', 'is_Drama', 'is_Action', 'is_Crime', 'is_Thriller',
       'is_Horror', 'is_History', 'is_Science Fiction', 'is_Mystery', 'is_War',
       'is_Foreign', 'is_Music', 'is_Documentary', 'is_Western', 'is_TV Movie',
       'is_english', 'is_Friday', 'is_Holiday'],
      dtype='object')

In [46]:
ft_data.to_csv("../data/engineered_data.csv", index=False)