In [51]:
import os
from glob import glob

import pandas as pd
from sklearn.model_selection import train_test_split

# Show every column when a frame is displayed (no truncation of wide tables).
pd.set_option('display.max_columns', None)

# Collect the paths of all CSV files in the dataset directory.
lst_csv = glob('../tmdb-box-office-prediction/*.csv')

In [52]:
# glob() returns paths in arbitrary, filesystem-dependent order, so positional
# indexing (lst_csv[0], lst_csv[1], ...) can silently swap the frames. Select each
# file by its base name instead; a missing file fails loudly with KeyError.
# NOTE(review): assumes the files are named train.csv / test.csv /
# sample_submission.csv — confirm against the data directory.
csv_by_name = {os.path.basename(path): path for path in lst_csv}

test = pd.read_csv(csv_by_name['test.csv'])
train = pd.read_csv(csv_by_name['train.csv'])
sam_sub = pd.read_csv(csv_by_name['sample_submission.csv'])

In [55]:
# Sanity check: row/column counts of the three frames just loaded.
train.shape, test.shape, sam_sub.shape

((3000, 23), (4398, 22), (4398, 2))

In [3]:
# Separate the target from the features: pop() returns the revenue column and
# removes it from train in one step.
y = train.pop('revenue')

In [6]:
# Peek at the layout of the sample submission (id / revenue).
sam_sub.head()

Unnamed: 0,id,revenue
0,3001,1000000
1,3002,1000000
2,3003,1000000
3,3004,1000000
4,3005,1000000


In [57]:
# Split each frame's columns by dtype: object columns go to the *_cat_features
# lists, every non-object column goes to the *_num_features lists.
train_cat_features = list(train.select_dtypes(include='object').columns)
train_num_features = list(train.select_dtypes(exclude=['object']).columns)

test_cat_features = list(test.select_dtypes(include='object').columns)
test_num_features = list(test.select_dtypes(exclude=['object']).columns)

In [14]:
# Replace missing values in the non-object columns of both frames with 0.
for frame, num_cols in ((train, train_num_features), (test, test_num_features)):
    frame[num_cols] = frame[num_cols].fillna(0)

In [12]:
from sklearn import tree

# Baseline model: an unconstrained regression tree fit on the non-object columns.
# random_state pins the tie-breaking between equally good splits, so re-running
# the notebook reproduces the same tree and therefore the same predictions.
clf = tree.DecisionTreeRegressor(random_state=42)
clf = clf.fit(train[train_num_features], y)

In [60]:
# Display the fitted estimator.
clf

In [15]:
# Score the test rows with the fitted tree, using the test frame's non-object columns.
test_numeric = test[test_num_features]
pred = clf.predict(test_numeric)

In [62]:
# Number of predictions produced for the test frame.
len(pred)

4398

In [16]:
# Overwrite the sample submission's revenue column with the tree's predictions.
sam_sub['revenue'] = pred

In [63]:
# Sanity check: list any submission rows with a negative predicted revenue.
sam_sub.query('revenue < 0')

Unnamed: 0,id,revenue


- data preprocessing

In [67]:
# Reload the raw frames for the preprocessing section. glob() returns paths in
# arbitrary, filesystem-dependent order, so select each file by its base name
# rather than by position in lst_csv; a missing file fails loudly with KeyError.
# NOTE(review): assumes the files are named train.csv / test.csv /
# sample_submission.csv — confirm against the data directory.
csv_by_name = {os.path.basename(path): path for path in lst_csv}

test = pd.read_csv(csv_by_name['test.csv'])
train = pd.read_csv(csv_by_name['train.csv'])
sam_sub = pd.read_csv(csv_by_name['sample_submission.csv'])

In [68]:
# Row/column counts of the freshly reloaded train and test frames.
train.shape, test.shape

((3000, 23), (4398, 22))

In [69]:
# Column names of the training frame.
train.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew', 'revenue'],
      dtype='object')

In [70]:
# Column names of the test frame, for comparison with train.
test.columns

Index(['id', 'belongs_to_collection', 'budget', 'genres', 'homepage',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'runtime', 'spoken_languages',
       'status', 'tagline', 'title', 'Keywords', 'cast', 'crew'],
      dtype='object')

- train과 test의 차이 : revenue

In [71]:
# Count missing values per column in train.
train.isnull().sum()

id                          0
belongs_to_collection    2396
budget                      0
genres                      7
homepage                 2054
imdb_id                     0
original_language           0
original_title              0
overview                    8
popularity                  0
poster_path                 1
production_companies      156
production_countries       55
release_date                0
runtime                     2
spoken_languages           20
status                      0
tagline                   597
title                       0
Keywords                  276
cast                       13
crew                       16
revenue                     0
dtype: int64

In [72]:
# Count missing values per column in test.
test.isnull().sum()

id                          0
belongs_to_collection    3521
budget                      0
genres                     16
homepage                 2978
imdb_id                     0
original_language           0
original_title              0
overview                   14
popularity                  0
poster_path                 1
production_companies      258
production_countries      102
release_date                1
runtime                     4
spoken_languages           42
status                      2
tagline                   863
title                       3
Keywords                  393
cast                       13
crew                       22
dtype: int64

In [73]:
# Pull the revenue column out of train so that train and test share the same columns;
# pop() returns the column and removes it from the frame.
target = train.pop('revenue')

In [74]:
# Stack the training rows (target already removed) on top of the test rows so that
# preprocessing can be applied to both at once, then show the combined shape.
data = pd.concat((train, test), axis=0)
data.shape

(7398, 22)

In [75]:
# Count missing values per column in the combined frame.
data.isnull().sum()

id                          0
belongs_to_collection    5917
budget                      0
genres                     23
homepage                 5032
imdb_id                     0
original_language           0
original_title              0
overview                   22
popularity                  0
poster_path                 2
production_companies      414
production_countries      157
release_date                1
runtime                     6
spoken_languages           62
status                      2
tagline                  1460
title                       3
Keywords                  669
cast                       26
crew                       38
dtype: int64

### data preprocessing의 경우 train set과 test set을 합쳐서 진행하는 것이 편리함.

- 1. imdb_id
    - NaN values: 0
    - But every value is unique (one distinct imdb_id per id)

In [76]:
# Compare the number of distinct id values with the number of distinct imdb_id values.
data['id'].nunique(),data['imdb_id'].nunique()

(7398, 7398)

- 개인 사견 : id 당 imdb_id가 다 다르기 때문에 revenue 예측에 도움이 안될 것으로 판단.
    - But, 언제 다시 사용할 지 모르기 때문에 데이터는 변수에 저장해 살려놓는게 좋지 않을까 판단.

In [77]:
# Set imdb_id aside in its own variable and drop it from the feature frame;
# pop() does both in one step.
# NOTE: the saved Series keeps the row labels data has at this point.
imdb_id = data.pop('imdb_id')

- train: the first 3000 rows
- test: the remaining 4398 rows

In [78]:
# data was built by concatenating train and test, each carrying its own row labels,
# so labels repeat. Renumber the rows 0..n-1 (discarding the old labels) so that
# label lookups such as data[col][0] refer to exactly one row.
data.reset_index(drop=True,inplace=True)

In [79]:
# Inspect the index of the combined frame after the reset.
data.index

RangeIndex(start=0, stop=7398, step=1)

## Dict data preprocessing

In [80]:
# Columns whose cells hold the text of a Python literal (a stringified list of
# dicts); text_to_dict parses exactly these columns.
dict_columns = ['belongs_to_collection', 'genres', 'production_companies',
                'production_countries', 'spoken_languages', 'Keywords', 'cast', 'crew']

In [83]:
# Inspect the belongs_to_collection value of row 0.
data['belongs_to_collection'][0]

"[{'id': 313576, 'name': 'Hot Tub Time Machine Collection', 'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg', 'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]"

In [85]:
import ast

# Demo: literal_eval turns the text of a Python literal into the object it describes.
ast.literal_eval('[21 , 24]')

[21, 24]

In [86]:
def text_to_dict(df, columns=None):
    """Parse stringified Python literals in the given columns, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    columns : iterable of str, optional
        Columns to convert. Defaults to the notebook-level ``dict_columns`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. Each string cell is replaced by the object that
        ``ast.literal_eval`` builds from it, and each missing cell by ``{}``.

    Raises
    ------
    ValueError, SyntaxError
        From ``ast.literal_eval`` when a string cell is not a valid Python literal.

    Notes
    -----
    Cells that are neither strings nor missing (i.e. already parsed by an earlier
    call) are left untouched, so re-running the conversion is a no-op instead of
    an error.
    """
    if columns is None:
        columns = dict_columns

    def _parse(cell):
        if isinstance(cell, str):
            return ast.literal_eval(cell)
        # Guard with is_scalar: pd.isna() on a list returns an array, not a bool.
        if pd.api.types.is_scalar(cell) and pd.isna(cell):
            return {}
        return cell

    for column in columns:
        df[column] = df[column].apply(_parse)
    return df

In [87]:
# Parse the stringified columns. text_to_dict assigns the parsed columns back into
# `data` itself, so the frame is updated even though the return value is only displayed.
text_to_dict(data)

Unnamed: 0,id,belongs_to_collection,budget,genres,homepage,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,runtime,spoken_languages,status,tagline,title,Keywords,cast,crew
0,1,"[{'id': 313576, 'name': 'Hot Tub Time Machine ...",14000000,"[{'id': 35, 'name': 'Comedy'}]",,en,Hot Tub Time Machine 2,"When Lou, who has become the ""father of the In...",6.575393,/tQtWuwvMf0hCc2QR2tkolwl7c3c.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/20/15,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Laws of Space and Time are About to be Vio...,Hot Tub Time Machine 2,"[{'id': 4379, 'name': 'time travel'}, {'id': 9...","[{'cast_id': 4, 'character': 'Lou', 'credit_id...","[{'credit_id': '59ac067c92514107af02c8c8', 'de..."
1,2,"[{'id': 107674, 'name': 'The Princess Diaries ...",40000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,en,The Princess Diaries 2: Royal Engagement,Mia Thermopolis is now a college graduate and ...,8.248895,/w9Z7A0GHEhIp7etpj0vyKOeU1Wx.jpg,"[{'name': 'Walt Disney Pictures', 'id': 2}]","[{'iso_3166_1': 'US', 'name': 'United States o...",8/6/04,113.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It can take a lifetime to find true love; she'...,The Princess Diaries 2: Royal Engagement,"[{'id': 2505, 'name': 'coronation'}, {'id': 42...","[{'cast_id': 1, 'character': 'Mia Thermopolis'...","[{'credit_id': '52fe43fe9251416c7502563d', 'de..."
2,3,{},3300000,"[{'id': 18, 'name': 'Drama'}]",http://sonyclassics.com/whiplash/,en,Whiplash,"Under the direction of a ruthless instructor, ...",64.299990,/lIv1QinFqz4dlp5U4lQ6HaiskOZ.jpg,"[{'name': 'Bold Films', 'id': 2266}, {'name': ...","[{'iso_3166_1': 'US', 'name': 'United States o...",10/10/14,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,"[{'id': 1416, 'name': 'jazz'}, {'id': 1523, 'n...","[{'cast_id': 5, 'character': 'Andrew Neimann',...","[{'credit_id': '54d5356ec3a3683ba0000039', 'de..."
3,4,{},1200000,"[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'n...",http://kahaanithefilm.com/,hi,Kahaani,Vidya Bagchi (Vidya Balan) arrives in Kolkata ...,3.174936,/aTXRaPrWSinhcmCrcfJK17urp3F.jpg,{},"[{'iso_3166_1': 'IN', 'name': 'India'}]",3/9/12,122.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Kahaani,"[{'id': 10092, 'name': 'mystery'}, {'id': 1054...","[{'cast_id': 1, 'character': 'Vidya Bagchi', '...","[{'credit_id': '52fe48779251416c9108d6eb', 'de..."
4,5,{},0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,ko,마린보이,Marine Boy is the story of a former national s...,1.148070,/m22s7zvkVFDU9ir56PiiqIEWFdT.jpg,{},"[{'iso_3166_1': 'KR', 'name': 'South Korea'}]",2/5/09,118.0,"[{'iso_639_1': 'ko', 'name': '한국어/조선말'}]",Released,,Marine Boy,{},"[{'cast_id': 3, 'character': 'Chun-soo', 'cred...","[{'credit_id': '52fe464b9251416c75073b43', 'de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7393,7394,{},42000000,"[{'id': 53, 'name': 'Thriller'}]",,en,Original Sin,A young man is plunged into a life of subterfu...,9.970359,/i8FEQy5IWAqOzXm4uDHy2r3Swym.jpg,"[{'name': 'Intermedia Films', 'id': 763}, {'na...","[{'iso_3166_1': 'FR', 'name': 'France'}, {'iso...",8/3/01,118.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,This is not a love story - it's a story about ...,Original Sin,"[{'id': 515, 'name': 'women'}, {'id': 572, 'na...","[{'cast_id': 17, 'character': 'Julia Russell/B...","[{'credit_id': '52fe4330c3a36847f80412db', 'de..."
7394,7395,"[{'id': 146534, 'name': 'Without a Paddle Coll...",19000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,en,Without a Paddle,"Three friends, whose lives have been drifting ...",6.046516,/oZDbFtTnTwW5GSfyaGFGaYxDBgD.jpg,"[{'name': 'Paramount Pictures', 'id': 4}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",8/20/04,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The call of the wild, the thrill of adventure....",Without a Paddle,"[{'id': 4959, 'name': 'death of a friend'}, {'...","[{'cast_id': 40, 'character': 'Dan Mott', 'cre...","[{'credit_id': '52fe43b29251416c7501a909', 'de..."
7395,7396,{},16000000,"[{'id': 18, 'name': 'Drama'}]",,en,The Verdict,"Frank Galvin is a down-on-his luck lawyer, red...",9.596883,/hh9sIE1PT7Pjq3n2fzHNEHh8Ogq.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",12/8/82,129.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The doctors want to settle, the Church wants t...",The Verdict,"[{'id': 1680, 'name': 'boston'}, {'id': 6148, ...","[{'cast_id': 1, 'character': 'Frank Galvin', '...","[{'credit_id': '52fe448bc3a368484e028c55', 'de..."
7396,7397,{},2000000,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,en,It Follows,"For 19-year-old Jay, fall should be about scho...",20.359336,/4MrwJZr0R9LbyOgZqwLNmtzzxbu.jpg,"[{'name': 'Northern Lights Films', 'id': 8714}...","[{'iso_3166_1': 'US', 'name': 'United States o...",2/4/15,100.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"It doesn't think, it doesn't feel, it doesn't ...",It Follows,"[{'id': 3713, 'name': 'chase'}, {'id': 6152, '...","[{'cast_id': 1, 'character': 'Jay Height', 'cr...","[{'credit_id': '537770b20e0a261431002299', 'de..."


### belongs_to_collection

In [88]:
# Inspect the belongs_to_collection value of row 0.
data['belongs_to_collection'][0]

[{'id': 313576,
  'name': 'Hot Tub Time Machine Collection',
  'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg',
  'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]

- variable 1: len(belongs_to_collection)
- variable 2: the useful fields inside each entry

In [47]:
# Number of collection entries per movie (0 for the empty {} placeholder).
# The cells are already lists or {} here, so len() applies directly. The previous
# pd.isna() guard returned an array for list cells (ambiguous truth value once a
# list has more than one entry) and its fallback stored {} instead of a count.
data['len_belongs'] = data['belongs_to_collection'].apply(len)

In [49]:
# Distribution of the number of collection entries per movie.
data['len_belongs'].value_counts()

0    5917
1    1481
Name: len_belongs, dtype: int64

In [50]:
# Look at row 0 again to decide which fields of an entry are worth keeping.
data['belongs_to_collection'][0]

[{'id': 313576,
  'name': 'Hot Tub Time Machine Collection',
  'poster_path': '/iEhb00TGPucF0b4joM1ieyY026U.jpg',
  'backdrop_path': '/noeTVcgpBiD48fDjFVic1Vz7ope.jpg'}]

useful features : name

In [None]:
# Name of the collection each movie belongs to ('' when it belongs to none).
# Rows without a collection hold the empty {} placeholder, so indexing [0] on them
# raised KeyError, and the element-wise data[col][idx] = ... form is chained
# assignment, which pandas may apply to a temporary copy. Build the column in one pass.
data['belongs_name'] = data['belongs_to_collection'].apply(
    lambda entries: entries[0]['name'] if entries else ''
)

In [98]:
# Name field of the first collection entry of row 0.
data['belongs_to_collection'][0][0]['name']

'Hot Tub Time Machine Collection'

In [99]:
# Probe what pd.isna() returns for the first collection entry (a dict) of row 0.
pd.isna(data['belongs_to_collection'][0][0])

False

In [105]:
def _collection_name(collections):
    """Return the first entry's name, or '' when the cell is the empty {} placeholder."""
    if collections == {}:
        return ''
    return collections[0]['name']


data['belongs_name'] = data['belongs_to_collection'].apply(_collection_name)

In [108]:
# Number of collection entries per movie (0 for the empty {} placeholder).
data['len_belongs'] = data['belongs_to_collection'].map(len)

In [107]:
# How many movies fall into each collection name ('' = no collection).
data['belongs_name'].value_counts()

                                 5917
James Bond Collection              26
Friday the 13th Collection         12
Pokémon Collection                  9
Star Wars Collection                8
                                 ... 
The Little Mermaid Collection       1
Wrong Turn Collection               1
Super Troopers Collection           1
Infernal Affairs Collection         1
Without a Paddle Collection         1
Name: belongs_name, Length: 751, dtype: int64

In [110]:
# Distribution of the number of collection entries per movie.
data['len_belongs'].value_counts()

0    5917
1    1481
Name: len_belongs, dtype: int64

In [111]:
pip install -U pandas-profiling

Collecting pandas-profiling
  Downloading pandas_profiling-3.6.6-py2.py3-none-any.whl (324 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m324.4/324.4 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting ydata-profiling
  Downloading ydata_profiling-4.0.0-py2.py3-none-any.whl (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pydantic<1.11,>=1.8.1
  Downloading pydantic-1.10.4-cp310-cp310-macosx_11_0_arm64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting seaborn<0.13,>=0.10.1
  Using cached seaborn-0.12.2-py3-none-any.whl (293 kB)
Collecting phik<0.13,>=0.11.1
  Downloading phik-0.12.3-cp310-cp310-macosx_11_0_arm64.whl (649 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m649.9/649.9 kB[0m [31m4.7

In [113]:
import pandas as pd
# NOTE(review): imported only for its side effect — presumably this is what makes
# DataFrame.profile_report() available; confirm against the pandas-profiling docs.
import pandas_profiling

# Build and display the profiling report for the combined frame.
data.profile_report()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

