In [146]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from collections import defaultdict
from surprise.model_selection import GridSearchCV
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
import numpy as np
import ast
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [147]:
movies = pd.read_csv('new_movies_missing_vals.csv')
ratings = pd.read_csv('../data/ratings.csv')
links = pd.read_csv('../data/links.csv',dtype=str)
ratings['rating'] = ratings['rating']*2
def convert(id):
    try:
        return str(int(id))
    except:
        return id
movies['tmdbId'] = movies['tmdbId'].apply(convert)
links = links.drop_duplicates(subset='tmdbId', keep='first')
movies = movies.merge(links[['tmdbId','imdbId']], how='inner', on='tmdbId')

In [148]:
movies[['vote_average','vote_count']].head()

Unnamed: 0,vote_average,vote_count
0,7.969,18889.0
1,7.237,10783.0
2,6.5,398.0
3,6.3,173.0
4,6.237,754.0


In [149]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 29 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   movieId                9742 non-null   int64  
 1   tmdbId                 9734 non-null   object 
 2   adult                  9623 non-null   object 
 3   backdrop_path          9513 non-null   object 
 4   belongs_to_collection  2068 non-null   object 
 5   budget                 9623 non-null   float64
 6   genres                 9742 non-null   object 
 7   homepage               2639 non-null   object 
 8   id                     9623 non-null   float64
 9   imdb_id                9622 non-null   object 
 10  origin_country         9623 non-null   object 
 11  original_language      9623 non-null   object 
 12  original_title         9623 non-null   object 
 13  overview               9621 non-null   object 
 14  popularity             9623 non-null   float64
 15  post

In [150]:
missing = movies['vote_average'].isna()
missing

0       False
1       False
2       False
3       False
4       False
        ...  
9737    False
9738    False
9739    False
9740    False
9741    False
Name: vote_average, Length: 9742, dtype: bool

In [151]:
movies[missing][['vote_average','vote_count']]

Unnamed: 0,vote_average,vote_count
624,,
843,,
2141,,
3027,,
3127,,
...,...,...
9603,,
9634,,
9651,,
9690,,


In [152]:
movies[['vote_average','vote_count']].head()

Unnamed: 0,vote_average,vote_count
0,7.969,18889.0
1,7.237,10783.0
2,6.5,398.0
3,6.3,173.0
4,6.237,754.0


In [153]:
movies.isna().sum()

movieId                     0
tmdbId                      8
adult                     119
backdrop_path             229
belongs_to_collection    7674
budget                    119
genres                      0
homepage                 7103
id                        119
imdb_id                   120
origin_country            119
original_language         119
original_title            119
overview                  121
popularity                119
poster_path               124
production_companies      119
production_countries      119
release_date              119
revenue                   119
runtime                   119
spoken_languages          119
status                    119
tagline                  1440
title                       0
video                     119
vote_average              119
vote_count                119
imdbId                      0
dtype: int64

In [154]:
movies[['popularity','vote_count']].head(20)

Unnamed: 0,popularity,vote_count
0,21.4021,18889.0
1,3.0047,10783.0
2,1.4596,398.0
3,1.612,173.0
4,2.0869,754.0
5,15.2539,7646.0
6,4.3758,644.0
7,1.0475,197.0
8,2.1873,746.0
9,7.8439,4025.0


In [155]:
# Step 1: Select relevant numeric columns that correlate with popularity
features = ['popularity', 'vote_average', 'vote_count', 'runtime', 'budget', 'revenue']

# Step 2: Create a working subset
df_median = movies[features].copy()

# Step 3: Fill budget and revenue with zero
df_median[['budget', 'revenue']] = df_median[['budget', 'revenue']].fillna(0)

# Step 4: Fill the rest with median
for col in ['popularity', 'vote_average', 'vote_count', 'runtime']:
    df_median[col] = df_median[col].fillna(df_median[col].median())

# Final imputed DataFrame
imputed_df = df_median


In [156]:
movies[features][missing].head(15)

Unnamed: 0,popularity,vote_average,vote_count,runtime,budget,revenue
624,,,,,,
843,,,,,,
2141,,,,,,
3027,,,,,,
3127,,,,,,
3362,,,,,,
3680,,,,,,
3741,,,,,,
4981,,,,,,
4986,,,,,,


In [157]:
for col in features:
    movies.loc[movies[col].isna(), col] = imputed_df.loc[movies[col].isna(), col]

In [158]:
movies[['imdbId']+features][missing].head(25)

Unnamed: 0,imdbId,popularity,vote_average,vote_count,runtime,budget,revenue
624,113610,1.8512,6.569,471.0,102.0,0.0,0.0
843,113610,1.8512,6.569,471.0,102.0,0.0,0.0
2141,113610,1.8512,6.569,471.0,102.0,0.0,0.0
3027,113610,1.8512,6.569,471.0,102.0,0.0,0.0
3127,100232,1.8512,6.569,471.0,102.0,0.0,0.0
3362,96913,1.8512,6.569,471.0,102.0,0.0,0.0
3680,270933,1.8512,6.569,471.0,102.0,0.0,0.0
3741,81809,1.8512,6.569,471.0,102.0,0.0,0.0
4981,259153,1.8512,6.569,471.0,102.0,0.0,0.0
4986,112130,1.8512,6.569,471.0,102.0,0.0,0.0


In [159]:
movies.isna().sum()

movieId                     0
tmdbId                      8
adult                     119
backdrop_path             229
belongs_to_collection    7674
budget                      0
genres                      0
homepage                 7103
id                        119
imdb_id                   120
origin_country            119
original_language         119
original_title            119
overview                  121
popularity                  0
poster_path               124
production_companies      119
production_countries      119
release_date              119
revenue                     0
runtime                     0
spoken_languages          119
status                    119
tagline                  1440
title                       0
video                     119
vote_average                0
vote_count                  0
imdbId                      0
dtype: int64

In [160]:
movies.to_csv('new_movies.csv', index=False)

In [161]:
cols_of_interset = ['movieId','genres', 'popularity', 'runtime', 'vote_average', 'vote_count']
movie_dataset = movies[cols_of_interset].copy()
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movieId       9742 non-null   int64  
 1   genres        9742 non-null   object 
 2   popularity    9742 non-null   float64
 3   runtime       9742 non-null   float64
 4   vote_average  9742 non-null   float64
 5   vote_count    9742 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 456.8+ KB


In [162]:
movie_dataset.head()

Unnamed: 0,movieId,genres,popularity,runtime,vote_average,vote_count
0,1,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",21.4021,81.0,7.969,18889.0
1,2,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",3.0047,104.0,7.237,10783.0
2,3,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1.4596,101.0,6.5,398.0
3,4,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1.612,127.0,6.3,173.0
4,5,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",2.0869,106.0,6.237,754.0


In [163]:
# Step 1: Safely convert string to list of dicts (if it's stored as a string)
def parse_genres(g):
    if isinstance(g, str):
        try:
            return ast.literal_eval(g)
        except:
            return []
    return g

movies['genres_parsed'] = movies['genres'].apply(parse_genres)

# Step 2: Extract genre names
all_genres = set()
for genre_list in movies['genres_parsed']:
    for genre in genre_list:
        if isinstance(genre, dict) and 'name' in genre:
            all_genres.add(genre['name'])

# Step 3: Get as sorted list
all_genres = sorted(all_genres)
all_genres


['(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'IMAX',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [164]:
all_genres = all_genres[1:]
all_genres

['Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'IMAX',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Science Fiction',
 'TV Movie',
 'Thriller',
 'War',
 'Western']

In [165]:
for genre in all_genres:
    movie_dataset[genre] = movie_dataset['genres'].apply(lambda x: genre in x).astype(int)

In [166]:
movie_dataset.head()

Unnamed: 0,movieId,genres,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western
0,1,"[{'id': 16, 'name': 'Animation'}, {'id': 12, '...",21.4021,81.0,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",3.0047,104.0,7.237,10783.0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1.4596,101.0,6.5,398.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1.612,127.0,6.3,173.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,5,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",2.0869,106.0,6.237,754.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [167]:
movie_dataset = movie_dataset.drop(columns='genres')

In [168]:
movie_dataset

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,3.0047,104.0,7.237,10783.0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,101.0,6.500,398.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
3,4,1.6120,127.0,6.300,173.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,5,2.0869,106.0,6.237,754.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,1.6639,100.0,7.600,93.0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
9738,193583,2.7855,106.0,7.800,408.0,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0
9739,193585,0.3624,96.0,6.806,18.0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
9740,193587,6.0790,90.0,8.123,163.0,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0


In [169]:
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   movieId          9742 non-null   int64  
 1   popularity       9742 non-null   float64
 2   runtime          9742 non-null   float64
 3   vote_average     9742 non-null   float64
 4   vote_count       9742 non-null   float64
 5   Action           9742 non-null   int32  
 6   Adventure        9742 non-null   int32  
 7   Animation        9742 non-null   int32  
 8   Children         9742 non-null   int32  
 9   Comedy           9742 non-null   int32  
 10  Crime            9742 non-null   int32  
 11  Documentary      9742 non-null   int32  
 12  Drama            9742 non-null   int32  
 13  Family           9742 non-null   int32  
 14  Fantasy          9742 non-null   int32  
 15  History          9742 non-null   int32  
 16  Horror           9742 non-null   int32  
 17  IMAX          

In [170]:
movie_dataset.isna().sum()

movieId            0
popularity         0
runtime            0
vote_average       0
vote_count         0
Action             0
Adventure          0
Animation          0
Children           0
Comedy             0
Crime              0
Documentary        0
Drama              0
Family             0
Fantasy            0
History            0
Horror             0
IMAX               0
Music              0
Musical            0
Mystery            0
Romance            0
Sci-Fi             0
Science Fiction    0
TV Movie           0
Thriller           0
War                0
Western            0
dtype: int64

In [171]:
movie_dataset.to_csv('movies_feature_engineered.csv', index=False)

In [172]:
movie_dataset2 = pd.read_csv('movies_feature_engineered.csv')

Extended feature engineering

In [173]:
movies.columns

Index(['movieId', 'tmdbId', 'adult', 'backdrop_path', 'belongs_to_collection',
       'budget', 'genres', 'homepage', 'id', 'imdb_id', 'origin_country',
       'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'production_companies', 'production_countries',
       'release_date', 'revenue', 'runtime', 'spoken_languages', 'status',
       'tagline', 'title', 'video', 'vote_average', 'vote_count', 'imdbId',
       'genres_parsed'],
      dtype='object')

In [174]:
movies[['origin_country']].iloc[0,0]

"['US']"

In [175]:
movies[['production_companies']].iloc[3,0]

"[{'id': 25, 'logo_path': '/qZCc1lty5FzX30aOCVRBLzaVmcp.png', 'name': '20th Century Fox', 'origin_country': 'US'}]"

In [176]:
movies[['release_date']].iloc[3,0]

'1995-12-22'

In [177]:
all_countries = {}

for val in movies['origin_country'].dropna():
    try:
        countries_list = ast.literal_eval(val)
        for country in countries_list:
            if country not in all_countries:
                all_countries[country] = 0
            all_countries[country] += 1
    except Exception as e:
        print(f"Error parsing: {val} -> {e}")

print(all_countries)

{'US': 7507, 'GB': 1105, 'FR': 413, 'CN': 47, 'AU': 138, 'CA': 349, 'IT': 181, 'IR': 13, 'NL': 24, 'DE': 172, 'HK': 120, 'RU': 35, 'BE': 24, 'MK': 1, 'IE': 71, 'TW': 11, 'NZ': 28, 'MX': 36, 'JP': 277, 'CU': 2, 'DK': 66, 'ES': 87, 'RS': 4, 'IN': 25, 'SN': 1, 'AT': 25, 'CZ': 11, 'YU': 3, 'SU': 81, 'SE': 32, 'CH': 28, 'LU': 7, 'BA': 2, 'BR': 22, 'NO': 12, 'ZA': 9, 'BW': 1, 'AE': 6, 'HU': 7, 'TR': 6, 'AR': 14, 'PL': 12, 'LB': 1, 'BT': 1, 'BG': 5, 'CI': 1, 'KP': 1, 'NP': 1, 'HT': 1, 'FI': 18, 'CO': 3, 'DZ': 2, 'GR': 2, 'JM': 1, 'IL': 5, 'KR': 41, 'IS': 3, 'SG': 4, 'AF': 1, 'TH': 5, 'TJ': 1, 'UZ': 1, 'QA': 1, 'XC': 4, 'MN': 2, 'RO': 5, 'PS': 1, 'LI': 1, 'PT': 3, 'EE': 2, 'AN': 1, 'AW': 1, 'PH': 1, 'ID': 4, 'SA': 1, 'CL': 3, 'MY': 1, 'UA': 1, 'KH': 1, 'JO': 1}


In [178]:
sorted_countries = dict(sorted(all_countries.items(), key=lambda x: x[1], reverse=True))
print(sorted_countries)

{'US': 7507, 'GB': 1105, 'FR': 413, 'CA': 349, 'JP': 277, 'IT': 181, 'DE': 172, 'AU': 138, 'HK': 120, 'ES': 87, 'SU': 81, 'IE': 71, 'DK': 66, 'CN': 47, 'KR': 41, 'MX': 36, 'RU': 35, 'SE': 32, 'NZ': 28, 'CH': 28, 'IN': 25, 'AT': 25, 'NL': 24, 'BE': 24, 'BR': 22, 'FI': 18, 'AR': 14, 'IR': 13, 'NO': 12, 'PL': 12, 'TW': 11, 'CZ': 11, 'ZA': 9, 'LU': 7, 'HU': 7, 'AE': 6, 'TR': 6, 'BG': 5, 'IL': 5, 'TH': 5, 'RO': 5, 'RS': 4, 'SG': 4, 'XC': 4, 'ID': 4, 'YU': 3, 'CO': 3, 'IS': 3, 'PT': 3, 'CL': 3, 'CU': 2, 'BA': 2, 'DZ': 2, 'GR': 2, 'MN': 2, 'EE': 2, 'MK': 1, 'SN': 1, 'BW': 1, 'LB': 1, 'BT': 1, 'CI': 1, 'KP': 1, 'NP': 1, 'HT': 1, 'JM': 1, 'AF': 1, 'TJ': 1, 'UZ': 1, 'QA': 1, 'PS': 1, 'LI': 1, 'AN': 1, 'AW': 1, 'PH': 1, 'SA': 1, 'MY': 1, 'UA': 1, 'KH': 1, 'JO': 1}


In [179]:
top_countries = [k for k,v in sorted_countries.items() if v >= 20]
print(top_countries)
print(len(top_countries))

['US', 'GB', 'FR', 'CA', 'JP', 'IT', 'DE', 'AU', 'HK', 'ES', 'SU', 'IE', 'DK', 'CN', 'KR', 'MX', 'RU', 'SE', 'NZ', 'CH', 'IN', 'AT', 'NL', 'BE', 'BR']
25


In [180]:
all_companies = {}

for val in movies['production_companies'].dropna():
    try:
        companies_dict = ast.literal_eval(val)
        for company in companies_dict:
            if company['name'] not in companies_dict:
                all_companies[company['name']] = 1
            else:
                all_companies[company['name']] += 1
    except Exception as e:
        print(f"Error parsing: {val} -> {e}")

unique_companies = sorted([company for company in all_companies.keys()])
print(unique_companies)
print(len(unique_companies))

['"DIA" Productions GmbH & Co. KG', '(Colossal) Pictures', '.406 Production', '10 by 10 Entertainment', '100 Bares', '100% Film & Photography', '101st Street Films', '108 Media', '10th Hole Productions', '12 Gauge Productions', '120 Films', '120dB Films', '129 Productions', '1492 Pictures', '16 Block Productions', '1600 Limited Partnership', '16:14 Entertainment', '1821 Pictures', '19 Entertainment', '1978 Films', '1984 Private Defense Contractors', '2 Entertain', '2 Loop Films', '2.4.7. Films', '2003 Productions', '2008NY5', '20th Century Fox', '20th Century Fox Animation', '20th Century Fox Brazil', '20th Century Fox Home Entertainment', '20th Century Fox Korea', '20th Century Fox Television', '21 Century Shengkai Film', '21 Laps Entertainment', '21st Century Film Corporation', '22nd & Indiana Pictures', '23/5 Filmproduktion', '235 Films', '25th Hour Productions', '26 Films', '27 Films Production', '2929 Entertainment', '2929 Productions', '2D Celluloid', '2DUX²', '2S Films', '2d3D A

In [181]:
all_companies

{'Pixar': 1,
 'TriStar Pictures': 1,
 'Interscope Communications': 1,
 'Teitler Film': 1,
 'PolyGram Filmed Entertainment': 1,
 'Lancaster Gate': 1,
 'Warner Bros. Pictures': 1,
 '20th Century Fox': 1,
 'Touchstone Pictures': 1,
 'Sandollar Productions': 1,
 'Regency Enterprises': 1,
 'Forward Pass': 1,
 'Paramount Pictures': 1,
 'Constellation Films': 1,
 'Mirage Enterprises': 1,
 'Scott Rudin Productions': 1,
 'Worldwide Productions': 1,
 'Mont Blanc Entertainment GmbH': 1,
 'Walt Disney Pictures': 1,
 'Painted Fence Productions': 1,
 'Shattered Productions': 1,
 'Universal Pictures': 1,
 'Imperial Entertainment': 1,
 'Signature Entertainment': 1,
 'EON Productions': 1,
 'Castle Rock Entertainment': 1,
 'Wildwood Enterprises': 1,
 'Columbia Pictures': 1,
 'Brooksfilms': 1,
 'Enigma Pictures': 1,
 'Amblin Entertainment': 1,
 'Amblimation': 1,
 'Cinergi Pictures': 1,
 'Hollywood Pictures': 1,
 'Illusion Entertainment Group': 1,
 'Carolco Pictures': 1,
 'Forge': 1,
 'Laurence Mark Produ

In [182]:
[c for c in unique_companies if (c[0]=='P' or c[0]=='p') and (c[1]=='I' or c[1]=='i') and (c[2]=='X' or c[2]=='x')]

['Pixar', 'Pixibox', 'Pixoloid Studios']

In [183]:
[c for c in unique_companies if 'fox' in c.lower()]

['20th Century Fox',
 '20th Century Fox Animation',
 '20th Century Fox Brazil',
 '20th Century Fox Home Entertainment',
 '20th Century Fox Korea',
 '20th Century Fox Television',
 'Canafox Films',
 'Fox 2000 Pictures',
 'Fox 21 Television Studios',
 'Fox Animation Studios',
 'Fox Atomic',
 'Fox Family Films',
 'Fox Film Corporation',
 'Fox Films Ltd.',
 'Fox International Productions',
 'Fox International Productions Japan',
 'Fox International Productions Korea',
 'Fox Searchlight Pictures',
 'Fox Television Animation',
 'Fox Television Studios',
 'Fox West Pictures',
 'Foxton Entertainment',
 'Foxtor Productions',
 'Robert Fox Productions',
 'White Fox']

In [184]:
known_studios = ['pixar', 'marvel', 'disney', 'lucasfilm', 'paramount', 
                 'universal pictures', 'dreamworks', 'lionsgate', 'sony', 'columbia pictures',
                 'tristar', 'new line', 'warner bros', 'miramax', 'a24', 
                 'studiocanal', 'pathé', 'toho', 'ghibli', 'annapurna', 'orion', 'mgm','century fox']


In [185]:
company_dict_with_keyword = {}
for studio in known_studios:
    company_dict_with_keyword[studio] = [c for c in unique_companies if studio in c.lower()]

In [186]:
[(k,(len(v))) for k,v in company_dict_with_keyword.items()]

[('pixar', 1),
 ('marvel', 9),
 ('disney', 14),
 ('lucasfilm', 2),
 ('paramount', 9),
 ('universal pictures', 5),
 ('dreamworks', 3),
 ('lionsgate', 4),
 ('sony', 14),
 ('columbia pictures', 2),
 ('tristar', 6),
 ('new line', 2),
 ('warner bros', 12),
 ('miramax', 2),
 ('a24', 1),
 ('studiocanal', 1),
 ('pathé', 13),
 ('toho', 6),
 ('ghibli', 1),
 ('annapurna', 2),
 ('orion', 6),
 ('mgm', 3),
 ('century fox', 6)]

In [187]:
for k,v in company_dict_with_keyword.items():
    print(k)
    print(v)
    print('-'*20)

pixar
['Pixar']
--------------------
marvel
['Marvel Animation', 'Marvel Characters', 'Marvel Enterprises', 'Marvel Entertainment', 'Marvel Entertainment Group', 'Marvel Films', 'Marvel Knights', 'Marvel Productions', 'Marvel Studios']
--------------------
disney
['Disney Channel', 'Disney Television Animation', 'DisneyToon Studios', 'Disneynature', 'The Walt Disney Company (Japan)', 'The Walt Disney Studios', 'Walt Disney Animation', 'Walt Disney Animation Studios', 'Walt Disney Feature Animation', 'Walt Disney Home Video', 'Walt Disney Pictures', 'Walt Disney Productions', 'Walt Disney Studio', 'Walt Disney Television']
--------------------
lucasfilm
['Lucasfilm Animation', 'Lucasfilm Ltd.']
--------------------
paramount
['Paramount Classics', 'Paramount Famous Productions', 'Paramount Home Entertainment', 'Paramount Pictures', 'Paramount Pictures Canada', 'Paramount Pictures Digital Entertainment', 'Paramount Television', 'Paramount Television Studios', 'Paramount Vantage']
-------

In [188]:
def extract_company_names(cell):
    if isinstance(cell, str) and cell.startswith("[{"):
        try:
            parsed = ast.literal_eval(cell)
            return [d['name'].lower() for d in parsed if 'name' in d]
        except Exception as e:
            return []
    return []

# Create a helper column with list of lowercase company names
movie_dataset['parsed_companies'] = movies['production_companies'].apply(extract_company_names)

# Now create binary columns for known studios
for studio in known_studios:
    studio_lower = studio.lower()
    movie_dataset[studio] = movie_dataset['parsed_companies'].apply(
        lambda comps: int(any(studio_lower in c for c in comps))
    )


In [189]:
movie_dataset['pixar_or_disney'] = ((movie_dataset['pixar'] == 1) | (movie_dataset['disney'] == 1)).astype(int)

In [190]:
movie_dataset.head(5)

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,parsed_companies,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[pixar],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,2,3.0047,104.0,7.237,10783.0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,"[tristar pictures, interscope communications, ...",0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,101.0,6.5,398.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,"[lancaster gate, warner bros. pictures]",0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,4,1.612,127.0,6.3,173.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,[20th century fox],0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,5,2.0869,106.0,6.237,754.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[touchstone pictures, sandollar productions]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [191]:
for country in top_countries:
    movie_dataset[country] = movies['origin_country'].apply(
        lambda x: int(country in x) if isinstance(x, str) else 0
    )


In [192]:
movie_dataset[movie_dataset['pixar']==1].head()

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,parsed_companies,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,US,GB,FR,CA,JP,IT,DE,AU,HK,ES,SU,IE,DK,CN,KR,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[pixar],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1757,2355,12.1886,95.0,6.972,9350.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"[walt disney pictures, pixar]",1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2355,3114,19.3849,92.0,7.599,14185.0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[pixar],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3568,4886,19.9898,92.0,7.844,18897.0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[pixar],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4360,6377,15.8391,100.0,7.816,19702.0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,[pixar],1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [193]:
movie_dataset = movie_dataset.drop(columns='parsed_companies')

In [194]:
movie_dataset.head()

Unnamed: 0,movieId,popularity,runtime,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,US,GB,FR,CA,JP,IT,DE,AU,HK,ES,SU,IE,DK,CN,KR,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR
0,1,21.4021,81.0,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,3.0047,104.0,7.237,10783.0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,101.0,6.5,398.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1.612,127.0,6.3,173.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2.0869,106.0,6.237,754.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [195]:
movie_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 77 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieId             9742 non-null   int64  
 1   popularity          9742 non-null   float64
 2   runtime             9742 non-null   float64
 3   vote_average        9742 non-null   float64
 4   vote_count          9742 non-null   float64
 5   Action              9742 non-null   int32  
 6   Adventure           9742 non-null   int32  
 7   Animation           9742 non-null   int32  
 8   Children            9742 non-null   int32  
 9   Comedy              9742 non-null   int32  
 10  Crime               9742 non-null   int32  
 11  Documentary         9742 non-null   int32  
 12  Drama               9742 non-null   int32  
 13  Family              9742 non-null   int32  
 14  Fantasy             9742 non-null   int32  
 15  History             9742 non-null   int32  
 16  Horror

In [196]:
movie_dataset.to_csv('movies_feature_engineered.csv',index=False)

In [None]:
movie_dataset['runtime'].max(),movie_dataset['runtime'].min()

(583.0, 2.0)

In [None]:
def ohe_num(df,col,start,end,step):
    df = df.copy()
    bins = [r for r in range(start,end+1,step)]
    labels = [f"{col}_{start}_{start+step}" for start in bins[:-1]]
    df[f'{col}_bin'] = pd.cut(
        df[col],
        bins=bins,
        labels=labels,
        right=False,
    )
    one_hot = pd.get_dummies(df[f'{col}_bin'])
    df = pd.concat([df, one_hot.reindex(df.index).fillna(0).astype(int)], axis=1)
    df = df.drop(columns=[col,f'{col}_bin'])
    return df

In [None]:
movie_dataset = ohe_num(movie_dataset,'runtime',0,600,30)

In [None]:
movie_dataset.head()

Unnamed: 0,movieId,popularity,vote_average,vote_count,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,US,GB,FR,CA,JP,IT,DE,AU,HK,ES,SU,IE,DK,CN,KR,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR,runtime_0_30,runtime_30_60,runtime_60_90,runtime_90_120,runtime_120_150,runtime_150_180,runtime_180_210,runtime_210_240,runtime_240_270,runtime_270_300,runtime_300_330,runtime_330_360,runtime_360_390,runtime_390_420,runtime_420_450,runtime_450_480,runtime_480_510,runtime_510_540,runtime_540_570,runtime_570_600
0,1,21.4021,7.969,18889.0,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,3.0047,7.237,10783.0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,3,1.4596,6.5,398.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,4,1.612,6.3,173.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5,2.0869,6.237,754.0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
movie_dataset['vote_count'].max(),movie_dataset['vote_count'].min()

(37552.0, 0.0)

In [None]:
movie_dataset = ohe_num(movie_dataset,'vote_count',0,40000,5000)

In [None]:
movie_dataset

Unnamed: 0,movieId,popularity,vote_average,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,...,CA,JP,IT,DE,AU,HK,ES,SU,IE,DK,CN,KR,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR,runtime_0_30,runtime_30_60,runtime_60_90,runtime_90_120,runtime_120_150,runtime_150_180,runtime_180_210,runtime_210_240,runtime_240_270,runtime_270_300,runtime_300_330,runtime_330_360,runtime_360_390,runtime_390_420,runtime_420_450,runtime_450_480,runtime_480_510,runtime_510_540,runtime_540_570,runtime_570_600,vote_count_0_5000,vote_count_5000_10000,vote_count_10000_15000,vote_count_15000_20000,vote_count_20000_25000,vote_count_25000_30000,vote_count_30000_35000,vote_count_35000_40000
0,1,21.4021,7.969,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,2,3.0047,7.237,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,3,1.4596,6.500,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,4,1.6120,6.300,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,5,2.0869,6.237,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,193581,1.6639,7.600,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9738,193583,2.7855,7.800,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9739,193585,0.3624,6.806,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
9740,193587,6.0790,8.123,1,0,1,0,1,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


In [None]:
movie_dataset['vote_average'].max(),movie_dataset['vote_average'].min()

(8.941, 0.0)

In [None]:
movie_dataset = ohe_num(movie_dataset,'vote_average',0,9,1)

In [None]:
movie_dataset.head()

Unnamed: 0,movieId,popularity,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,US,...,DK,CN,KR,MX,RU,SE,NZ,CH,IN,AT,NL,BE,BR,runtime_0_30,runtime_30_60,runtime_60_90,runtime_90_120,runtime_120_150,runtime_150_180,runtime_180_210,runtime_210_240,runtime_240_270,runtime_270_300,runtime_300_330,runtime_330_360,runtime_360_390,runtime_390_420,runtime_420_450,runtime_450_480,runtime_480_510,runtime_510_540,runtime_540_570,runtime_570_600,vote_count_0_5000,vote_count_5000_10000,vote_count_10000_15000,vote_count_15000_20000,vote_count_20000_25000,vote_count_25000_30000,vote_count_30000_35000,vote_count_35000_40000,vote_average_0_1,vote_average_1_2,vote_average_2_3,vote_average_3_4,vote_average_4_5,vote_average_5_6,vote_average_6_7,vote_average_7_8,vote_average_8_9
0,1,21.4021,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
1,2,3.0047,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,3,1.4596,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,1.612,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,5,2.0869,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [None]:
movie_dataset['popularity'].max(),movie_dataset['popularity'].min()

(76.235, 0.0042)

In [None]:
movie_dataset = ohe_num(movie_dataset,'popularity',0,22,2)

In [None]:
movie_dataset.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Family,Fantasy,History,Horror,IMAX,Music,Musical,Mystery,Romance,Sci-Fi,Science Fiction,TV Movie,Thriller,War,Western,pixar,marvel,disney,lucasfilm,paramount,universal pictures,dreamworks,lionsgate,sony,columbia pictures,tristar,new line,warner bros,miramax,a24,studiocanal,pathé,toho,ghibli,annapurna,orion,mgm,century fox,pixar_or_disney,US,GB,...,BE,BR,runtime_0_30,runtime_30_60,runtime_60_90,runtime_90_120,runtime_120_150,runtime_150_180,runtime_180_210,runtime_210_240,runtime_240_270,runtime_270_300,runtime_300_330,runtime_330_360,runtime_360_390,runtime_390_420,runtime_420_450,runtime_450_480,runtime_480_510,runtime_510_540,runtime_540_570,runtime_570_600,vote_count_0_5000,vote_count_5000_10000,vote_count_10000_15000,vote_count_15000_20000,vote_count_20000_25000,vote_count_25000_30000,vote_count_30000_35000,vote_count_35000_40000,vote_average_0_1,vote_average_1_2,vote_average_2_3,vote_average_3_4,vote_average_4_5,vote_average_5_6,vote_average_6_7,vote_average_7_8,vote_average_8_9,popularity_0_2,popularity_2_4,popularity_4_6,popularity_6_8,popularity_8_10,popularity_10_12,popularity_12_14,popularity_14_16,popularity_16_18,popularity_18_20,popularity_20_22
0,1,0,1,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,2,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0


In [None]:
movie_dataset.to_csv('movies_feature_engineered.csv',index=False)