# PREPROCESSING THE DATASET

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load the TMDB movies dataset into the working frame `df`.
# The path is relative, so the CSV must be in the notebook's working directory.
df = pd.read_csv('tmdb_movies_data.csv')

In [10]:
# Size of the freshly loaded frame as (rows, columns).
df.shape

(10866, 21)

In [3]:
# Preview the first five rows to get a feel for the columns and their contents.
df.head(n=5)


Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0
3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,...,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,12/15/2015,5292,7.5,2015,183999919.0,1902723000.0
4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,...,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,4/1/2015,2947,7.3,2015,174799923.1,1385749000.0


# CHECKING THE MISSING VALUES

In [4]:
# Count the nulls in every column, then add the per-column counts up
# to get the number of missing cells in the whole frame.
missing_values = df.isna().sum()
total_missing_values = missing_values.sum()

# Report the per-column breakdown first, followed by the overall total.
print(missing_values)
print('Total missing values in training data:', total_missing_values)

id                         0
imdb_id                   10
popularity                 0
budget                     0
revenue                    0
original_title             0
cast                      76
homepage                7930
director                  44
tagline                 2824
keywords                1493
overview                   4
runtime                    0
genres                    23
production_companies    1030
release_date               0
vote_count                 0
vote_average               0
release_year               0
budget_adj                 0
revenue_adj                0
dtype: int64
Total missing values in training data: 13434


# DROPPING IRRELEVANT FEATURES TO SIMPLIFY THE DATASET

In [5]:
# Columns considered irrelevant for the later steps; they are removed
# to simplify the dataset.
features_to_drop = [
    'cast',
    'homepage',
    'tagline',
    'original_title',
    'keywords',
    'overview',
    'director',
    'imdb_id',
    'production_companies',
]

# Remove the listed columns by label and preview the slimmer frame.
df = df.drop(columns=features_to_drop)
print(df.head())

       id  popularity     budget     revenue  runtime  \
0  135397   32.985763  150000000  1513528810      124   
1   76341   28.419936  150000000   378436354      120   
2  262500   13.112507  110000000   295238201      119   
3  140607   11.173104  200000000  2068178225      136   
4  168259    9.335014  190000000  1506249360      137   

                                      genres release_date  vote_count  \
0  Action|Adventure|Science Fiction|Thriller     6/9/2015        5562   
1  Action|Adventure|Science Fiction|Thriller    5/13/2015        6185   
2         Adventure|Science Fiction|Thriller    3/18/2015        2480   
3   Action|Adventure|Science Fiction|Fantasy   12/15/2015        5292   
4                      Action|Crime|Thriller     4/1/2015        2947   

   vote_average  release_year   budget_adj   revenue_adj  
0           6.5          2015  137999939.3  1.392446e+09  
1           7.1          2015  137999939.3  3.481613e+08  
2           6.3          2015  101199955.

In [6]:
# Re-count the nulls per column now that the dropped columns are gone.
missing_values = df.isna().sum()
print(missing_values)

id               0
popularity       0
budget           0
revenue          0
runtime          0
genres          23
release_date     0
vote_count       0
vote_average     0
release_year     0
budget_adj       0
revenue_adj      0
dtype: int64


# DROPPING DATAPOINTS WITH MISSING VALUES

In [7]:
# Discard every row (axis=0) that has a null in any of its columns.
df = df.dropna(axis=0, how='any')

In [8]:
# Confirm that no nulls remain, then show the frame's (rows, columns)
# so the number of rows removed by the drop is visible.
missing_values = df.isna().sum()
print(missing_values, df.shape, sep='\n')

id              0
popularity      0
budget          0
revenue         0
runtime         0
genres          0
release_date    0
vote_count      0
vote_average    0
release_year    0
budget_adj      0
revenue_adj     0
dtype: int64
(10843, 12)


# ANALYSING THE DATATYPE OF EACH FEATURE TO DECIDE THE ENCODING TECHNIQUE

In [184]:
# List each column's dtype under a heading; the object-typed columns are the
# ones that still need to be encoded.
print("Data Types of Each Feature:", df.dtypes, sep="\n")

Data Types of Each Feature:
id                int64
popularity      float64
budget            int64
revenue           int64
runtime           int64
genres           object
release_date     object
vote_count        int64
vote_average    float64
release_year      int64
budget_adj      float64
revenue_adj     float64
dtype: object


# ENCODING THE FEATURES

[1] GENRES

In [187]:
# Keep only the leading genre of each pipe-separated 'genres' string as
# 'first_genre', then discard the original multi-valued column.
df = (
    df
    .assign(first_genre=df['genres'].str.split('|').str.get(0))
    .drop(columns='genres')
)

# Preview the frame with the new column in place.
print("\nUpdated Dataset with First Genre:")
print(df.head())



Updated Dataset with First Genre:
       id  popularity     budget     revenue  runtime release_date  \
0  135397   32.985763  150000000  1513528810      124     6/9/2015   
1   76341   28.419936  150000000   378436354      120    5/13/2015   
2  262500   13.112507  110000000   295238201      119    3/18/2015   
3  140607   11.173104  200000000  2068178225      136   12/15/2015   
4  168259    9.335014  190000000  1506249360      137     4/1/2015   

   vote_count  vote_average  release_year   budget_adj   revenue_adj  \
0        5562           6.5          2015  137999939.3  1.392446e+09   
1        6185           7.1          2015  137999939.3  3.481613e+08   
2        2480           6.3          2015  101199955.5  2.716190e+08   
3        5292           7.5          2015  183999919.0  1.902723e+09   
4        2947           7.3          2015  174799923.1  1.385749e+09   

  first_genre  
0      Action  
1      Action  
2   Adventure  
3      Action  
4      Action  


In [188]:
# Number of distinct (non-null) values in each column.
# NOTE(review): the stored output lists 10842 distinct ids for a frame of
# 10843 rows, so one id appears twice — check for a duplicated record.
unique_values = df.nunique(axis=0, dropna=True)
print(unique_values)

id              10842
popularity      10791
budget            557
revenue          4702
runtime           246
release_date     5904
vote_count       1289
vote_average       72
release_year       56
budget_adj       2613
revenue_adj      4840
first_genre        20
dtype: int64


In [189]:
# Turn the 'first_genre' strings into integer codes stored as 'genre_encoded',
# then remove the string column. The fitted encoder stays in `label_encoder`.
# NOTE(review): the integer codes may be read as an ordering of genres by
# downstream models — confirm that this is acceptable.
label_encoder = LabelEncoder()
df = (
    df
    .assign(genre_encoded=label_encoder.fit_transform(df['first_genre']))
    .drop(columns='first_genre')
)

# Show the resulting frame.
print(df)


           id  popularity     budget     revenue  runtime release_date  \
0      135397   32.985763  150000000  1513528810      124     6/9/2015   
1       76341   28.419936  150000000   378436354      120    5/13/2015   
2      262500   13.112507  110000000   295238201      119    3/18/2015   
3      140607   11.173104  200000000  2068178225      136   12/15/2015   
4      168259    9.335014  190000000  1506249360      137     4/1/2015   
...       ...         ...        ...         ...      ...          ...   
10861      21    0.080598          0           0       95    6/15/1966   
10862   20379    0.065543          0           0      176   12/21/1966   
10863   39768    0.065141          0           0       94     1/1/1966   
10864   21449    0.064317          0           0       80    11/2/1966   
10865   22293    0.035919      19000           0       74   11/15/1966   

       vote_count  vote_average  release_year    budget_adj   revenue_adj  \
0            5562           6.5   

In [190]:
# Preview the first rows of the frame, under a heading, after label encoding.
print("\nUpdated DataFrame with Label Encoding:", df.head(), sep="\n")


Updated DataFrame with Label Encoding:
       id  popularity     budget     revenue  runtime release_date  \
0  135397   32.985763  150000000  1513528810      124     6/9/2015   
1   76341   28.419936  150000000   378436354      120    5/13/2015   
2  262500   13.112507  110000000   295238201      119    3/18/2015   
3  140607   11.173104  200000000  2068178225      136   12/15/2015   
4  168259    9.335014  190000000  1506249360      137     4/1/2015   

   vote_count  vote_average  release_year   budget_adj   revenue_adj  \
0        5562           6.5          2015  137999939.3  1.392446e+09   
1        6185           7.1          2015  137999939.3  3.481613e+08   
2        2480           6.3          2015  101199955.5  2.716190e+08   
3        5292           7.5          2015  183999919.0  1.902723e+09   
4        2947           7.3          2015  174799923.1  1.385749e+09   

   genre_encoded  
0              0  
1              0  
2              1  
3              0  
4          

[2] MONTH OF RELEASE

In [191]:
# Extract the release month from 'release_date'. The dates are 'M/D/YYYY'
# strings (e.g. '6/9/2015'), so the month is the text before the first '/'.
# It is cast to int immediately so the new column is numeric.
df['month of release'] = df['release_date'].str.split('/').str[0].astype(int)

# Drop the original 'release_date' column; only the month is kept from it
# (the year is already available in 'release_year').
df = df.drop('release_date', axis=1)

# Guard against dates that are not month-first: such a row would otherwise
# end up with a silently wrong "month".
assert df['month of release'].between(1, 12).all(), \
    "release_date produced a month outside 1-12"

# Display the updated DataFrame
print("\nUpdated Dataset with Month of Release:")
print(df.head())


Updated Dataset with Month of Release:
       id  popularity     budget     revenue  runtime  vote_count  \
0  135397   32.985763  150000000  1513528810      124        5562   
1   76341   28.419936  150000000   378436354      120        6185   
2  262500   13.112507  110000000   295238201      119        2480   
3  140607   11.173104  200000000  2068178225      136        5292   
4  168259    9.335014  190000000  1506249360      137        2947   

   vote_average  release_year   budget_adj   revenue_adj  genre_encoded  \
0           6.5          2015  137999939.3  1.392446e+09              0   
1           7.1          2015  137999939.3  3.481613e+08              0   
2           6.3          2015  101199955.5  2.716190e+08              1   
3           7.5          2015  183999919.0  1.902723e+09              0   
4           7.3          2015  174799923.1  1.385749e+09              0   

   month of release  
0                 6  
1                 5  
2                 3  
3     

In [192]:
# Data types of each column after encoding; the stored output shows that no
# object-typed columns remain.
print(df.dtypes)

id                    int64
popularity          float64
budget                int64
revenue               int64
runtime               int64
vote_count            int64
vote_average        float64
release_year          int64
budget_adj          float64
revenue_adj         float64
genre_encoded         int32
month of release      int32
dtype: object


In [194]:
# Size of the preprocessed frame as (rows, columns).
df.shape

(10843, 12)

In [193]:
# Save the preprocessed dataset to a CSV file (relative path, so it is written
# to the working directory). index=False keeps the row index out of the file.
df.to_csv('tmdb_movies_data_preprocessed.csv', index=False)