In [1]:
# Import the dependencies
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib import pyplot as plt
import seaborn as sns
import random
%matplotlib inline

In [2]:
# Load the TMDB movie listing from the gzip-compressed CSV and preview it.
tmdb_path = '../zippedData/tmdb.movies.csv.gz'
df = pd.read_csv(tmdb_path)
df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [3]:
# Summarise column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         26517 non-null  int64  
 1   genre_ids          26517 non-null  object 
 2   id                 26517 non-null  int64  
 3   original_language  26517 non-null  object 
 4   original_title     26517 non-null  object 
 5   popularity         26517 non-null  float64
 6   release_date       26517 non-null  object 
 7   title              26517 non-null  object 
 8   vote_average       26517 non-null  float64
 9   vote_count         26517 non-null  int64  
dtypes: float64(2), int64(3), object(5)
memory usage: 2.0+ MB


In [4]:
# Parse the 'release_date' strings into pandas datetimes, then preview the result.
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df.head()

Unnamed: 0.1,Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [5]:
# Re-run info() to confirm 'release_date' now has a datetime dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   Unnamed: 0         26517 non-null  int64         
 1   genre_ids          26517 non-null  object        
 2   id                 26517 non-null  int64         
 3   original_language  26517 non-null  object        
 4   original_title     26517 non-null  object        
 5   popularity         26517 non-null  float64       
 6   release_date       26517 non-null  datetime64[ns]
 7   title              26517 non-null  object        
 8   vote_average       26517 non-null  float64       
 9   vote_count         26517 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(3), object(4)
memory usage: 2.0+ MB


In [6]:
# Drop the 'Unnamed: 0' column before checking for duplicate rows: it holds a
# distinct value per row in the preview above (it appears to be a saved row
# index), so it would stop otherwise identical rows from matching.
# errors='ignore' keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')
df.head()

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
0,"[12, 14, 10751]",12444,en,Harry Potter and the Deathly Hallows: Part 1,33.533,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,"[14, 12, 16, 10751]",10191,en,How to Train Your Dragon,28.734,2010-03-26,How to Train Your Dragon,7.7,7610
2,"[12, 28, 878]",10138,en,Iron Man 2,28.515,2010-05-07,Iron Man 2,6.8,12368
3,"[16, 35, 10751]",862,en,Toy Story,28.005,1995-11-22,Toy Story,7.9,10174
4,"[28, 878, 12]",27205,en,Inception,27.92,2010-07-16,Inception,8.3,22186


In [7]:
# Summary statistics for the numeric columns (taken before duplicate rows are removed).
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,26517.0,26517.0,26517.0,26517.0
mean,295050.15326,3.130912,5.991281,194.224837
std,153661.615648,4.355229,1.852946,960.961095
min,27.0,0.6,0.0,1.0
25%,157851.0,0.6,5.0,2.0
50%,309581.0,1.374,6.0,5.0
75%,419542.0,3.694,7.0,28.0
max,608444.0,80.773,10.0,22186.0


In [8]:
# Collect every row that has an exact duplicate (keep=False flags all copies,
# not only the later ones), ordered by all columns so matching rows sit together.
is_duplicated = df.duplicated(keep=False)
duplicate_rows = df.loc[is_duplicated].sort_values(by=df.columns.tolist())
duplicate_rows

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
14538,"[10402, 10749, 35, 18]",207936,en,Tumbledown,7.004,2015-04-18,Tumbledown,6.4,88
17813,"[10402, 10749, 35, 18]",207936,en,Tumbledown,7.004,2015-04-18,Tumbledown,6.4,88
10532,"[10402, 18, 10749, 35]",350331,en,B-Side,0.600,2015-07-21,B-Side,5.8,2
17037,"[10402, 18, 10749, 35]",350331,en,B-Side,0.600,2015-07-21,B-Side,5.8,2
14755,"[10402, 18, 80, 35]",340275,en,Chi-Raq,5.310,2015-12-04,Chi-Raq,5.7,95
...,...,...,...,...,...,...,...,...,...
23294,[],483720,en,Bunker77,0.600,2017-10-27,Bunker77,8.0,1
19987,[],495988,en,Calla Lily,0.600,2016-05-22,Calla Lily,8.0,1
23280,[],495988,en,Calla Lily,0.600,2016-05-22,Calla Lily,8.0,1
22365,[],521494,en,Transformers: Titans Return,1.180,2017-11-14,Transformers: Titans Return,4.5,2


In [9]:
# Drop exact duplicate rows, keeping the first occurrence of each.
# Rebinding `df` instead of using inplace=True avoids mutating the frame
# behind the reader's back; the original index labels are retained.
df = df.drop_duplicates()

In [10]:
# Check the row count and dtypes after removing duplicate rows.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25497 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   genre_ids          25497 non-null  object        
 1   id                 25497 non-null  int64         
 2   original_language  25497 non-null  object        
 3   original_title     25497 non-null  object        
 4   popularity         25497 non-null  float64       
 5   release_date       25497 non-null  datetime64[ns]
 6   title              25497 non-null  object        
 7   vote_average       25497 non-null  float64       
 8   vote_count         25497 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 1.9+ MB


In [11]:
# Numeric summary again, this time on the de-duplicated frame.
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,25497.0,25497.0,25497.0,25497.0
mean,294203.960505,3.043279,5.979331,178.79578
std,154690.24966,4.261045,1.866094,914.150311
min,27.0,0.6,0.0,1.0
25%,154770.0,0.6,5.0,1.0
50%,307125.0,1.321,6.0,5.0
75%,420707.0,3.49,7.0,25.0
max,608444.0,80.773,10.0,22186.0


In [12]:
# Count how many rows share each value of 'title'.
df['title'].value_counts()

Home                                    7
Alone                                   5
Aftermath                               5
Truth or Dare                           5
The Gift                                5
                                       ..
Office Christmas Party                  1
Once Upon ay Time in Mumbai Dobaara!    1
The Children Send Their Regards         1
The Receipt: Bananas Town               1
Mr. Gaga                                1
Name: title, Length: 24688, dtype: int64

In [13]:
# Pull out every row titled 'Home' to see how the repeated-title rows differ.
is_home = df['title'].eq('Home')
home_rows = df.loc[is_home]
home_rows

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
3525,"[18, 9648]",79580,ru,Dom,1.823,2011-11-11,Home,5.4,11
9651,[],237303,en,Home,0.84,2013-11-22,Home,7.0,2
14262,"[14, 35, 16, 878, 10751]",228161,en,Home,12.835,2015-03-27,Home,6.8,2372
19580,[],409371,en,Home,0.672,2016-08-01,Home,7.0,1
20446,"[27, 18, 10751]",381507,en,Home,0.6,2016-03-01,Home,4.7,14
20455,[18],388223,en,Home,0.6,2016-03-17,Home,4.5,2
23694,[878],496256,en,Home,0.6,2017-04-30,Home,4.0,1


In [14]:
# Count how many rows share each value of 'original_title'.
df['original_title'].value_counts()

Home                                                6
Eden                                                5
Lucky                                               5
Aftermath                                           5
Truth or Dare                                       5
                                                   ..
Любит не любит                                      1
How Star Wars Was Saved in the Edit                 1
Filmstar Fish: Struggle For Survival                1
Elton John: I'm Still Standing - A Grammy Salute    1
Attack Of The Southern Fried Zombies                1
Name: original_title, Length: 24835, dtype: int64

In [15]:
# Tally films by original language. English ('en') accounts for 22,384 rows in
# the output below, so dropping all non-English-language films is an option.
df['original_language'].value_counts()

en    22384
fr      483
es      439
ru      295
ja      244
      ...  
bo        1
ps        1
af        1
xh        1
hy        1
Name: original_language, Length: 76, dtype: int64

In [16]:
# Restrict the dataset to films whose original language is English.
is_english = df['original_language'].eq('en')
df = df[is_english]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 22384 entries, 0 to 26516
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   genre_ids          22384 non-null  object        
 1   id                 22384 non-null  int64         
 2   original_language  22384 non-null  object        
 3   original_title     22384 non-null  object        
 4   popularity         22384 non-null  float64       
 5   release_date       22384 non-null  datetime64[ns]
 6   title              22384 non-null  object        
 7   vote_average       22384 non-null  float64       
 8   vote_count         22384 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(2), object(4)
memory usage: 1.7+ MB


In [17]:
# Highest average rating present in the data.
df['vote_average'].max()

10.0

In [18]:
# Inspect the films holding a perfect 10.0 average. Several of them rest on a
# single vote, so a minimum vote-count cutoff (e.g. 30, 100, or the mean vote
# count) may be needed.
perfect_score = df['vote_average'].eq(10)
vote_average_10 = df.loc[perfect_score]
vote_average_10

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
770,"[28, 80, 18, 53]",51488,en,Full Love,2.288,2010-01-01,Full Love,10.0,1
1154,[16],130974,en,A Cloudy Lesson,1.374,2010-04-01,A Cloudy Lesson,10.0,1
1230,[],371702,en,All That Glitters,1.241,2010-09-25,All That Glitters,10.0,1
1277,[18],62503,en,Almost Kings,1.154,2010-11-11,Almost Kings,10.0,2
1296,[35],140489,en,The Mother Of Invention,1.124,2010-06-25,The Mother Of Invention,10.0,1
...,...,...,...,...,...,...,...,...,...
26252,"[99, 10402]",501350,en,The Doors: Live at the Isle of Wight Festival ...,0.600,2018-02-23,The Doors - Live at the Isle of Wight Festival...,10.0,1
26253,[18],499046,en,El Gallo,0.600,2018-01-16,El Gallo,10.0,1
26255,"[35, 18, 10749]",478883,en,Laura Gets a Cat,0.600,2018-06-13,Laura Gets a Cat,10.0,1
26256,[99],474415,en,Twilight of the Yakuza,0.600,2018-02-01,Twilight of the Yakuza,10.0,1


In [19]:
# Distribution of vote counts. Many films have only one vote, so their averages
# rest on very little data and those rows may need to be dropped.
# NOTE: the median of 5 votes comes from the earlier describe() output, which
# was taken before the English-only filter was applied.
df['vote_count'].value_counts()

1       5989
2       2761
3       1532
4       1177
5        826
        ... 
1033       1
1065       1
1097       1
1193       1
2049       1
Name: vote_count, Length: 1637, dtype: int64

In [20]:
# Discard films with fewer than 5 votes, then re-summarise the numeric columns.
min_votes = 5
has_enough_votes = df['vote_count'].ge(min_votes)
df = df[has_enough_votes]
df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count
count,10925.0,10925.0,10925.0,10925.0
mean,260734.636522,5.245315,5.846307,393.035698
std,150831.555294,5.402296,1.239714,1356.642588
min,27.0,0.6,0.0,5.0
25%,112130.0,1.681,5.1,9.0
50%,270303.0,3.208,6.0,24.0
75%,388243.0,7.249,6.7,119.0
max,589023.0,80.773,10.0,22186.0


In [21]:
# Re-check the perfect 10.0 averages now that films with fewer than 5 votes
# have been removed: the single-vote 10s seen earlier should no longer appear.
# A stricter cutoff (e.g. 30 or 100 votes) may still be worth considering.
vote_average_10 = df.loc[df['vote_average'].eq(10)]
vote_average_10

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
3687,"[35, 14, 18, 10749]",307124,en,Love on a Leash,1.4,2013-08-13,Love on a Leash,10.0,7
