In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import json
from scipy import stats
import scipy

In [2]:
basics = pd.read_csv('Data/title_basics.csv.gz',low_memory = False)
akas = pd.read_csv('Data/title_akas.csv.gz',low_memory = False)
ratings = pd.read_csv('Data/title_ratings.csv.gz', low_memory = False)
year_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz', low_memory = False)
year_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv', low_memory = False)
year_2010 = pd.read_csv('Data/final_tmdb_data_2010.csv', low_memory = False)
year_2011 = pd.read_csv('Data/final_tmdb_data_2011.csv', low_memory = False, lineterminator='\n')
year_2012 = pd.read_csv('Data/final_tmdb_data_2012.csv', low_memory = False, lineterminator='\n')
year_2013 = pd.read_csv('Data/final_tmdb_data_2013.csv', low_memory = False)
year_2014 = pd.read_csv('Data/final_tmdb_data_2014.csv', low_memory = False, lineterminator='\n')
year_2015 = pd.read_csv('Data/final_tmdb_data_2015.csv', low_memory = False, lineterminator='\n')
year_2016 = pd.read_csv('Data/final_tmdb_data_2016.csv', low_memory = False)
year_2017 = pd.read_csv('Data/final_tmdb_data_2017.csv', low_memory = False, lineterminator='\n')
year_2018 = pd.read_csv('Data/final_tmdb_data_2018.csv', low_memory = False, lineterminator='\n')
year_2019 = pd.read_csv('Data/final_tmdb_data_2019.csv', low_memory = False, lineterminator='\n')
year_2020 = pd.read_csv('Data/final_tmdb_data_2020.csv', low_memory = False, lineterminator='\n')
year_2021 = pd.read_csv('Data/final_tmdb_data_2021.csv', low_memory = False)

In [3]:
tmdb_df = pd.concat([year_2000, year_2001, year_2010,year_2011, year_2012, 
                     year_2013,year_2014, year_2015, year_2016, year_2017, 
                     year_2018, year_2019, year_2020, year_2021])
tmdb_df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.11,2162.0,PG


In [4]:
tmdb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46648 entries, 0 to 3532
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                46648 non-null  object 
 1   adult                  46634 non-null  float64
 2   backdrop_path          30144 non-null  object 
 3   belongs_to_collection  2758 non-null   object 
 4   budget                 46634 non-null  float64
 5   genres                 46634 non-null  object 
 6   homepage               12260 non-null  object 
 7   id                     46634 non-null  float64
 8   original_language      46634 non-null  object 
 9   original_title         46634 non-null  object 
 10  overview               45609 non-null  object 
 11  popularity             46634 non-null  float64
 12  poster_path            42958 non-null  object 
 13  production_companies   46634 non-null  object 
 14  production_countries   46634 non-null  object 
 15  rel

In [5]:
tmdb_df = tmdb_df.dropna(subset=['certification','revenue'])

## Hypothesis Testing

- $H_0$ (Null Hypothesis): There is no significant difference between revenue and the rating of a movie.
- $H_A$ (Alternative Hypothesis):  There is a significant difference between revenue and the rating of a movie.

###  Determine the correct test to perform.
- Type of Data? numeric
- How many groups/samples? > 2 groups
- Therefore, which test is appropriate? ANOVA

In [6]:
tmdb_df = tmdb_df[tmdb_df.revenue !=0]

In [7]:
# Create groups dictionary.
groups = {}
## Loop through all unique categories
for rating in tmdb_df['certification'].unique():
    ## Get series for group and rename
    data = tmdb_df.loc[tmdb_df['certification']==rating,'revenue'].copy()
    
    # save into the dictionary
    groups[rating] = data
groups.keys()

dict_keys(['PG', 'R', 'G', 'PG-13', 'NR', 'NC-17', 'R ', 'PG-13 '])

In [8]:
## Loop through groups dict
for rating, data in groups.items():

    ## determine if there are any outliers
    outliers = np.abs(stats.zscore(data)) > 3
    
    ## print a statement about how many outliers for which group name
    print(f"There were {outliers.sum()} outliers in the {rating} group.")

    ## Remove the outiers from data and overwrite the region data in the dict
    data = data.loc[~outliers]
    groups[rating] = data

There were 14 outliers in the PG group.
There were 34 outliers in the R group.
There were 2 outliers in the G group.
There were 25 outliers in the PG-13 group.
There were 3 outliers in the NR group.
There were 0 outliers in the NC-17 group.
There were 0 outliers in the R  group.
There were 0 outliers in the PG-13  group.


In [10]:
## Running normal test on each group and confirming there are >20 in each group

## Save a list with an inner list of column names
norm_results = [['group','n','pval','sig?']]


## loop through group dict
for rating, data in groups.items():
    ## calculate normaltest results
    stat, p = stats.normaltest(data)
    
    ## Append the right info into norm_resutls (as a list)
    norm_results.append([rating,len(data), p, p<.05])
    
    
## Make norm_results a dataframe (first row is columns, everything else data)
normal_results = pd.DataFrame(norm_results[1:], columns = norm_results[0])
normal_results

ValueError: skewtest is not valid with less than 8 samples; 1 samples were given.

- We failed the assumpution for normality for G and NR groups. 

In [None]:
## Use Levene's test for equal variance
result = stats.levene(*groups.values())
print(result)

- We reject the null hypothesis of equal variance.
- We wanted to run ANOVA test, but will need to use Kruskal-Wallis test insead.

In [None]:
## Running Krukal Test for Original Hypothesis
result = stats.kruskal(*groups.values())
print(result)
result.pvalue<.05

- We reject the null hypothesis and support that there is a significant difference between revenue and the rating of a movie.

- $H_0$ (Null Hypothesis): There is no significant difference between revenue and the length of a movie.
- $H_A$ (Alternative Hypothesis):  There is a significant difference between revenue and the length of a movie.

###  Determine the correct test to perform.
- Type of Data? numeric
- How many groups/samples? 2 groups
- Therefore, which test is appropriate? 2 sample t-test

In [None]:
tmdb_df['runtime'].value_counts()

In [None]:
tmdb_df['longer_movie'] = tmdb_df['runtime'] >= 150

In [None]:
tmdb_df['longer_movie'].value_counts

In [None]:
## save list of columns needed for each group
needed_cols = ['longer_movie', 'revenue']

In [None]:
## save male team in separate variable
longer_movie_df = tmdb_df.loc[tmdb_df['longer_movie']==True, needed_cols]
longer_movie_df

In [None]:
## save male team in separate variable
shorter_movie_df = tmdb_df.loc[tmdb_df['longer_movie']==False, needed_cols]
shorter_movie_df

In [None]:
## Make a df just for visualization by concat the groups 
plot_df =  pd.concat([shorter_movie_df, longer_movie_df])
plot_df

In [None]:
## visualize the group means
sns.barplot(data=plot_df, x='longer_movie', y='revenue')

In [None]:
## Saving JUST the numeric col as final group variables
longer_movie_group = longer_movie_df['revenue']
shorter_movie_group = shorter_movie_df['revenue']
longer_movie_group

In [None]:
## Check smoker group for outliers
longer_movie_outliers = np.abs(stats.zscore(longer_movie_group)) > 3

## how many outliers?
longer_movie_outliers.sum()

In [None]:
## Check smoker group for outliers
shorter_movie_outliers = np.abs(stats.zscore(shorter_movie_group)) > 3

## how many outliers?
shorter_movie_outliers.sum()

In [None]:
## remove outliers from female_group
shorter_movie_group = shorter_movie_group.loc[~shorter_movie_outliers]
shorter_movie_group

In [None]:
result = stats.normaltest(longer_movie_group)
print(result.pvalue)
result.pvalue < .05

In [None]:
len(longer_movie_group)

In [None]:
result = stats.normaltest(shorter_movie_group)
result.pvalue < .05

In [None]:
len(shorter_movie_group)

- We fail to reject the null hypothesis that there is no statistical difference between the longer_movie_group and a normal distribution. 
- We reject the null hypothesis and conclude that there is a statistical difference between the shorter_movie_group and a normal distribution.

In [None]:
## Use Levene's test for equal variance
result = stats.levene(longer_movie_group, shorter_movie_group)
print(result)

In [None]:
## Use an if-else to help interpret the p-value
if result.pvalue < 0.5:
    print(f'The groups do NOT have equal variance.')
else:
    print(f'the groups DO have equal variance.')

In [None]:
## run final hypothess test
result = stats.ttest_ind(longer_movie_group, shorter_movie_group, equal_var=False)
print(result)
result.pvalue < 0.05

- We fail to reject the null hypothesis that there is no significant difference between revenue and the length of a movie.