In [1]:
# Packages
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Import data
df = pd.read_csv('movieReplicationSet.csv')

For many of the hypothesis tests that follow, note that ratings data cannot be reduced to a sample mean. This affects which hypothesis tests we choose to perform.

### Question 1: Are movies with more ratings rater higher than movies with less ratings?

We choose to perform a Mann Whitney U test, since we are comparing popularity (median number of ratings), not a mean.

In [3]:
# Calculate "popularity"
movies_df = df.iloc[:, 0:400]
popularity = movies_df.count()
median_popularity = np.median(popularity)

# split on popularity
df2 = movies_df.append(popularity, ignore_index=True)
low_pop_mask = (popularity <= median_popularity)
high_pop_mask = (popularity > median_popularity)

low_pop_movies = df2.loc[:, low_pop_mask]
high_pop_movies = df2.loc[:, high_pop_mask]

# Consolidate ratings for each group (low vs high pop)
ratings_low_pop = low_pop_movies.to_numpy()
ratings_high_pop = high_pop_movies.to_numpy()
# Drop NaNs
ratings_low_pop = ratings_low_pop[np.logical_not(np.isnan(ratings_low_pop))]  # len = 22,200
ratings_high_pop= ratings_high_pop[np.logical_not(np.isnan(ratings_high_pop))]  # len = 90,414

# Do mann whitney U test
u1,p1 = stats.mannwhitneyu(ratings_low_pop, ratings_high_pop, alternative='less')
print(u1)
print(p1) 

759942655.5
0.0


Conclusion: We reject the null hypothesis that they are the same, and accept the alternative hypothesis that popular movies are rated higher than less popular movies.

### Question 2: Are movies that are newer rated differently than movies that are older?

In [5]:
# Need to parse years from titles
titles = movies_df.columns
titles = titles.to_series()
titles_and_years = titles.apply(lambda x: x[-5:-1])

# Fix year for 'Rambo: First Blood Part II' 
titles_and_years[6] = '1985' # Year wasn't included in original dataset, so we add it
titles_and_years = titles_and_years.apply(lambda x: int(x))
years = titles_and_years.values

median_year = np.median(years) #1999

df2 = movies_df.append(titles_and_years, ignore_index=True)
old_mask = (years < median_year)
new_mask = (years >= median_year)

old_movies = df2.loc[:, old_mask]
new_movies = df2.loc[:, new_mask]

# Consolidate ratings for each group (low vs high pop)
ratings_old= old_movies.to_numpy()
ratings_new = new_movies.to_numpy()
# Drop NaNs
ratings_old = ratings_old[np.logical_not(np.isnan(ratings_old))]  # len = 46,721
ratings_new= ratings_new[np.logical_not(np.isnan(ratings_new))]  # len = 65,893

# Do mann whitney U test
u2,p2 = stats.mannwhitneyu(ratings_old, ratings_new)
print(u2)
print(p2)

1515524791.0
7.167226926012573e-06


We obtain a p value of 7.17e-06. This is less than our alpha level, so we reject the null hypothesis that the ratings are the same and conclude that newer movies are rated differently than older movies

### Question 3: Is enjoyment of ‘Shrek (2001)’ gendered, i.e. do male and female viewers rate it differently

In [6]:
gender_id = df.iloc[:, 474] # 1 = female, 2 = male, 3 = self-described
mask_f = (gender_id == 1) # mask  to filer f vs m ratings
mask_m = (gender_id == 2)

shrek = movies_df.iloc[:, 57]
shrek_ratings_f = shrek[mask_f]
shrek_ratings_m = shrek[mask_m]

# Drop NaNs
shrek_ratings_f = shrek_ratings_f[np.logical_not(np.isnan(shrek_ratings_f))] # len = 712
shrek_ratings_m = shrek_ratings_m[np.logical_not(np.isnan(shrek_ratings_m))] # len = 228

# Do mann whitney U test
u3,p3 = stats.mannwhitneyu(shrek_ratings_f, shrek_ratings_m)
print(u3) 
print(p3) 

90219.5
0.010031609873905622


When comparing the ratings of ‘Shrek (2001)’ among male vs. female viewers, we obtain a p value of 0.01. This is greater than our alpha level, thus we fail to reject the null hypothesis and cannot conclude that the enjoyment of ‘Shrek (2001)’ is gendered.

### Question 4: What proportion of movies are rated differently by male and female viewers?

In [9]:
# loop over all movies, do above for each and count how many we reject null for vs don't
num_rated_diff = 0

for i in range(400):
    # Get ith movie
    movie = movies_df.iloc[:, i]
    
    # Separate ratings by gender
    movie_ratings_f = movie[mask_f]
    movie_ratings_m = movie[mask_m]
    
    # Drop NaNs
    movie_ratings_f = movie_ratings_f[np.logical_not(np.isnan(movie_ratings_f))]
    movie_ratings_m = movie_ratings_m[np.logical_not(np.isnan(movie_ratings_m))]
    
    # Do mann whitney U test
    u,p = stats.mannwhitneyu(movie_ratings_f, movie_ratings_m)
    
    if p < 0.005:
        num_rated_diff += 1
    
    
print(num_rated_diff) # 50   -> proportion = 50 / 400 = 0.125
print('proportion:', num_rated_diff / 400)

50
proportion: 0.125


When comparing the ratings of all 400 movies among male vs. female viewers, we obtain that 50 movies achieved a p value less than our alpha level. Thus, we can conclude that 0.125 is the proportion of movies rated differently by male and female viewers.

### Question 5:  Do people who are only children enjoy ‘The Lion King (1994)’ more than people with siblings?

In [10]:
sibling_status = df.iloc[:, 475] # 1 = yes, 0 = no, -1 = no response
mask_OC= (sibling_status == 1) # mask  to filer f vs m ratings
mask_sibs = (sibling_status == 0)

LK = movies_df.iloc[:, 220]
LK_ratings_OC = LK[mask_OC]
LK_ratings_sibs = LK[mask_sibs]

# Drop NaNs
LK_ratings_OC = LK_ratings_OC[np.logical_not(np.isnan(LK_ratings_OC))]
LK_ratings_sibs = LK_ratings_sibs[np.logical_not(np.isnan(LK_ratings_sibs))]
    
# Do mann whitney U test
u5,p5 = stats.mannwhitneyu(LK_ratings_OC, LK_ratings_sibs, alternative='greater')
print(u5) 
print(p5) 

52929.0
0.978419092554931


We obtain a p value of 0.9784, which is greater than our alpha level. Thus, we fail to reject the null hypothesis and cannot conclude that people who are only children enjoy ‘The Lion King (1994)’ more than people with siblings.

### Question 6: What proportion of movies exhibit an “only child effect”, i.e. are rated different by viewers with siblings vs. those without?

In [12]:
num_rated_diff = 0
diff_rated_movies = []

for i in range(400):
    # Get ith movie
    movie = movies_df.iloc[:, i]
    
    # Separate ratings by gender
    movie_ratings_OC = movie[mask_OC]
    movie_ratings_sibs = movie[mask_sibs]
    
    # Drop NaNs
    movie_ratings_OC = movie_ratings_OC[np.logical_not(np.isnan(movie_ratings_OC))]
    movie_ratings_sibs = movie_ratings_sibs[np.logical_not(np.isnan(movie_ratings_sibs))]
    
    # Do mann whitney U test
    u,p = stats.mannwhitneyu(movie_ratings_OC, movie_ratings_sibs)
    
    if p < 0.005:
        num_rated_diff += 1
        diff_rated_movies.append(movies_df.columns[i])
    
print(num_rated_diff) # 7
print('proportion:', num_rated_diff/400)
print(diff_rated_movies)

7
proportion: 0.0175
['Billy Madison (1995)', 'The Blue Lagoon (1980)', 'Happy Gilmore (1996)', 'American Pie (1999)', 'Star Wars: Episode VI - The Return of the Jedi (1983)', 'FeardotCom (2002)', 'Captain America: Civil War (2016)']


Seven movies achieve a p value less than our alpha level. Thus, we conclude that 0.0175 is the proportion of movies with an “only child effect”.

### Question 7: Do people who like to watch movies socially enjoy ‘The Wolf of Wall Street (2013)’ more than those who prefer to watch them alone?

In [13]:
alone = df.iloc[:, 476] # 1 = yes, 0 = no, -1 = no response
mask_alone = (alone == 1) 
mask_social = (alone == 0)


WWS = movies_df.iloc[:, 357]
WWS_ratings_alone = WWS[mask_alone]
WWS_ratings_social = WWS[mask_social]

# Drop NaNs
WWS_ratings_alone = WWS_ratings_alone[np.logical_not(np.isnan(WWS_ratings_alone))]
WWS_ratings_social = WWS_ratings_social[np.logical_not(np.isnan(WWS_ratings_social))]
    
# Do mann whitney U test
u7,p7 = stats.mannwhitneyu(WWS_ratings_social, WWS_ratings_alone, alternative='greater')
print(u7) 
print(p7)

49303.5
0.9436657996253056


When comparing the ratings of ‘The Wolf of Wall Street (2013)’ among people who enjoy watching movies socially vs. alone, we obtain a p value of 0.94367. This is greater than our alpha level, thus we fail to reject the null hypothesis and cannot conclude that people who like to watch movies socially enjoy ‘The Wolf of Wall Street (2013)’ more than people who prefer to watch them alone.

### Question 8: What proportion of movies exhibit such a “social watching” effect?

In [17]:
num_rated_diff = 0
diff_rated_movies = []

for i in range(400):
    # Get ith movie
    movie = movies_df.iloc[:, i]
    
    # Separate ratings by gender
    movie_ratings_alone = movie[mask_alone]
    movie_ratings_social = movie[mask_social]
    
    # Drop NaNs
    movie_ratings_alone = movie_ratings_alone[np.logical_not(np.isnan(movie_ratings_alone))]
    movie_ratings_social = movie_ratings_social[np.logical_not(np.isnan(movie_ratings_social))]
    
    # Do mann whitney U test
    u,p = stats.mannwhitneyu(movie_ratings_social, movie_ratings_alone, alternative='greater')
    
    if p < 0.005:
        num_rated_diff += 1
        diff_rated_movies.append(movies_df.columns[i])
    
print(num_rated_diff) # 6  
print(num_rated_diff/400)
print(diff_rated_movies)

6
0.015
['North (1994)', 'Shrek 2 (2004)', 'The Avengers (2012)', 'Spider-Man (2002)', 'The Transporter (2002)', 'Captain America: Civil War (2016)']


When comparing the ratings of all 400 movies among people who enjoy watching movies socially vs. alone, we obtain that six movies achieve a p value less than our alpha level. Thus, we can conclude that 0.015 is the proportion of movies that exhibit a “social watching effect”.

### Question 9: Is the ratings distribution of ‘Home Alone (1990)’ different than that of ‘Finding Nemo (2003)’?

We perform a Kolmogorov-Smirnov hypothesis test, since our ratings data cannot be reduced to sample means, and we wish to compare distributions.

In [18]:
HA = movies_df.iloc[:, 285] # 857  non-NaN
Nemo = movies_df.iloc[:, 138] #1014  non-NaN

# Row wise pruning
temp = np.array([np.isnan(HA),np.isnan(Nemo)],dtype=bool)
temp2 = temp*1 # convert boolean to int
temp2 = sum(temp2) # take sum of each participant
missingData = np.where(temp2>0) # find participants with missing data
HA = np.delete(HA.values,missingData) # delete missing data from array
Nemo = np.delete(Nemo.values,missingData) # delete missing data from array


k9, p9 = stats.kstest(HA, Nemo)
print(k9) 
print(p9)

0.16790123456790124
2.2038507937682687e-10


we obtain a p value of 2.2e-10. This is less than our alpha level, thus we reject the null hypothesis, and conclude that the ratings distribution of ‘Home Alone (1990)’ is different than the ratings distribution of ‘Finding Nemo (2003)’.

### Question 10: There are ratings on movies from several franchises ([‘Star Wars’, ‘Harry Potter’, ‘The Matrix’, ‘Indiana Jones’, ‘Jurassic Park’, ‘Pirates of the Caribbean’, ‘Toy Story’, ‘Batman’]) in this dataset. How many of these are of inconsistent quality, as experienced by viewers?

We choose to perform Kruskal-Wallis hypothesis tests, since our data cannot be reduced to sample means, are we have more than 2 groups. 

In [21]:
franchises = ['Star Wars', 'Harry Potter', 'The Matrix', 'Indiana Jones', 'Jurassic Park',
              'Pirates of the Caribbean', 'Toy Story', 'Batman']

titles = movies_df.columns
inconsistent = []
p_vals = {}

for franchise in franchises:
    # get all moves in franchise
    movies = titles[titles.str.contains(franchise)]
    
    all_ratings = [] # list of Series'
    all_nans = []
    
    # get ratings for each movie in franchise
    for movie in movies:
        ratings = movies_df[movie]
        all_ratings.append(ratings)
        all_nans.append(np.isnan(ratings))
        
    # row wise removal
    temp = np.array(all_nans,dtype=bool)
    temp2 = temp*1 # convert boolean to int
    temp2 = sum(temp2) # take sum of each participant
    missingData = np.where(temp2>0) # find participants with missing data
    # delete for each movies
    all_ratings_cleaned = []
    for i in range(len(all_ratings)):
        movie = np.delete(all_ratings[i].values,missingData) # delete missing data from array
        all_ratings_cleaned.append(movie)
        
    # Kruskal-Wallis test
    h,p = stats.kruskal(*all_ratings_cleaned)
    
    print(franchise, end=': ')
    print(p)
    p_vals[franchise] = p
    
    # add to list if franchise is inconsistent
    if p < 0.005:
        inconsistent.append(franchise)

Star Wars: 6.940162236984522e-40
Harry Potter: 0.11790622831256074
The Matrix: 1.7537323830838066e-09
Indiana Jones: 1.020118354785894e-11
Jurassic Park: 1.8492328391686058e-11
Pirates of the Caribbean: 0.035792727694248905
Toy Story: 7.902234665149812e-06
Batman: 4.1380499020034183e-19


We obtain that six franchises achieve a p value less than our alpha level. These are 'Star Wars', 'The Matrix', 'Indiana Jones', 'Jurassic Park', 'Toy Story', and 'Batman'. Therefore, we conclude that these six franchises are of inconsistent quality, as experienced by the viewers.

### Question 11:  Is the enjoyment of the movies in the ‘Star Wars’ franchise gendered?

In [23]:
star_wars_movies = titles[titles.str.contains('Star Wars')]

# we have gender masks from before

gendered = []
p_vals = {}

for movie in star_wars_movies:
    ratings = movies_df[movie]
    ratings_f = ratings[mask_f]
    ratings_m = ratings[mask_m]

    # Drop NaNs
    ratings_f = ratings_f[np.logical_not(np.isnan(ratings_f))] # len = 712
    ratings_m = ratings_m[np.logical_not(np.isnan(ratings_m))] # len = 228

    # Do mann whitney U test
    u,p = stats.mannwhitneyu(ratings_f, ratings_m)
    p_vals[movie] = p
    
    if p < 0.005:
        gendered.append(movie)
        
print(gendered)

['Star Wars: Episode IV - A New Hope (1977)']


Of the six ‘Star Wars’ movies in our dataset, only ‘Star Wars: Episode IV - A New Hope (1977)’ obtains a p value less than our alpha value of 0.005, at 0.0017. Thus, we can conclude that this is the only Star Wars movie that male and female viewers rate differently.