In [27]:
# imports
import os

import numpy as np
import pandas as pd
import scipy.stats as stats
from sqlalchemy import create_engine

# Hypothesis Testing

## Q1: Does the MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?

### State Null and Alternate Hypothesis

- $H_0$ (Null Hypothesis): There is no difference in the revenue generated for movies with different MPAA ratings.
- $H_A$ (Alternative Hypothesis): There is a significant difference in the revenue generated for movies with different MPAA ratings.

Based on the [Choosing the Right Hypothesis Test Guide on Coding Dojo](https://login.codingdojo.com/m/720/16269/120441):
   -  The data is numeric (revenue)
   -  There are more than 2 samples/groups
   -  The appropriate test to perform is therefore the ANOVA test

According to the guide, the ANOVA Test has the following assumptions:
- No significant outliers
- Normality
- Equal Variance

Significance level (Alpha) = 0.05

### Extract data from MySQL

In [4]:
# Database URL for the `movies` database on the local MySQL server.
# The MOVIES_DB_URL environment variable takes precedence so that real
# credentials never have to be committed with the notebook; the literal below
# is only the local-development fallback.
connection = os.environ.get("MOVIES_DB_URL",
                            "mysql+pymysql://root:root@localhost/movies")
engine = create_engine(connection)

In [45]:
# Pull revenue and certification for every title that reports a positive
# revenue and whose certification is not the literal string 'None'.
q = """SELECT revenue, certification FROM tmdb_data
        WHERE revenue > 0
        AND certification!='None' """
df_rating = pd.read_sql(sql=q, con=engine)
df_rating.head()

Unnamed: 0,revenue,certification
0,76019000.0,PG-13
1,5271670.0,PG-13
2,14204600.0,PG
3,14904.0,R
4,224835000.0,G


In [47]:
# number of movies in each certification group
df_rating.loc[:, 'certification'].value_counts()

R        169
PG-13    126
PG        32
G         15
NR        14
Name: certification, dtype: int64

In [48]:
# Map each certification to the Series of revenues for the movies carrying it
# (keys follow the order in which the certifications first appear).
groups = {
    rating: df_rating.loc[df_rating['certification'] == rating, 'revenue']
    for rating in df_rating['certification'].unique()
}

groups.keys()

dict_keys(['PG-13', 'PG', 'R', 'G', 'NR'])

### Check Assumption - No significant outliers

In [49]:
# Trim significant outliers: within each rating group, drop every revenue whose
# absolute z-score exceeds 3.
# NOTE: the trimmed Series is written back into `groups`, so executing this
# cell a second time would trim the already-trimmed data again.

for rating, revenue in groups.items():
    is_outlier = np.abs(stats.zscore(revenue)) > 3

    # report how many observations are being dropped from this group
    print(f'{rating} = {is_outlier.sum()} outliers')

    # keep only the non-outlying observations
    groups[rating] = revenue.loc[~is_outlier]

PG-13 = 2 outliers
PG = 1 outliers
R = 4 outliers
G = 1 outliers
NR = 1 outliers


### Check Assumption - Normality

In [50]:
# run scipy's normaltest on the revenue of every rating group
norm_results = {}

for rating, revenue in groups.items():
    result = stats.normaltest(revenue)

    # record the group size alongside the p-value and test statistic
    norm_results[rating] = {'n': len(revenue),
                            'p': result.pvalue,
                            'test stat': result.statistic}



In [51]:
# one row per rating: build the frame from the results dict, then transpose
norm_results_df = pd.DataFrame(data=norm_results).transpose()
norm_results_df

Unnamed: 0,n,p,test stat
PG-13,124.0,4.056993e-10,43.250818
PG,31.0,4.467548e-05,20.032171
R,165.0,2.710511e-15,67.083279
G,14.0,0.3289276,2.223835
NR,13.0,0.1512551,3.777575


In [52]:
# flag the groups whose normality-test p-value falls below alpha = 0.05
norm_results_df['sig'] = norm_results_df['p'].lt(0.05)
norm_results_df

Unnamed: 0,n,p,test stat,sig
PG-13,124.0,4.056993e-10,43.250818,True
PG,31.0,4.467548e-05,20.032171,True
R,165.0,2.710511e-15,67.083279,True
G,14.0,0.3289276,2.223835,False
NR,13.0,0.1512551,3.777575,False


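Commentary: The normality test is significant (p < 0.05) for the PG-13, PG, and R groups, so their revenues are not normally distributed. The G and NR groups (p > 0.05) show no significant departure from normality. The normality assumption is therefore not met for every group.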

### Check Assumption - Equal Variance

In [53]:
# Levene's test for equal variances across the rating groups
# (center='median' is scipy's default, spelled out here for clarity)
stats.levene(*groups.values(), center='median')

LeveneResult(statistic=8.992741297352378, pvalue=6.463378369751162e-07)