#### Overview

1.  How does MPAA rating of a movie (G/PG/PG-13/R) affect how much revenue the movie generates?
2. Is there a difference in revenue between 2018 and 2020?
3. What  is the difference in revenue for movie length of short vs. long?
4. Are certain genres more popular based on revenue?

# Create project

In [1]:
# basic imports
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# SQL
from sqlalchemy import create_engine
import pymysql
pymysql.install_as_MySQLdb()

# Stats
import scipy.stats as stats
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd


# settings
import warnings
warnings.filterwarnings("ignore")
pd.options.display.float_format = '{:,.2f}'.format

# Load Data and Process

#### Several step need to be taken to clean and prepare the data

-  for the first hypothesis drop movies with low counts in certification categories
- for the second hypothesis  the movies need to be grouped

## First Hypothesis prep

In [2]:
# load data of years 2010-2020
year_2010 = pd.read_csv('API_Data/final_tmdb_data_2010.csv.gz', low_memory = False)
year_2011 = pd.read_csv('API_Data/final_tmdb_data_2011.csv.gz', low_memory = False, lineterminator='\n')
year_2012 = pd.read_csv('API_Data/final_tmdb_data_2012.csv.gz', low_memory = False, lineterminator='\n')
year_2013 = pd.read_csv('API_Data/final_tmdb_data_2013.csv.gz', low_memory = False)
year_2014 = pd.read_csv('API_Data/final_tmdb_data_2014.csv.gz', low_memory = False, lineterminator='\n')
year_2015 = pd.read_csv('API_Data/final_tmdb_data_2015.csv.gz', low_memory = False, lineterminator='\n')
year_2016 = pd.read_csv('API_Data/final_tmdb_data_2016.csv.gz', low_memory = False)
year_2017 = pd.read_csv('API_Data/final_tmdb_data_2017.csv.gz', low_memory = False, lineterminator='\n')
year_2018 = pd.read_csv('API_Data/final_tmdb_data_2018.csv.gz', low_memory = False, lineterminator='\n')
year_2019 = pd.read_csv('API_Data/final_tmdb_data_2019.csv.gz', low_memory = False, lineterminator='\n')
year_2020 = pd.read_csv('API_Data/final_tmdb_data_2020.csv.gz', low_memory = False, lineterminator='\n')


In [3]:
#create new dataframe
tmdb_df = pd.concat([ year_2010,year_2011, year_2012, year_2013,year_2014, 
                     year_2015, year_2016, year_2017, 
                     year_2018, year_2019, year_2020])
tmdb_df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0312305,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",http://www.qqthemovie.com/,23738.0,en,Quantum Quest: A Cassini Space Odyssey,...,0.0,45.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Quantum Quest: A Cassini Space Odyssey,0.0,8.4,7.0,
2,tt0326965,0.0,/xt2klJdKCVGXcoBGQrGfAS0aGDE.jpg,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 9648, ...",http://www.inmysleep.com,40048.0,en,In My Sleep,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Sleepwalking Can Be Deadly,In My Sleep,0.0,5.5,31.0,PG-13
3,tt0331312,0.0,,,0.0,[],,214026.0,en,This Wretched Life,...,0.0,0.0,[],Released,,This Wretched Life,0.0,5.0,1.0,
4,tt0393049,0.0,/gc9FN5zohhzCt05RkejQIIPLtBl.jpg,,300000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,324352.0,en,Anderson's Cross,...,0.0,98.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Sometimes the boy next door is more than the b...,Anderson's Cross,0.0,4.0,5.0,


In [4]:
# look at info
tmdb_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 40570 entries, 0 to 3717
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                40570 non-null  object 
 1   adult                  40559 non-null  float64
 2   backdrop_path          26186 non-null  object 
 3   belongs_to_collection  2384 non-null   object 
 4   budget                 40559 non-null  float64
 5   genres                 40559 non-null  object 
 6   homepage               11056 non-null  object 
 7   id                     40559 non-null  float64
 8   original_language      40559 non-null  object 
 9   original_title         40559 non-null  object 
 10  overview               39638 non-null  object 
 11  popularity             40559 non-null  float64
 12  poster_path            37225 non-null  object 
 13  production_companies   40559 non-null  object 
 14  production_countries   40559 non-null  object 
 15  rel

In [5]:
# look at  counts
tmdb_df['certification'].value_counts()


R          3307
NR         2404
PG-13      1859
PG          860
G           287
NC-17       111
Unrated       4
R             1
PG-13         1
10            1
Name: certification, dtype: int64

In [6]:
# fix low values
tmdb_df['certification'] = tmdb_df['certification'].replace({'R ': 'R',
                                                            'PG-13 ': 'PG-13'})
tmdb_df['certification'].value_counts()

R          3308
NR         2404
PG-13      1860
PG          860
G           287
NC-17       111
Unrated       4
10            1
Name: certification, dtype: int64

In [7]:
#derop null values
tmdb_df = tmdb_df[tmdb_df.revenue !=0]



In [8]:
# explore the  missing data
num_missing = tmdb_df['certification'].isna().sum()

total_rows = tmdb_df.shape[0]

percent_missing = num_missing / total_rows
print(f'{percent_missing:.2f}% of the data in the certification column is missing')

0.37% of the data in the certification column is missing


## Second Hypothesis prep

In [9]:
#read the data
df_2018 = year_2018


In [10]:
df_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4520 entries, 0 to 4519
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4520 non-null   object 
 1   adult                  4519 non-null   float64
 2   backdrop_path          3450 non-null   object 
 3   belongs_to_collection  227 non-null    object 
 4   budget                 4519 non-null   float64
 5   genres                 4519 non-null   object 
 6   homepage               1221 non-null   object 
 7   id                     4519 non-null   float64
 8   original_language      4519 non-null   object 
 9   original_title         4519 non-null   object 
 10  overview               4422 non-null   object 
 11  popularity             4519 non-null   float64
 12  poster_path            4411 non-null   object 
 13  production_companies   4519 non-null   object 
 14  production_countries   4519 non-null   object 
 15  rele

In [11]:
#loading 2021 data, errors in reading 2020 data
df_2020= year_2020

In [12]:
df_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3718 entries, 0 to 3717
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                3718 non-null   object 
 1   adult                  3717 non-null   float64
 2   backdrop_path          2704 non-null   object 
 3   belongs_to_collection  161 non-null    object 
 4   budget                 3717 non-null   float64
 5   genres                 3717 non-null   object 
 6   homepage               1058 non-null   object 
 7   id                     3717 non-null   float64
 8   original_language      3717 non-null   object 
 9   original_title         3717 non-null   object 
 10  overview               3658 non-null   object 
 11  popularity             3717 non-null   float64
 12  poster_path            3608 non-null   object 
 13  production_companies   3717 non-null   object 
 14  production_countries   3717 non-null   object 
 15  rele

In [13]:
# create one dataframe
combinedyears = pd.concat([df_2018, df_2020])

In [14]:
#seperate out years
combinedyears['release_date'] = pd.to_datetime(combinedyears['release_date'])

In [15]:
#create year column
combinedyears['year'] = combinedyears['release_date'].dt.year
combinedyears.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8238 entries, 0 to 3717
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   imdb_id                8238 non-null   object        
 1   adult                  8236 non-null   float64       
 2   backdrop_path          6154 non-null   object        
 3   belongs_to_collection  388 non-null    object        
 4   budget                 8236 non-null   float64       
 5   genres                 8236 non-null   object        
 6   homepage               2279 non-null   object        
 7   id                     8236 non-null   float64       
 8   original_language      8236 non-null   object        
 9   original_title         8236 non-null   object        
 10  overview               8080 non-null   object        
 11  popularity             8236 non-null   float64       
 12  poster_path            8019 non-null   object        
 13  pro

In [16]:
# drop null values
combinedyears = combinedyears.dropna(subset=['year','revenue'])

In [17]:
#create 2001 column
combinedyears['year_2020'] = combinedyears['year'] == 2020

In [18]:
#check values
combinedyears['year_2020'].value_counts()

False    4871
True     3186
Name: year_2020, dtype: int64

In [19]:
## save list of columns needed for each group
needed_cols = ['year_2020', 'revenue']

In [20]:
## save year_2021 in separate variable
year_2020_df = combinedyears.loc[combinedyears['year_2020']==True, needed_cols]
year_2020_df

Unnamed: 0,year_2020,revenue
82,True,0.00
114,True,0.00
222,True,0.00
340,True,0.00
424,True,0.00
...,...,...
3713,True,0.00
3714,True,0.00
3715,True,0.00
3716,True,0.00


In [21]:
## save year_2018 in separate variable
year_2018_df = combinedyears.loc[combinedyears['year_2020']==False, needed_cols]
year_2018_df

Unnamed: 0,year_2020,revenue
1,False,0.00
2,False,0.00
3,False,0.00
4,False,0.00
5,False,0.00
...,...,...
3670,False,0.00
3687,False,0.00
3693,False,0.00
3703,False,6435260.00


In [22]:
## save list of columns needed for each group
needed_cols = ['year_2021', 'revenue']

In [23]:
## Saving JUST the numeric col as final group variables
year_2020_group = year_2020_df['revenue']
year_2018_group = year_2018_df['revenue']
year_2020_group

82     0.00
114    0.00
222    0.00
340    0.00
424    0.00
       ... 
3713   0.00
3714   0.00
3715   0.00
3716   0.00
3717   0.00
Name: revenue, Length: 3186, dtype: float64

## Third Hypothesis prep

In [24]:
#explore runtime
tmdb_df['runtime'].describe()

count   4,376.00
mean      107.28
std        21.07
min         0.00
25%        94.00
50%       104.00
75%       118.00
max       225.00
Name: runtime, dtype: float64

In [25]:
# look at runtime
tmdb_df['runtime'].value_counts()

100.00    163
90.00     159
105.00    137
97.00     119
95.00     117
         ... 
163.00      1
177.00      1
186.00      1
52.00       1
57.00       1
Name: runtime, Length: 135, dtype: int64

In [26]:
# create groups
long_film_df = tmdb_df.loc[tmdb_df['runtime'] > 150].copy()
short_film_df = tmdb_df.loc[tmdb_df['runtime'] < 90].copy()

In [27]:
display(long_film_df.info(), short_film_df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 147 entries, 267 to 3534
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                147 non-null    object 
 1   adult                  147 non-null    float64
 2   backdrop_path          141 non-null    object 
 3   belongs_to_collection  23 non-null     object 
 4   budget                 147 non-null    float64
 5   genres                 147 non-null    object 
 6   homepage               54 non-null     object 
 7   id                     147 non-null    float64
 8   original_language      147 non-null    object 
 9   original_title         147 non-null    object 
 10  overview               147 non-null    object 
 11  popularity             147 non-null    float64
 12  poster_path            145 non-null    object 
 13  production_companies   147 non-null    object 
 14  production_countries   147 non-null    object 
 15  rel

None

None

In [28]:
longfilm_runtime = long_film_df['runtime']
shortfilm_runtime = short_film_df['runtime']

## Fourth Hypothesis Prep

In [29]:
## Change username and password to match your personal MySQL Server settings
username = "root"
password = "Ahimsa81981182!"

movie = f'mysql+pymysql://{username}:{password}@localhost/movies'
engine = create_engine(movie)

In [30]:
#check genres
q = '''SELECT * FROM genres limit 5;'''
pd.read_sql(q, engine)

Unnamed: 0,Genre_name,Genre_id
0,Action,0
1,Adult,1
2,Adventure,2
3,Animation,3
4,Biography,4


In [31]:
#check genres
q = '''SELECT * FROM title_genres limit 5;'''
pd.read_sql(q, engine)

Unnamed: 0,tconst,genre_id
0,tt0035423,5
1,tt0035423,9
2,tt0035423,18
3,tt0043139,7
4,tt0043139,11


In [32]:
#check genres
q = '''SELECT * FROM tmbd_data limit 5;'''
pd.read_sql(q, engine)

Unnamed: 0,imdb_id,budget,revenue,certification
0,tt0035423,48000000.0,76019000.0,PG-13
1,tt0113026,10000000.0,0.0,
2,tt0113092,0.0,0.0,
3,tt0114447,0.0,0.0,
4,tt0116391,0.0,0.0,


In [33]:
# Use an SQL query to create a dataframe
q = """
SELECT genres.Genre_name, tmbd_data.revenue
FROM tmbd_data
JOIN title_genres ON tmbd_data.imdb_id = title_genres.tconst
JOIN genres ON title_genres.genre_id = genres.Genre_id
WHERE tmbd_data.revenue > 0
;"""
df = pd.read_sql(q, engine)

In [34]:
# Display the first (5) rows
df.head()

Unnamed: 0,Genre_name,revenue
0,Comedy,76019000.0
1,Fantasy,76019000.0
2,Romance,76019000.0
3,Drama,5271670.0
4,Music,5271670.0


In [35]:
# Display the info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1142 entries, 0 to 1141
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Genre_name  1142 non-null   object 
 1   revenue     1142 non-null   float64
dtypes: float64(1), object(1)
memory usage: 18.0+ KB


In [36]:
# Display the unique values and their counts for this column
df['Genre_name'].value_counts()

Drama        249
Comedy       173
Action       107
Romance      101
Crime        101
Adventure     75
Thriller      67
Mystery       51
Fantasy       39
Horror        37
Sci-Fi        30
Animation     28
Biography     19
Family        18
Music         13
Sport         13
Musical        8
History        7
War            5
Western        1
Name: Genre_name, dtype: int64

In [37]:
# Disclude  groups as the value count is < 20
df = df[df['Genre_name'] != 'Western']
df = df[df['Genre_name'] != 'War']
df = df[df['Genre_name'] != 'History']
df = df[df['Genre_name'] != 'Musical']
df = df[df['Genre_name'] != 'Sport']
df = df[df['Genre_name'] != 'Music']
df = df[df['Genre_name'] != 'Family']
df = df[df['Genre_name'] != 'Biography']



In [38]:
# Create a dictionary with genre_name as the key and revenue as the value
groups = {}
for genre_name in df['Genre_name'].unique():
    temp_df = df.loc[df['Genre_name'] == genre_name, 'revenue']
    groups[genre_name] = temp_df
groups.keys()

dict_keys(['Comedy', 'Fantasy', 'Romance', 'Drama', 'Crime', 'Thriller', 'Adventure', 'Animation', 'Horror', 'Mystery', 'Action', 'Sci-Fi'])

In [39]:
# Create a function to evaluate the p=value of a hypothesis test
def interpret_pvalue(ho, ha, alpha=.05):
    ho_desc = 'No statistical significance exists. The null hypothesis was not rejected.'
    ha_desc = 'A statistical significance exists. The null hypothesis is rejected and the alternative is supported that..'
    if pvalue < alpha:
        print(f'The p-value for the test was {pvalue}')
        print(f'It was < the alpha value of {alpha}, so')
        print(ha_desc)
        print(ha)
    else:
        print(f'The p-value for the test was {pvalue}')
        print(f'It was > the alpha value of {alpha}, so')
        print(ho_desc)
        print(ho)

#  Hypothesis Testing


### First Hypothesis
- Do films with different ratings earn different revenue?



- $H_0$ (Null Hypothesis): Movies have the same revenue in all MPAA ratings
- $H_A$ (Alternative Hypothesis): Movies with different MPAA ratings have different revenue.

Possible question: How much is the differences using linear regression
  -  For this question rating will need to be grouped. 


### Check Assumptions for ANOVA
- normality
- equal variance
- outliers

## Test types: ANOVA (rejected) and Kruskall

In [40]:
# Create groups dictionary.
groups = {}
## Loop through all unique categories
for rating in tmdb_df['certification'].unique():
    ## Get series for group and rename
    data = tmdb_df.loc[tmdb_df['certification']==rating,'revenue'].copy()
    
    # save into the dictionary
    groups[rating] = data
groups.keys()

dict_keys([nan, 'PG', 'R', 'PG-13', 'G', 'NR', 'NC-17'])

### Normality

In [41]:
## Running normal test on each group and confirming there are >20 in each group
norm_results = {}
for i, data in groups.items():
    stat, p = stats.normaltest(data)
    ## save the p val, test statistic, and the size of the group
    norm_results[i] = {'n': len(data),
                             'p':p,
                             'test stat':stat,}
## convert to a dataframe
norm_results_df = pd.DataFrame(norm_results).T
norm_results_df

ValueError: skewtest is not valid with less than 8 samples; 0 samples were given.

In [None]:

## Save a list with an inner list of column names
norm_results = [['group','n','pval','sig?']]


## loop through group dict
for rating, data in groups.items():
    ## calculate normaltest results
    stat, p = stats.normaltest(data)
    
    ## Append the right info into norm_resutls (as a list)
    norm_results.append([rating,len(data), p, p<.05])
    
    
## Make norm_results a dataframe (first row is columns, everything else data)
normal_results = pd.DataFrame(norm_results[1:], columns = norm_results[0])
normal_results

#### The data  failed the assumpution for normality for G, PG, PG-13, R, and NR groups, but these groups are large enough that we can safely disregard the assumption of normality.

### Check for Outliers

In [None]:
## Loop through groups dict
for rating, data in groups.items():

    ## determine if there are any outliers
    outliers = np.abs(stats.zscore(data)) > 3
    
    ## print a statement about how many outliers for which group name
    print(f"There were {outliers.sum()} outliers in the {rating} group.")

    ## Remove the outiers from data and overwrite the rating data in the dict
    data = data.loc[~outliers]
    groups[rating] = data

### Equal Variance

In [None]:
# Hypotheses
ho = "All input samples are from groups with equal variances."
ha = "All input samples are not from groups with equal variances."
# Run the test
statistic, pvalue = stats.levene(*groups.values())
# Display the interpretation
interpret_pvalue(ho, ha, alpha=.05)  

In [None]:
## Use Levene's test for equal variance
result = stats.levene(*groups.values())
print(result)

In [None]:
## Use an if-else to help interpret the p-value
if result.pvalue < 0.5:
    print(f'The groups do NOT have equal variance.')
else:
    print(f'the groups DO have equal variance.')

- The null hypothesis is rejected the null hypothesis of equal variance.
- The ANOVA test is not appropriate, Kruskal-Wallis test is the correct test.

###  Run Kruskall Wallace test

In [None]:
## Running Krukal Test for Original Hypothesis
result = stats.kruskal(*groups.values())
print(result)
result.pvalue<.05

### Interpretation
- The p value is less 0.05. From this data the null hypothesis is rejected. There is a difference between revenue in the different MPAA ratings

### Visual  for First Hypothesis

In [None]:
#visualize the data
ax= sns.barplot(data = tmdb_df, x='certification',y='revenue', errorbar=None);
ax.set_title("Revenue by Rating");

In [None]:
#order=df.groupby('Genre')['Rating'].agg('mean').sort_values(ascending=False)
order = tmdb_df.groupby(["certification"])["revenue"].mean().sort_values(ascending=False).index
ax= sns.barplot(data = tmdb_df, x='certification',y='revenue', order=order,
                palette='plasma',errorbar=None);
ax.set_title("Revenue by Rating (2010-2020)")

plt.xlabel('Rating')
plt.ylabel('Revenue', fontfamily='Arial Rounded MT Bold', fontsize = 15)
plt.ticklabel_format(style='plain', axis='y')


ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# setting y axis to display in millions
from matplotlib.ticker import FuncFormatter
def hundred_k(x, pos):
    return f'${x*1e-6:,.0f} M'
price_fmt_100k = FuncFormatter(hundred_k)
ax.yaxis.set_major_formatter(price_fmt_100k)
ax.grid(axis='y', alpha=.3, ls='--') 

plt.savefig('Images/Bar Plot for Revenue by Rating', bbox_inches='tight')


## Second Hypothesis
- Is there a difference in revenue of 2018 and 2020 films?

#### Hypothesis
- $H_0$ (Null Hypothesis): Movies have the same revenue in 2018 and 2020. 
- $H_A$ (Alternative Hypothesis): Movies have  different revenue in 2018 and 2020. 
#### Test type: Independent T test or 2 sample
- numeric
- two groups
#### Assumptions
- outliers
- normality ( discuss sample size > 15)
- equal variance

### Outliers

In [None]:
## Check year_2021_group for outliers
year_2020_outliers = np.abs(stats.zscore(year_2020_group)) > 3

year_2020_outliers.sum()

In [None]:
## remove 2020 outliers  
year_2020_group = year_2020_group.loc[~year_2020_outliers]

In [None]:
## Check year_2018 group for outliers
year_2018_outliers = np.abs(stats.zscore(year_2018_group)) > 3

year_2018_outliers.sum()

In [None]:
## remove 2018 outliers  
year_2018_group = year_2018_group.loc[~year_2018_outliers]

### Test for Normality

In [None]:
#check groups size
len(year_2020_group)

In [None]:
#check groups size
len(year_2018_group)

Both groups passed normal distribution since p values are less than 0.05 and the groups are >15.



### Equal Variance

In [None]:
#run test
result = stats.levene(year_2021_group, year_2018_group)
print(result)
print(result.pvalue<.05)

The groups do not have equal variance so will set equal variance = false in the Welch's T test.


### Run Test

In [None]:
# run test
result = stats.ttest_ind(year_2021_group, year_2018_group, equal_var=False)
print(result)
result.pvalue < .05

### Interpretation 

- The Welch's T-Test return a p-value > .05. Based on this result, the data rejects the null hypothesis.
- There is  a significant difference between 2020 and 2018 revenue.
- Double check with the actual means of our final groups

In [None]:
print(f'The average revenue for 2020 movies was {year_2020_group.mean(): .2f}')
print(f'There average revenue for 2018 movies was {year_2018_group.mean(): .2f}')

### Visual

In [None]:
# # create dataframe without outliers
plot_df = pd.concat([year_2020_df.loc[~year_2020_outliers],
                    year_2018_df.loc[~year_2018_outliers]], axis=0)
plot_df

In [None]:
ax=sns.barplot(data=plot_df, x='year_2020', y= 'revenue', errorbar=None);
ax.set_title("Revenue Comparison (2018-2020)", fontfamily='serif',fontsize = 20, fontweight = 'bold')

plt.xlabel('Year', fontsize =20)
plt.ylabel('Revenue', fontfamily='Arial Rounded MT Bold', fontsize = 15)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_xticklabels(labels = ['2018','2020'], 
                   fontstyle='italic', 
                   color='black');

# setting y axis to display in millions
from matplotlib.ticker import FuncFormatter
def hundred_k(x, pos):
    return f'${x*1e-6:,.2f} M'
price_fmt_100k = FuncFormatter(hundred_k)
ax.yaxis.set_major_formatter(price_fmt_100k)
ax.grid(axis='y', alpha=.3, ls='--') 

plt.savefig('Images/Comparison of 2018 and 2020 Revenue', bbox_inches='tight')

## Third Hypothesis

#### Hypothesis
- $H_0$ (Null Hypothesis): Short and long Movies have the same revenue. 
- $H_A$ (Alternative Hypothesis): Short and long movies have  different revenue. 
#### Test type: Independent T test or 2 sample
- numeric
- two groups
#### Assumptions
- outliers
- normality ( discuss sample size > 15)
- equal variance

### Outliers

In [None]:
# Check for outliers in long film
zscores= stats.zscore(longfilm_runtime)
outliers = abs(zscores)>3
np.sum(outliers)

In [None]:
# remove outlier from long film
longfilm_runtime = longfilm_runtime[(np.abs(stats.zscore(longfilm_runtime)) < 3)]

In [None]:
# Check for outliers in weight for short film  group
zscores= stats.zscore(shortfilm_runtime)
outliers = abs(zscores)>3
np.sum(outliers)


### Normality
- The sample size is > 15. This means that there is no need to test for normality or adjust testing. 
- Check distribution


In [None]:
# test the long film group for normality
result_longfilm = stats.normaltest(longfilm_runtime)
result_longfilm

In [None]:
# test the short film group for normality
result_shortfilm = stats.normaltest(shortfilm_runtime)
result_shortfilm

- p-values for short film group is above   0.05  
- long film group is below 0.05 
- data is NOT normally distributed.
- However sample size is large enough to proceed.

### Equal Variance

In [None]:
# Test for equal variance
result = stats.levene(longfilm_runtime, shortfilm_runtime)
result



In [None]:
# check length
len(longfilm_runtime)


In [None]:
# check length
len(shortfilm_runtime)

- P value is more than 0.05 so equal variance can be set to true.
- It can be concluded that revenue for short and long films are  different and the null hypothesis is rejected. 


### Run test

In [None]:
# Independent t-test with equal_var set to False
result = stats.ttest_ind(longfilm_runtime, shortfilm_runtime, equal_var = True)
result

In [None]:
result.pvalue < 0.05


### Interpretation
- The p value is signficantly less than 0.05. 
- Based on this dataset, the alternate hypothesis is supported
- Runtime does affect revenue. The revenue long films of more than 2.5 hours than short films of 1.5 hours are different.

### Visual

In [None]:
#seperate the columns of interest
long_df= tmdb_df.loc[tmdb_df['runtime'] > 150, ['revenue', 'runtime']]
short_df= tmdb_df.loc[tmdb_df['runtime'] < 90, ['revenue', 'runtime']]

In [None]:
#concatenate the two dataframes
plot_df=pd.concat([long_df,short_df],axis=0)
plot_df

In [None]:
plot_df['long']= plot_df['runtime']> 150
plot_df

In [None]:
plot_df['short']= plot_df['runtime'] < 90
plot_df

In [None]:
#seperate the columns of interest
longfilm= plot_df.loc[  plot_df['long']==True, ['revenue', 'long']]
shortfilm= plot_df.loc[  plot_df['short']==True, ['revenue', 'short']]

In [None]:
# checking long film
ax=sns.barplot(data=plot_df, x='long', y = 'revenue', errorbar=None)

In [None]:
#checking the plots seperately for 90 minutes
ax= sns.barplot(data=plot_df, x='short', y = 'revenue', errorbar=None)

In [None]:
# combined
fig, ax = plt.subplots(figsize=(5,5))

ax=sns.barplot(data=plot_df, x='long', y = 'revenue', errorbar=None)
ax.set_title('Short (1.5 hours) and Long films (2.5 hours) versus  Revenue')
#ax= sns.barplot(data=plot_df, x='short', y = 'revenue', errorbar=None)
plt.xlabel('Runtime in Minutes')

ax.set_xticklabels(labels = ['90 minutes','150 minutes']);

## Fourth Hypothesis
- Are certain genres more popular than others?


#### Hypothesis
- $H_0$ (Null Hypothesis): All genres  earn the same revenue. 
- $H_A$ (Alternative Hypothesis): Different genres earn diffferent revenue. 
#### Test type: Anova/Tukey (parametric) or Kruskall/ANOVA (nonparametric)
- numeric
- two groups
#### Assumptions
- outliers
- normality ( discuss sample size > 15)
- equal variance


### Outliers

In [None]:
# Loop through the items in the groups dictionary
for genre_name, revenue in groups.items():
    # Filter the outliers for the group in the groups dictionary
    outliers = np.abs(stats.zscore(revenue)) > 3
    # Print the sum of outliers for the group
    print(f"{outliers.sum()} outliers were removed from the {genre_name} group.")
    # Remove the outliers from the group
    groups[genre_name] = revenue.loc[~outliers]

### Normality

In [None]:
# Loop through the groups for  pvalue of the Normal Test
norm_results = {}
for genre_name, revenue in groups.items():
    stat, p = stats.normaltest(revenue)
    norm_results[genre_name] = {'n': len(revenue),'p':p}

In [None]:
# Display a dataframe created from the norm_results dictonary and transpose it 
pd.DataFrame(norm_results).T

In [None]:
# Create and display a dataframe created from the norm_results dictonary and transpose it 
norm_results_df = pd.DataFrame(norm_results).T
# Add a column to indicate if the group pvalue was significant or not
norm_results_df['sig?'] = norm_results_df['p'] < .05

In [None]:
# Display the dataframe
norm_results_df

- Assumption of normality can be used because  (each n>20)
- The groups do NOT come from normal distributions.

###  Equal Variance

In [None]:
# Hypotheses
ho = "All input samples are from populations with equal variances."
ha = "All input samples are not from populations with equal variances."
# Run the test
statistic, pvalue = stats.levene(*groups.values())
# Display the interpretation
interpret_pvalue(ho, ha, alpha=.05)  

#### The  Tukey or post hoc multiple test provides the most information on the revenue and genres.


In [None]:
# Slice a sample group from genre
temp = groups['Drama']
temp

In [None]:
# Test making a dataframe from the sample group slice and adding the genre name
pd.DataFrame({'revenue':temp, 'genre_name':'Drama'})

In [None]:
# Make a list for saving the dataframes to
tukeys_dfs = []
for genre_name, revenue in groups.items():
    temp_df = pd.DataFrame({'revenue':revenue, 'genre_name':genre_name})
    tukeys_dfs.append(temp_df)
    
# Concatenate the list of dataframes into 1 dataframe    
tukeys_data = pd.concat(tukeys_dfs)
tukeys_data

In [None]:
# Save the values as revenue and the labels as genre_name
values = tukeys_data['revenue']
labels = tukeys_data['genre_name']

# Perform tukey's multiple comparison test and display the summary
tukeys_results = pairwise_tukeyhsd(values,labels)
tukeys_results.summary()

In [None]:
fig, ax = plt.subplots(nrows=1, figsize=(8,4), facecolor='w')
plt.title('Movie Genres', fontsize = 22, weight='bold')
ax = sns.barplot(data=tukeys_data, x='genre_name', y='revenue', ci=68, palette="cool")
ax.set_xticklabels(ax.get_xticklabels(), rotation='45', ha='right')
plt.xlabel("Genre", fontsize = 16, weight='bold')
plt.xticks(weight='bold')
ax.set_ylabel('Revenue (dollars)',fontweight='bold',fontsize=14)
ax.set_facecolor('lightblue')
ax.tick_params(labelcolor='k', labelsize=10)
ax.set_yticklabels(ax.get_yticks(), weight='bold')
ax.yaxis.set_major_formatter(price_fmt)
for axis in ['top','bottom','left','right']:
    ax.spines[axis].set_linewidth(3)
plt.tight_layout()
plt.show;

In [None]:
order = tukeys_data.groupby(["genre_name"])["revenue"].mean().sort_values(ascending=False).index
ax= sns.barplot(data = tukeys_data, x='genre_name',y='revenue', order=order,
                palette='plasma',errorbar=None);
ax.set_title("Revenue by Genre")

plt.xlabel('Genre')
plt.ylabel('Revenue', fontfamily='Arial Rounded MT Bold', fontsize = 15)
plt.ticklabel_format(style='plain', axis='y')
#ax.tick_params(labelrotation=45)
loc, labels = plt.xticks()
ax.set_xticklabels(labels, rotation=45)

ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)

# setting y axis to display in millions
from matplotlib.ticker import FuncFormatter
def hundred_k(x, pos):
    return f'${x*1e-6:,.0f} M'
price_fmt_100k = FuncFormatter(hundred_k)
ax.yaxis.set_major_formatter(price_fmt_100k)
ax.grid(axis='y', alpha=.3, ls='--') 

plt.savefig('Images/Bar Plot for Revenue by Genre', bbox_inches='tight')
