# Inferential Analysis

In [15]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import ttest_1samp, t, sem, norm

from statsmodels.formula.api import ols

## Hypothesis Testing

In [2]:
netflix_cat = pd.read_csv('/Users/manuelabueno/Documents/GitHub/Project/finaltable.csv')

In [3]:
netflix_cat.columns

Index(['Unnamed: 0', 'show_id', 'title', 'country', 'year_added',
       'release_year', 'latest_release_year', 'seasons', 'runtimeMinutes',
       'genres', 'main_category', 'audience_rating', 'averageRating',
       'numVotes'],
      dtype='object')

In [14]:
netflix_cat

Unnamed: 0.1,Unnamed: 0,show_id,title,country,year_added,release_year,latest_release_year,seasons,runtimeMinutes,genres,main_category,audience_rating,averageRating,numVotes
0,0,81142594,Tunnel 2019,Unspecified,2019,2019,2019,1,30,Crime,Crime TV Shows,4,8.8,5
1,1,80156992,Breakout,Unspecified,2017,2010,2010,1,45,Action,International TV Shows,3,6.8,5
2,2,80157244,The Dream Job,Singapore,2017,2016,2016,1,45,Family,International TV Shows,3,5.4,5
3,3,80135278,The Beat,Unspecified,2017,1993,2012,1,60,Music,International TV Shows,2,6.4,5
4,4,70308105,Grand Hotel,Spain,2017,2014,2013,3,0,Crime,International TV Shows,2,7.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1502,1502,80025172,Narcos,United States,2017,2015,2017,3,49,Biography,Crime TV Shows,4,8.8,336347
1503,1503,70264888,Black Mirror,United Kingdom,2019,2011,2019,5,60,Drama,British TV Shows,4,8.8,409813
1504,1504,70136126,Dexter,United States,2018,2006,2013,8,53,Crime,Crime TV Shows,4,8.6,626285
1505,1505,70177057,The Walking Dead,United States,2019,2010,2018,9,44,Drama,Classic & Cult TV,4,8.2,829696


In [None]:
#H0 (null hypothesis): the average assist is 52
#H1 (alternative hypothesis): the average assist is not 52

stat,pvalue = stats.ttest_1samp(ast,52) 

if pvalue < 0.05:
    print('stat =', stat, '/ pvalue =', pvalue,
          '\n\nSince pvalue is less than 0.05 :', pvalue,
          '\nWe reject the null hypothesis in favor of the alternative.',
          '\n\nThe average assist is not 52.')
else:
    print('stat =', stat, '/ pvalue =', pvalue,
          '\npvalue is greater than 0.05, null hypothesis is good.',
          '\n\nThe average assist is 52.')

In [None]:
#H0 (null hypothesis) = the average assist is greater or equal to 52
#H1 (alternative hypothesis) = the average assist is less than 52

stat,pvalue = stats.ttest_1samp(ast,52) 


#There is no direct way to indicate that we want to run a one-tailed variant of the test
#But we can adjust the output ourselves, in our case by dividing the p-value by 2 

if (pvalue / 2) < 0.05:
    print('stat =', stat, '/ pvalue =', pvalue,
          '\n\nSince pvalue divided by 2 is less than 0.05 :', (pvalue / 2),
          '\nWe reject the null hypothesis in favor of the alternative.',
          '\n\nThe average assist is less than 52.')
else:
    print('stat =', stat, '/ pvalue =', pvalue,
          'pvalue is greater than 0.05, null hypothesis is good.',
          '\n\nThe average assist is greater or equal to 52.')

## Linear Regression model

In [16]:
ols('averageRating ~ runtimeMinutes', data=netflix_cat)

<statsmodels.regression.linear_model.OLS at 0x7fdf3dceffd0>

In [18]:
fitted_model=ols('averageRating ~ runtimeMinutes', data=netflix_cat).fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,averageRating,R-squared:,0.023
Model:,OLS,Adj. R-squared:,0.023
Method:,Least Squares,F-statistic:,36.13
Date:,"Tue, 21 Jul 2020",Prob (F-statistic):,2.31e-09
Time:,16:08:55,Log-Likelihood:,-2313.0
No. Observations:,1507,AIC:,4630.0
Df Residuals:,1505,BIC:,4641.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,6.9623,0.037,190.715,0.000,6.891,7.034
runtimeMinutes,0.0036,0.001,6.011,0.000,0.002,0.005

0,1,2,3
Omnibus:,275.956,Durbin-Watson:,1.722
Prob(Omnibus):,0.0,Jarque-Bera (JB):,535.437
Skew:,-1.089,Prob(JB):,5.3900000000000005e-117
Kurtosis:,4.945,Cond. No.,77.0


In [21]:
fitted_model=ols('averageRating ~ runtimeMinutes + year_added + seasons', 
                 data=netflix_cat).fit()
fitted_model.summary()

0,1,2,3
Dep. Variable:,averageRating,R-squared:,0.041
Model:,OLS,Adj. R-squared:,0.039
Method:,Least Squares,F-statistic:,21.38
Date:,"Tue, 21 Jul 2020",Prob (F-statistic):,1.47e-13
Time:,16:10:19,Log-Likelihood:,-2299.4
No. Observations:,1507,AIC:,4607.0
Df Residuals:,1503,BIC:,4628.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-58.4672,47.262,-1.237,0.216,-151.173,34.239
runtimeMinutes,0.0037,0.001,6.202,0.000,0.003,0.005
year_added,0.0323,0.023,1.381,0.167,-0.014,0.078
seasons,0.0865,0.017,5.145,0.000,0.053,0.119

0,1,2,3
Omnibus:,299.22,Durbin-Watson:,1.791
Prob(Omnibus):,0.0,Jarque-Bera (JB):,631.045
Skew:,-1.134,Prob(JB):,9.34e-138
Kurtosis:,5.215,Cond. No.,3320000.0


In [23]:
import Assumptions as ass

In [42]:
x=netflix_cat.averageRating
y=netflix_cat

In [48]:
test=Assumption_Tester_OLS(x, y)
test.run_all() 

NameError: name 'Assumption_Tester_OLS' is not defined