# Movie Rating Hypothesis testing

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from IPython.display import HTML, display

# Limit how many columns/rows pandas renders when a DataFrame is displayed.
# (To switch to plain-text reprs, set `pd.options.display.notebook_repr_html = False`.)
pd.options.display.max_columns = 20
pd.options.display.max_rows = 25

In [2]:
# IPython magic: select the inline matplotlib backend so figures render in cell output.
%matplotlib inline

##### Read and clean up the movie data

In [3]:
def get_movie_data(data_dir='../data/movie'):
    """Load the users, ratings and movies tables from ``::``-delimited files.

    Each file is read as a header-less, latin-1 encoded text file whose
    fields are separated by ``::``.

    Parameters
    ----------
    data_dir : str, optional
        Directory containing ``users.dat``, ``ratings.dat`` and
        ``movies.dat``. Defaults to ``'../data/movie'``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(users, ratings, movies)`` with columns

        * users:   ``user_id, gender, age, occupation, zip``
        * ratings: ``user_id, movie_id, rating, timestamp``
        * movies:  ``movie_id, title, genres``
    """

    def _read_dat(file_name, column_names):
        # '::' is a multi-character separator, which pandas treats as a regex
        # and can only parse with the python engine. Requesting that engine
        # explicitly avoids the ParserWarning emitted on the implicit fallback.
        return pd.read_table(os.path.join(data_dir, file_name),
                             sep='::', header=None, names=column_names,
                             encoding='latin-1', engine='python')

    unames = ['user_id', 'gender', 'age', 'occupation', 'zip']
    users = _read_dat('users.dat', unames)

    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = _read_dat('ratings.dat', rnames)

    mnames = ['movie_id', 'title', 'genres']
    movies = _read_dat('movies.dat', mnames)

    return users, ratings, movies

users, ratings, movies = get_movie_data()

# Split "Some Title (1995)" into the bare title (column 0) and the release
# year (column 1). The pattern is a raw string because '\(' is not a valid
# escape sequence in an ordinary string literal.
# NOTE(review): two `tmp.apply(...)` calls used to follow this line; their
# results were discarded, so they had no effect and were removed. One of them
# sliced titles to 40 characters -- if truncated short titles are actually
# wanted, apply `.str[:40]` to `tmp[0]` below.
tmp = movies['title'].str.extract(r'(.*) \(([0-9]+)\)', expand=True)

movies['year'] = tmp[1]
movies['short_title'] = tmp[0]

print(movies.head())

  return func(*args, **kwargs)


   movie_id                               title                        genres  \
0         1                    Toy Story (1995)   Animation|Children's|Comedy   
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2         3             Grumpier Old Men (1995)                Comedy|Romance   
3         4            Waiting to Exhale (1995)                  Comedy|Drama   
4         5  Father of the Bride Part II (1995)                        Comedy   

   year                  short_title  
0  1995                    Toy Story  
1  1995                      Jumanji  
2  1995             Grumpier Old Men  
3  1995            Waiting to Exhale  
4  1995  Father of the Bride Part II  


### 1. Prove or disprove the following Null Hypothesis:

#### H0: Different age segments of users rate the different genres of movies similarly

- Age segments are (0-20), (21-30), (31-50), (50 and above),
- Genres - Drama, Comedy, Action, Romance, Adventure
- Rating 4 or above is a positive, else negative (used for binary votes)

#### Solution

- We are asked to prove or disprove a relationship between the ratings, genres and age segments.
- To test for a relationship between these categorical variables, a **chi-squared test** of independence is carried out as follows.

In [4]:
# Combine the three tables into one frame: ratings are joined to users on
# 'user_id', and the result is joined to movies on 'movie_id'.
r_u = ratings.merge(users, on='user_id')
r_u_m = r_u.merge(movies, on='movie_id')
r_u_m

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,1998,Modulations
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,1998,Broken Vessels
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,1999,White Boys
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,1973,One Little Indian


### creating a label for ratings

In [5]:
# Rating column of the merged frame, used to derive the binary vote label.
rat = r_u_m.loc[:, 'rating']

In [6]:
# Binary vote per rating: "1" (positive) when the rating is 4 or above,
# otherwise "0". Computed in one vectorized pass so the result always has
# exactly one label per row; the previous per-row loop appended nothing for a
# value that matched neither comparison, which would have broken the
# assignment below with a length mismatch.
votes = np.where(rat >= 4, "1", "0")

r_u_m["ratings_label"] = votes

In [7]:
# Preview the merged frame, which now carries the ratings_label column.
r_u_m

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title,ratings_label
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,1998,Modulations,1
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,1998,Broken Vessels,0
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,1999,White Boys,0
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,1973,One Little Indian,1


### creating label for age

In [8]:
# Age column of the merged frame, used to derive the age-segment label.
age = r_u_m.loc[:, 'age']

In [9]:
# Age segments: group1 = 0-20, group2 = 21-30, group3 = 31-49, group4 = 50+.
# Half-open bins [0, 21), [21, 31), [31, 50), [50, inf) cover every age >= 0.
# The previous chained `if`s tested `< 20` and `< 30`, so an age of exactly
# 20 or 30 matched no branch and would have left the label list shorter than
# the frame. An age of exactly 50 stays in group4, as before.
age_bin_edges = [0, 21, 31, 50, np.inf]
age_bin_labels = ["group1", "group2", "group3", "group4"]
age_group = pd.cut(age, bins=age_bin_edges, labels=age_bin_labels,
                   right=False).astype(object)

# Fail with a clear message instead of silently storing missing labels.
if age_group.isna().any():
    raise ValueError("found age values that are missing or negative; "
                     "cannot assign an age group")

r_u_m["age_group"] = age_group

In [10]:
# Preview the merged frame, which now carries the age_group column.
r_u_m

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title,ratings_label,age_group
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group1
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,1998,Modulations,1,group1
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,1998,Broken Vessels,0,group3
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,1999,White Boys,0,group1
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,1973,One Little Indian,1,group1


In [11]:
# Keep only the positive votes (ratings_label "1"); from here on r_u_m holds
# just those rows.
r_u_m = r_u_m.loc[r_u_m["ratings_label"].eq("1")]
r_u_m

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title,ratings_label,age_group
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group1
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000199,5334,3382,5,960796159,F,56,13,46140,Song of Freedom (1936),Drama,1936,Song of Freedom,1,group4
1000202,5494,3530,4,959816296,F,35,17,94306,Smoking/No Smoking (1993),Comedy,1993,Smoking/No Smoking,1,group3
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,1998,Modulations,1,group1
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,1973,One Little Indian,1,group1


In [12]:
# Sanity check: after the positive-vote filter only the label "1" should remain.
r_u_m['ratings_label'].value_counts()

1    575281
Name: ratings_label, dtype: int64

In [13]:
# Number of positive votes contributed by each age group.
r_u_m['age_group'].value_counts()

group2    222554
group3    166169
group1    116409
group4     70149
Name: age_group, dtype: int64

In [14]:
# Number of positive votes per raw genre string; multi-genre movies show up
# as pipe-joined combinations such as "Comedy|Drama".
r_u_m['genres'].value_counts()

Drama                                  72695
Comedy                                 62293
Comedy|Drama                           26468
Comedy|Romance                         23353
Drama|Romance                          17074
                                       ...  
Action|Adventure|Children's|Fantasy        6
Children's|Fantasy                         6
Film-Noir|Horror                           2
Action|Adventure|Children's                1
Fantasy                                    1
Name: genres, Length: 300, dtype: int64

### extracting the data according to the genres

In [15]:
# Keep the rows whose genre string is exactly one of the five genres under
# test. Multi-genre strings (e.g. "Comedy|Drama") do not match and are left out.
target_genres = ['Drama', 'Action', 'Adventure', 'Comedy', 'Romance']
g1 = r_u_m.loc[r_u_m['genres'].isin(target_genres)]
g1

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title,ratings_label,age_group
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group1
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group2
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest,1,group4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1000183,5059,1434,4,962484364,M,45,16,22652,"Stranger, The (1994)",Action,1994,"Stranger, The",1,group3
1000184,5947,1434,4,957190428,F,45,16,97215,"Stranger, The (1994)",Action,1994,"Stranger, The",1,group3
1000192,5754,2543,4,958272316,F,18,1,60640,Six Ways to Sunday (1997),Comedy,1997,Six Ways to Sunday,1,group1
1000199,5334,3382,5,960796159,F,56,13,46140,Song of Freedom (1936),Drama,1936,Song of Freedom,1,group4


- Only the movies whose genre is exactly one of the desired genres are selected; multi-genre titles such as `Comedy|Drama` are excluded.


### observed results

In [16]:
# Observed counts: age group (rows) by rating label and genre (columns), with
# row/column totals. Every input comes from g1, the genre-filtered frame, so
# the table no longer depends on pandas aligning columns taken from two
# different frames (r_u_m and g1) and dropping the rows missing from g1.
ct = pd.crosstab(g1['age_group'], [g1['ratings_label'], g1['genres']], margins=True)
ct

ratings_label,1,1,1,1,1,All
genres,Action,Adventure,Comedy,Drama,Romance,Unnamed: 6_level_1
age_group,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
group1,1070,94,14685,12784,175,28808
group2,2359,286,25642,27233,359,55879
group3,1813,423,16189,21516,345,40286
group4,737,279,5777,11162,162,18117
All,5979,1082,62293,72695,1041,143090


In [17]:
# Replace the two-level column header and the group1..group4 row labels with
# flat, readable names. The order must match the crosstab's column and row order.
genre_labels = ["Action","Adventure","Comedy","Drama","Romance"]
age_labels = ["0-20","21-30","31-50","50 and above"]

ct.columns = genre_labels + ["row_total"]
ct.index = age_labels + ["col_total"]

In [18]:
# Show the contingency table with its flattened row and column labels.
ct

Unnamed: 0,Action,Adventure,Comedy,Drama,Romance,row_total
0-20,1070,94,14685,12784,175,28808
21-30,2359,286,25642,27233,359,55879
31-50,1813,423,16189,21516,345,40286
50 and above,737,279,5777,11162,162,18117
col_total,5979,1082,62293,72695,1041,143090


In [19]:
# Degrees of freedom for a test of independence on a table with 4 age groups
# (rows) and 5 genres (columns): (rows - 1) * (columns - 1).
n_rows, n_columns = 4, 5
d_f = (n_rows - 1) * (n_columns - 1)
print("degree of freedom is :", d_f)

degree of freedom is : 12


For a test of independence, 
- we use the same chi-squared formula that we used for the goodness-of-fit test. 

- The main difference is we have to calculate the expected counts of each cell in a 2-dimensional table instead of a 1-dimensional table. 

- To get the expected count for a cell, multiply the row total for that cell by the column total for that cell and then divide by the total number of observations. 

$$E_{i,j} = \frac{row\_totals_i * col\_totals_j}{n\_obs}$$

and then,

$$ \chi^2 = \sum_{i,j} \frac{(O_{i,j} - E_{i,j})^2}{E_{i,j}} $$


- We can quickly get the expected counts for all cells in the table by taking the row totals and column totals of the table.
    - performing an outer product on them with the `np.outer()` function and dividing by the number of observations:

In [20]:
# Expected count for each cell under independence:
#     row_total_i * col_total_j / n_obs
# The margins and the grand total are read from the contingency table itself
# rather than retyped, so the numbers stay correct if the data or the filter
# upstream changes.
row_totals = ct["row_total"].iloc[0:4]      # one total per age group
col_totals = ct.loc["col_total"].iloc[0:5]  # one total per genre
n_obs = ct.loc["col_total", "row_total"]    # grand total of observations

expected = pd.DataFrame(np.outer(row_totals, col_totals) / n_obs,
                        index=row_totals.index, columns=col_totals.index)
expected

Unnamed: 0,Action,Adventure,Comedy,Drama,Romance
0-20,1203.739129,217.836718,12541.314865,14635.527011,209.582277
21-30,2334.897903,422.538808,24326.441729,28388.593927,406.527633
31-50,1683.346104,304.629618,17538.16338,20466.774547,293.086351
50 and above,757.016864,136.994856,7887.080027,9204.104515,131.803739


### The above table shows the expected values for all the categories

In [21]:
# Observed counts only: the contingency table without its totals row/column.
observed = ct.drop(index="col_total", columns="row_total")
observed

Unnamed: 0,Action,Adventure,Comedy,Drama,Romance
0-20,1070,94,14685,12784,175
21-30,2359,286,25642,27233,359
31-50,1813,423,16189,21516,345
50 and above,737,279,5777,11162,162


In [22]:
# Chi-squared test of independence on the observed counts, computed by SciPy.
stats.chi2_contingency(observed)

(2218.1345990465156,
 0.0,
 12,
 array([[ 1203.73912922,   217.83671815, 12541.31486477, 14635.52701097,
           209.58227689],
        [ 2334.89790342,   422.53880774, 24326.44172898, 28388.5939269 ,
           406.52763296],
        [ 1683.34610385,   304.62961772, 17538.16337969, 20466.77454749,
           293.08635125],
        [  757.01686351,   136.99485638,  7887.08002656,  9204.10451464,
           131.80373891]]))

- The first three values are the chi-squared statistic, the p-value and the degrees of freedom; the final array holds the expected counts.

In [23]:
# Chi-squared statistic computed by hand: the sum over all cells of
# (observed - expected)^2 / expected.
squared_residuals = (observed - expected) ** 2
chi_squared_stat = (squared_residuals / expected).sum().sum()

print(chi_squared_stat)

2218.134599046515


In [24]:
# Critical value of the chi-squared distribution at the 95% quantile (5%
# significance level), using the degrees of freedom computed earlier (d_f).
crit = stats.chi2.ppf(q = 0.95, df = d_f)
print("Critical value")
print(crit)
# Upper-tail probability of the observed statistic. The survival function
# computes it directly; `1 - cdf` loses precision when the cdf is close to 1.
p_value = stats.chi2.sf(chi_squared_stat, df = d_f)
print("P value")
print(p_value)

Critical value
21.02606981748307
P value
0.0


- The chi-squared statistic is 2218.134599046515.
- The p-value is 0.0.
- The degrees of freedom are 12.
- The p-value is reported as 0.0 (too small to be represented as a floating-point number), so it is below the significance level of 0.05.
- So, we reject the null hypothesis and conclude that **Different age segments of users rate the different genres of movies differently**.