# Pitchfork review analysis
### made by DS$^3$
June 26, 2019

In [1]:
# for database
import psycopg2

# for data analyzing
import numpy as np
import pandas as pd

# for statistics
from scipy import stats

# fot visualization
import plotly.graph_objs as go
import plotly.plotly as py
import cufflinks
from plotly.offline import iplot
cufflinks.go_offline()
cufflinks.set_config_file(world_readable=True, theme='ggplot')

# for functions
from functions import * 
%load_ext autoreload
%autoreload 2

In [2]:
conn = psycopg2.connect("dbname='musicreviewdb'user='shufang'")
c = conn.cursor()

In [3]:
c.execute('''select * from genres''')
genres_df = pd.DataFrame(c.fetchall())
genres_df.columns = [x[0] for x in c.description]
genres_df.head()

Unnamed: 0,reviewid,genre
0,22703,electronic
1,22721,metal
2,22659,rock
3,22661,rock
4,22725,electronic


In [4]:
c.execute('''select * from reviews''')
reviews_df = pd.DataFrame(c.fetchall())
reviews_df.columns = [x[0] for x in c.description]
reviews_df.head()

Unnamed: 0,reviewid,title,artist,url,score,best_new_music,author,author_type,pub_date,pub_weekday,pub_day,pub_month,pub_year
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017


In [5]:
c.execute('''select * from labels''')
labels_df = pd.DataFrame(c.fetchall())
labels_df.columns = [x[0] for x in c.description]
labels_df.head()

Unnamed: 0,reviewid,label
0,22703,virgin
1,22721,hathenter
2,22659,static shock
3,22659,fashionable idiots
4,22661,kill rock stars


## Is there a statistical difference between the ratings of two different music genres? 

$H_{0}$: we state the null hypothesis as electronic music is same as the country music ($\mu1=\mu2$)

$H_{1}$: we state the alternative hypothesis as electronic music is different from the country music ($\mu1\neq\mu2$)

In [6]:
# all of the genres
list_of_all_genres = list(genres_df.genre.unique())
list_of_all_genres.remove(None)

In [7]:
list_of_all_genres

['electronic',
 'metal',
 'rock',
 'rap',
 'experimental',
 'pop/r&b',
 'folk/country',
 'jazz',
 'global']

In [8]:
# create a merged dataframe including the information of the ratings and the genres
reviews_genres_df = reviews_df.merge(genres_df, how='left', left_on='reviewid', right_on='reviewid')
reviews_genres_df.head()

Unnamed: 0,reviewid,title,artist,url,score,best_new_music,author,author_type,pub_date,pub_weekday,pub_day,pub_month,pub_year,genre
0,22703,mezzanine,massive attack,http://pitchfork.com/reviews/albums/22703-mezz...,9.3,0,nate patrin,contributor,2017-01-08,6,8,1,2017,electronic
1,22721,prelapsarian,krallice,http://pitchfork.com/reviews/albums/22721-prel...,7.9,0,zoe camp,contributor,2017-01-07,5,7,1,2017,metal
2,22659,all of them naturals,uranium club,http://pitchfork.com/reviews/albums/22659-all-...,7.3,0,david glickman,contributor,2017-01-07,5,7,1,2017,rock
3,22661,first songs,"kleenex, liliput",http://pitchfork.com/reviews/albums/22661-firs...,9.0,1,jenn pelly,associate reviews editor,2017-01-06,4,6,1,2017,rock
4,22725,new start,taso,http://pitchfork.com/reviews/albums/22725-new-...,8.1,0,kevin lozano,tracks coordinator,2017-01-06,4,6,1,2017,electronic


In [9]:
score_genre_df = reviews_genres_df.pivot(
    columns='genre', values='score')
score_genre_df.head()

genre,nan,electronic,experimental,folk/country,global,jazz,metal,pop/r&b,rap,rock
0,,9.3,,,,,,,,
1,,,,,,,7.9,,,
2,,,,,,,,,,7.3
3,,,,,,,,,,9.0
4,,8.1,,,,,,,,


In [10]:
sample_of_electronic = score_genre_df['electronic'].dropna()
sample_of_country = score_genre_df["folk/country"].dropna()
result1 = p_value(sample_of_electronic, sample_of_country,two_sided=True)
alpha = 0.05 # we set the alpha as 0.05
compare_p_value_and_alpha(result1, alpha)

Based on the p value we've calculated(1.1083354234386888e-09) and the alpha value we'e set(0.05), we can reject the null hypothesis and accept the alternative hypothesis


## Is there a statistical difference between the ratings of country music and all other music? 

$H_{0}$: we state the null hypothesis as the country music is same as the other music ($\mu1=\mu2$)

$H_{1}$: we state the alternative hypothesis as the country music is different from the other music ($\mu1\neq\mu2$)

In [15]:
# we need to drop the duplicates
unique_df = reviews_genres_df.drop_duplicates(subset='title',keep=False)
not_country_df = unique_df[unique_df['genre'] != 'folk/country']
country_df = unique_df[unique_df['genre'] == 'folk/country']

In [20]:
sample_of_country_1 = country_df['score'].dropna()
sample_of_not_country = not_country_df['score'].dropna()
result1 = p_value(sample_of_not_country, sample_of_country,two_sided=True)
alpha = 0.05 # we set the alpha as 0.05
compare_p_value_and_alpha(result1, alpha)

Based on the p value we've calculated(1.7227856290435994e-05) and the alpha value we'e set(0.05), we can reject the null hypothesis and accept the alternative hypothesis


## Is there a statistical difference between the ratings of country music and all other music? 

$H_{0}$: we state the null hypothesis as the country music is same as the other music ($\mu1=\mu2$)

$H_{1}$: we state the alternative hypothesis as the country music is different from the other music ($\mu1\neq\mu2$)

### After we've done the hypothesis testing, we can look at the box plot of the scores of different genres

In [21]:
score_genre_df.iplot(
        kind='box',
        layout=dict(
            height=600,
            yaxis=dict(title='scores'),
            title='Scores of music by genres',
            ))