In [1]:
# Typical Everydayers...
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from matplotlib.colors import ListedColormap

import plotly.express as px
from wordcloud import WordCloud, STOPWORDS

# Unicode, Regex, json for text digestion
import unicodedata
import re
import json

import datetime
# Time formatting
from time import strftime
# Make deepcopy
import copy

# Modeling help...
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# nltk: natural language toolkit -> tokenization, stopwords
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer as stemmer
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sia = nltk.sentiment.vader.SentimentIntensityAnalyzer()

# Import prepare
import draft_prepare as p

# Silence every warning raised for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings — consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

# Show up to 50 columns and 50 rows when a DataFrame is displayed
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

# Default matplotlib figure size (13 x 7) for every chart in the notebook
plt.rc('figure', figsize=(13, 7))
# plt.style.use('fivethirtyeight')


### Preparing the data:

In [2]:
# df = p.model_clean(df)
# df.head()

In [3]:
# df = pd.read_csv('songs_0526.csv', index_col = 0)
# df.shape

In [4]:
# Load the working DataFrame through the project's prepare module (draft_prepare, imported as p).
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for 'songs_0526.csv';
# that file has to be reachable from the working directory before the rest of the notebook can run.
df = p.get_data()


reading csv file...

FileNotFoundError: [Errno 2] No such file or directory: 'songs_0526.csv'

In [None]:
# df = p.get_topics(df)

In [None]:
df.head()


## Topic Modeling

#### Latent Dirichlet Allocation:

In [None]:
# Bag-of-words vectorizer: drops English stop words, terms present in more than
# 95% of the documents, and terms present in fewer than 2 documents
cv = CountVectorizer(stop_words = 'english', min_df = 2, max_df = .95)

# Learn the vocabulary from the lyrics column and build the document-term matrix
cv_fit = cv.fit_transform(df['lyrics'])

print('\nShape of the sparse matrix\n')
cv_fit

In [None]:
# Build a 20-topic LDA model (fixed seed) and fit it on the document-term matrix
lda = LatentDirichletAllocation(random_state = 42, n_components = 20).fit(cv_fit)

# components_ has one row per topic and one column per vocabulary term
n_topics, n_terms = lda.components_.shape
print('Number of topics:', n_topics)
print('Number of columns of the LDA fit', n_terms)

In [None]:
# Vocabulary terms, in the same order as the columns of cv_fit / lda.components_.
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back to the old method only on releases that predate it.
if hasattr(cv, 'get_feature_names_out'):
    feature = cv.get_feature_names_out()
else:
    feature = cv.get_feature_names()

print('Length of feature names:', len(feature))

In [None]:
# Print the 50 highest-weighted words of every topic so the topics can be labelled by hand.
# argsort is ascending, so each list runs from the 50th strongest word up to the strongest.
for topic_idx, weights in enumerate(lda.components_):
    top_word_ids = weights.argsort()[-50:]
    top_words = [feature[word_id] for word_id in top_word_ids]
    print('-'*100)
    print('Top 50 words in topic {}'.format(topic_idx))
    print('-'*117)
    print(top_words, '\n\n')


In [None]:
# Per-document topic distribution from the fitted LDA model (one row per song)
df_final = lda.transform(cv_fit)

# Independent deep copy of df, kept so the original frame stays available
df_new = df.copy(deep = True)


In [None]:
print('\nChecking the probability distribution of one text data belonging to the topic.\n')

# df_final[0] is the first row by position, so read the lyrics by position too;
# a label lookup (df.lyrics[0]) breaks or misaligns when df's index is not 0..n-1.
print('Few words from 1st row:', df.lyrics.iloc[0][:88], '\n')

print('Probability distribution:', df_final[0])


In [None]:
# Most likely topic of the first document and its probability, rounded to 2 decimals
top_topic = df_final[0].argmax()
prob = df_final[0][top_topic].round(2)

print('Document belong to the topic', top_topic, 'with the probability of', prob)


In [None]:
# Label each song with the index of its highest-probability topic
df['topic'] = np.argmax(df_final, axis = 1)

# df_new.head()

In [None]:
# Hand-assigned names for the 20 LDA topics (topic number -> topic name).
# Several topic numbers intentionally share a name, which merges them downstream.
topic_label = {
    0: 'Jealousy',
    1: 'Affection',
    2: 'Breakup',
    3: 'Dance',
    4: 'Holiday',
    5: 'Nature',
    6: 'Spanish',
    7: 'Transcendental',
    8: 'Lost',
    9: 'Violence',
    10: 'Youth',
    11: 'Love',
    12: 'Heartache',
    13: 'Money',
    14: 'Affection',
    15: 'Sex',
    16: 'Dance',
    17: 'Good Vibes',
    18: 'Americana',
    19: 'Breakup',
}

# Translate every song's topic number into its readable name
df['topic_name'] = df['topic'].map(topic_label)

# Head of the dataframe
# df.head(3)

In [None]:
# def get_topics(df):
#     # Create an instance
#     cv = CountVectorizer(max_df = .95, min_df = 2, stop_words = 'english')
    
#     # Fit and transform the lemmatized lyrics data
#     cv_fit = cv.fit_transform(df.lyrics)

#     # Create the instance for LDA
#     lda = LatentDirichletAllocation(n_components = 20, random_state = 42)
    
#     # Fit the vectorizer with the LDA
#     lda.fit(cv_fit)
    
#     # Pull feature names out and define as feature
#     feature = cv.get_feature_names()
    
#     # Final df transforming cv_fit
#     df_final = lda.transform(cv_fit)
    
# #     # Make copy to save original df 
# #     df_new = copy.deepcopy(df)
    
#     prob = df_final[0][df_final[0].argmax()].round(2)
    
#     # Assign the opics tp the dataframe
#     df['topic'] = df_final.argmax(axis = 1)
    
#     # Creating a dictionary with key as topic numbers and value as topic names
#     topic_label = {0:'Love', 1:'Kind Goodbye', 2:'Appeasing', 3:'Club', 4:'Country Life', 5:'Resentful Goodbye', 
#                    6:'Lost', 7:'Hard Times', 8:'Nature', 9:'Miracles', 10:'Money', 11:'Dance', 12:'Fun', 
#                    13:'Dance', 14:'Weekend', 15:'Transcendental', 16:'Sex', 17:'Summer', 18:'Spanish', 19:'Affection'}
    
#     # Mapping the dictionary with the dataframe to get the labels.
#     df['topic_name'] = df['topic'].map(topic_label)
# #     # Drop the unnecessary duplicate column
# #     df = pd.concat([df, df_new['topic_name']], axis = 1)
#     # Drop unnecessary column 'topic'
#     df = df.drop(columns = ['topic'])
#     return df

In [None]:
# get_topics(df)
# df.head()

### Sentiment

In [None]:
# df['sentiment'] = df.lyrics.apply(lambda msg: sia.polarity_scores(msg)['compound'])

In [None]:
# sentiments = sia
# df["positive"] = [sentiments.polarity_scores(i)["pos"] for i in df["lyrics"]]
# df["negative"] = [sentiments.polarity_scores(i)["neg"] for i in df["lyrics"]]
# df["neutral"] = [sentiments.polarity_scores(i)["neu"] for i in df["lyrics"]]
# df['compound'] = [sentiments.polarity_scores(i)["compound"] for i in data["lyrics"]]

# scores = df["lyrics"].values
# sentiment = []
# for score in scores:
#     if score >= 0.05 :
#         sentiment.append('positive')
#     elif score <= -0.05 :
#         sentiment.append('negative')
#     else:
#         sentiment.append('neutral')
# data["sentiment_class"] = sentiment

## Exploration

In [None]:
# df = df.drop(columns = ['Unnamed: 0'])

In [None]:
def split_data(df, test_size = .2, validate_size = .3, random_state = 123):
    '''
    Splits a data frame into train, validate, and test subsets using two
    successive random (non-stratified) splits.

    With the defaults, 20% of the rows go to test and the remaining 80% is
    split 70/30, giving roughly 56% train, 24% validate, and 20% test.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : share of df held out as the test set.
    validate_size : share of the non-test rows held out as the validate set.
    random_state : seed passed to both splits so the result is repeatable.

    Returns
    -------
    (train, validate, test)
    '''
    # First split: carve the test set off the full frame
    train_validate, test = train_test_split(df, test_size = test_size, random_state = random_state)
    # Second split: divide what is left into train and validate
    train, validate = train_test_split(train_validate, test_size = validate_size, random_state = random_state)
    return train, validate, test

In [None]:
train, validate, test = split_data(df)
train.shape[0], validate.shape[0], test.shape[0]

In [None]:
# # What song has the lowest sentiment?
# train.sort_values(by = ['sentiment'], ascending = True).head(3)

In [None]:
# # What song has the highest sentiment?
# df.sort_values(by = ['sentiment'], ascending = False).head(3)

In [None]:
# df.sort_values(by = ['topic_name'], ascending = True)

In [None]:
# What is the average sentiment for each topic?
# Select 'sentiment' before aggregating: averaging every column first fails on
# text columns in recent pandas versions and does needless work otherwise.
# The column is only created by the (currently commented-out) sentiment cells, so guard on it.
if 'sentiment' in df.columns:
    df.groupby('topic_name')['sentiment'].mean().sort_values(ascending = False).plot(kind = 'bar')
else:
    print("No 'sentiment' column found - run the sentiment scoring cells before plotting.")

In [None]:
# Original
# Most popular topics...
def topic_popularity(df):
    '''
    Draws a bar chart of how many songs fall under each topic name,
    ordered from the most to the least frequent topic.
    '''
    topic_counts = df['topic_name'].value_counts()
    topic_counts.plot.bar()
    plt.xticks(rotation = 35, ha = 'right')
    plt.xlabel('Topic Descriptors')
    plt.ylabel('Song Topic Count')
    plt.title('Billboard Hot 100 Topic Popularity 1958-Present')
    return

In [None]:
topic_popularity(df)

In [None]:
# Most popular topics...
def topic_popularity(df):
    '''
    Draws the song count per topic name as a bar chart, using the
    Billboard color set for the bars and enlarged fonts.
    '''
    # Billboard colors: red, orange, yellow, blue, green, black
    colors = ('#ec1c34', '#fc9d1c', '#fbdb08', '#2dace4', '#69b138', '#1f1e1b')
    topic_counts = df['topic_name'].value_counts()
    topic_counts.plot.bar(color = colors, figsize = (13,7))
    # plt.figure(figsize=(10,5))
    plt.xticks(rotation = 35, ha = 'right', fontsize = 14)
    plt.xlabel('Topic Descriptors', fontsize = 18)
    plt.ylabel('Song Topic Count', fontsize = 18)
    plt.title('Billboard Hot 100 Topic Popularity 1958-Present', fontsize = 20)
    return

In [None]:
topic_popularity(df)

In [None]:
# What topics are most common in each decade?
# Count songs per (topic, decade), pivot the decades into columns, order the
# topics by name in descending order, then transpose so each row is a decade.
(
    df.groupby(['topic_name', 'decade'])
      .size()
      .unstack()
      .sort_values(by = 'topic_name', ascending = False)
      .T
)
#                                     .plot(kind = 'bar', ec = 'black')
# plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)

In [None]:
# Original
def all_topics_Prevalence(df):
    '''
    Draws a grouped count plot of songs per decade, with one bar per topic
    name inside each decade, and shows the figure.
    '''
    ax = sns.countplot(x = 'decade', hue = 'topic_name', data = df, ec = 'black')
    # Tilt the decade labels so they do not overlap
    ax.set_xticklabels(ax.get_xticklabels(),rotation=25)
    ax.set_title('Topics\' Prevalence Over the Decades')
    ax.set_xlabel('Decade of Song')
    ax.set_ylabel('Song Count')
    # Park the legend outside the plotting area, to the right
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.show()
    return

In [None]:
all_topics_Prevalence(df)

In [None]:
# Billboard Colors
def all_topics_Prevalence(df):
    '''
    Draws a grouped count plot of songs per decade, with one bar per topic
    name inside each decade, colored with the Billboard color set, and
    shows the figure.
    '''
    # Defined locally: no notebook-level `palette` exists, so relying on a
    # global raised NameError. Seaborn cycles the list when there are more topics than colors.
    palette = [
        '#ec1c34', #(red)
        '#fc9d1c', #(orange)
        '#fbdb08', #(yellow)
        '#2dace4', #(blue)
        '#69b138', #(green)
        '#1f1e1b', #(black)
    ]
    ax = sns.countplot(data = df, x = 'decade', hue = 'topic_name', ec = 'black', palette = palette)
    # Park the legend outside the plotting area, to the right
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.title('Topics\' Prevalence Over the Decades')
    # Tilt the decade labels so they do not overlap
    ax.set_xticklabels(ax.get_xticklabels(),rotation=25)
    plt.xlabel('Decade of Song')
    plt.ylabel('Song Count')
    plt.show()
    return

In [None]:
all_topics_Prevalence(df)

In [None]:
# Billboard Colors
def all_topics_line(df):
    '''
    Plots every topic's share of songs per year as a line chart, smoothed
    with a 5-year rolling mean and colored with the Billboard color set.

    The cell previously held this body indented with no `def` line (an
    IndentationError) and referenced an undefined `cmap`; both are fixed here.
    '''
    cmap = ListedColormap([
        '#ec1c34', #(red)
        '#fc9d1c', #(orange)
        '#fbdb08', #(yellow)
        '#2dace4', #(blue)
        '#69b138', #(green)
        '#1f1e1b', #(black)
    ])
    # make a copy
    df2 = df.copy()
    # resample needs a DatetimeIndex — assumes 'date' already holds datetimes (TODO confirm)
    df2 = df2.set_index('date')

    # Songs per topic per year, smoothed over 5 years
    yearly_counts = df2.groupby('topic_name').resample('Y').size().unstack(0).rolling(5).mean()
    # Turn each year's counts into shares of that year's total
    yearly_share = yearly_counts.apply(lambda row: row / row.sum(), axis=1)
    ax = yearly_share.plot(kind = 'line', linewidth = 3, cmap = cmap)
    # move the legend outside
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.xlim(pd.to_datetime('1960'), pd.to_datetime('2021'))
    plt.title('Prevalence of Topics in Lyrics')
    plt.xlabel('Year')
    plt.xticks(rotation = 25)
    plt.ylabel('Percentage of Songs')
    # Values are fractions in [0, 1]; render them as percentages
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False))
    plt.show()
    return

all_topics_line(df)

In [None]:
# What are the top 5 topics for each decade?
train.groupby(['decade']).topic_name.value_counts().head(19)


In [None]:
train.topic_name.value_counts()

In [None]:
# #create a variable that stores a list relationship topics
# relationships = ['affection','breakups','love', 'breakup', 
#                  'sex', 'heartache', 'jealousy']
# # make a copy
# train2 = train.copy()
# #add a column to the dataframe where any language not in the top five is represented by 'other'
# train2['relationship_topics'] = np.where(train2.topic_name.isin(relationships), train2.topic_name, 'other')
# train2 = train2.loc[train2['relationship_topics'] != 'other']

In [None]:
# Original
def relationship_bar(df):
    '''
    Draws a grouped bar chart of each relationship topic's share of
    relationship-topic songs within every decade, and shows the figure.

    Topic names are matched case-insensitively, so the Title-case labels
    assigned in this notebook ('Affection', 'Sex', ...) are recognized by
    the lowercase list below.
    '''
    # create a variable that stores a list relationship topics
    relationships = ['affection','breakups','love', 'breakup',
                     'sex', 'heartache', 'jealousy']
    # make a copy
    df2 = df.copy()
    # Compare on lowercase names; keep the original spelling for the chart labels
    is_relationship = df2.topic_name.str.lower().isin(relationships)
    # add a column to the dataframe where any topic that is a relationship topic is gathered and all
    # others are represented by 'other'
    df2['relationship_topics'] = np.where(is_relationship, df2.topic_name, 'other')
    # drop anything that isn't a relationship topic
    df2 = df2.loc[df2['relationship_topics'] != 'other']
    # normalize = True: bar heights are shares within each decade, not raw counts
    df2.groupby('decade').relationship_topics.value_counts(normalize = True).unstack().plot(kind = 'bar', width = 1, ec = 'black')
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.title('Relationship Topics\' Prevalence Over the Decades')
    plt.xlabel('Decade of Song')
    plt.ylabel('Song Topic Count')
    plt.show()
    return


In [None]:
relationship_bar(df)

In [None]:
# create a variable that stores a list relationship topics
relationships = ['affection','breakups','love', 'breakup',
                 'sex', 'heartache', 'jealousy']
# make a copy, indexed by date (the resample in the next cell relies on this index)
df2 = df.copy()
df2 = df2.set_index('date')
# Match case-insensitively: the labels assigned earlier are Title-case
# ('Affection', 'Sex', ...) while this list is lowercase, so an exact isin() matched nothing.
is_relationship = df2.topic_name.str.lower().isin(relationships)
# add a column to the dataframe where any topic that is a relationship topic is gathered and all
# others are represented by 'other'
df2['relationship_topics'] = np.where(is_relationship, df2.topic_name, 'other')

# drop anything that isn't a relationship topic
df2 = df2.loc[df2['relationship_topics'] != 'other']
# normalize = True: each line is the topic's share within a decade
df2.groupby('decade').relationship_topics.value_counts(normalize = True).unstack().plot(kind = 'line')
plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
plt.title('')
plt.xlabel('Decade of Song')
plt.xticks(rotation = 25)
plt.ylabel('% of Songs')
plt.show()


In [None]:
# Songs per relationship topic in 2-year bins (uses df2 from the previous cell,
# which is indexed by date), with one column per topic
biennial_counts = df2.groupby('relationship_topics').resample('2Y').size().unstack(0)
# Divide each row by its own total so every bin shows topic shares
biennial_share = biennial_counts.div(biennial_counts.sum(axis = 1), axis = 0)
biennial_share.plot(kind = 'line')
# move the legend outside
plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)

In [None]:
# Original
def relationship_line(df):
    '''
    Plots each relationship topic's share of relationship-topic songs per
    year as a line chart, smoothed with a 5-year rolling mean.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized, and 'breakup' is included
    alongside 'breakups' because the assigned label is 'Breakup'.
    '''
    # create a variable that stores a list relationship topics
    relationships = ['affection','breakups','love', 'breakup',
                     'sex', 'heartache', 'jealousy']
    # make a copy
    df2 = df.copy()
    # resample needs a DatetimeIndex — assumes 'date' already holds datetimes (TODO confirm)
    df2 = df2.set_index('date')
    # Compare on lowercase names; keep the original spelling for the legend
    is_relationship = df2.topic_name.str.lower().isin(relationships)
    # add a column to the dataframe where any topic that is a relationship topic is gathered and all
    # others are represented by 'other'
    df2['relationship_topics'] = np.where(is_relationship, df2.topic_name, 'other')
    # drop anything that isn't a relationship topic
    df2 = df2.loc[df2['relationship_topics'] != 'other']
    # Yearly counts per topic -> 5-year rolling mean -> share of each year's total
    ax = df2.groupby('relationship_topics').resample('Y').size().unstack(0).rolling(5).mean()\
                                      .apply(lambda row: row / row.sum(), axis=1).plot(kind = 'line', linewidth = 3)
    # move the legend outside
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.xlim(pd.to_datetime('1960'), pd.to_datetime('2021'))
#     plt.ylim()
    plt.title('Prevalence of Relationship Topics in Lyrics')
    plt.xlabel('Year')
    plt.xticks(rotation = 25)
    plt.ylabel('Percentage of Songs')
    # Values are fractions in [0, 1]; render them as percentages
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False))
    plt.show()
    return


In [None]:
relationship_line(df)

In [None]:
# Billboard Colors
def relationship_line(df):
    '''
    Plots each relationship topic's share of relationship-topic songs per
    year as a line chart, smoothed with a 5-year rolling mean, using the
    Billboard color set and enlarged fonts.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized, and 'breakup' is included
    alongside 'breakups' because the assigned label is 'Breakup'.
    '''
    # create a variable that stores a list relationship topics
    relationships = ['affection','breakups','love', 'breakup',
                     'sex', 'heartache', 'jealousy']
    my_cmap = ListedColormap([
    '#fc9d1c', #(orange)
    '#1f1e1b', #(black)
    '#2dace4', #(blue)
    '#fbdb08', #(yellow)
    '#69b138', #(green)
    '#ec1c34', #(red)
    ])
    # make a copy
    df2 = df.copy()
    # resample needs a DatetimeIndex — assumes 'date' already holds datetimes (TODO confirm)
    df2 = df2.set_index('date')
    # Compare on lowercase names; keep the original spelling for the legend
    is_relationship = df2.topic_name.str.lower().isin(relationships)
    # add a column to the dataframe where any topic that is a relationship topic is gathered and all
    # others are represented by 'other'
    df2['relationship_topics'] = np.where(is_relationship, df2.topic_name, 'other')
    # drop anything that isn't a relationship topic
    df2 = df2.loc[df2['relationship_topics'] != 'other']
    # Yearly counts per topic -> 5-year rolling mean -> share of each year's total
    ax = df2.groupby('relationship_topics').resample('Y').size().unstack(0).rolling(5).mean()\
                                      .apply(lambda row: row / row.sum(), axis=1).plot(kind = 'line', linewidth = 3, cmap = my_cmap)
    # move the legend outside
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0., prop={'size': 15})
    plt.xlim(pd.to_datetime('1960'), pd.to_datetime('2021'))
#     plt.ylim()
    plt.title('Prevalence of Relationship Topics in Lyrics', fontsize = 20)
    plt.xlabel('Year', fontsize = 18)
    plt.xticks(rotation = 25, fontsize = 14)
    plt.ylabel('Percentage of Songs', fontsize = 18)
    plt.yticks(fontsize = 14)
    # Values are fractions in [0, 1]; render them as percentages
    ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=None, symbol='%', is_latex=False))
    plt.show()
    return

In [None]:
relationship_line(df)

In [None]:
# Swarm plot of when songs from the more positive topics occur
df7 = df.copy()
# train3 = train3.sample(3_000)
positive_topics = ['good vibes', 'nature', 'americana','youth', 'dance',
                   'transcendental', 'holiday', 'spanish']
# Match case-insensitively: the assigned labels are Title-case ('Good Vibes', ...)
# while this list is lowercase, so an exact isin() matched nothing.
# Songs outside the list get None and are left out of the plot.
df7['positive_topics'] = np.where(df7['topic_name'].str.lower().isin(positive_topics),
                                  df7['topic_name'], None)
ax = sns.swarmplot(data = df7, x = 'positive_topics', y = 'date')
ax.set(title = 'Occurrence of More Positive Topics')
plt.ylabel('Date')
plt.xlabel('Topic')
plt.show()

In [None]:
# Original
def relationships_swarm(df):
    '''
    Draws a swarm plot of song dates for each relationship topic.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized, and 'breakup' is included
    alongside 'breakups' because the assigned label is 'Breakup'.
    Songs from any other topic get None and are left out of the plot.
    '''
    relationships = ['affection','love', 'sex', 'heartache',
                     'jealousy','breakups', 'breakup']
    df5 = df.copy()
    is_relationship = df5['topic_name'].str.lower().isin(relationships)
    df5['relationship_topics'] = np.where(is_relationship, df5['topic_name'], None)
    ax = sns.swarmplot(data = df5, x = 'relationship_topics', y = 'date')
    ax.set(title = '\'Breakup\' and \'Love\' Songs Have A Consistent Presence Over The Decades\nWhile It Appears \'Affection\' And \'Sex\' Show A Trade-off')
    plt.ylabel('Decades')
    plt.xlabel('Relationship Topics')
    return

In [None]:
relationships_swarm(train)

In [None]:
# Billboard Colors
def relationships_swarm(df):
    '''
    Draws a swarm plot of song dates for each relationship topic, colored
    with the Billboard color set.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized, and 'breakup' is included
    alongside 'breakups' because the assigned label is 'Breakup'.
    Songs from any other topic get None and are left out of the plot.
    '''
    # Defined locally: no notebook-level `palette` exists, so relying on a
    # global raised NameError.
    palette = [
        '#ec1c34', #(red)
        '#fc9d1c', #(orange)
        '#fbdb08', #(yellow)
        '#2dace4', #(blue)
        '#69b138', #(green)
        '#1f1e1b', #(black)
    ]
    relationships = ['affection','love', 'sex', 'heartache',
                     'jealousy','breakups', 'breakup']
    df5 = df.copy()
    is_relationship = df5['topic_name'].str.lower().isin(relationships)
    df5['relationship_topics'] = np.where(is_relationship, df5['topic_name'], None)
    ax = sns.swarmplot(data = df5, x = 'relationship_topics', y = 'date', palette = palette)
    ax.set(title = '\'Breakup\' and \'Love\' Songs Have A Consistent Presence Over The Decades\nWhile It Appears \'Affection\' And \'Sex\' Show A Trade-off')
    plt.ylabel('Decades')
    plt.xlabel('Relationship Topics')
    return

In [None]:
relationships_swarm(df)

In [None]:
# Original
def touch_swarm(df):
    '''
    Draws a swarm plot comparing when 'affection' songs and 'sex' songs occur.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook ('Affection', 'Sex') are recognized. Songs
    from any other topic get None and are left out of the plot.
    '''
    df6 = df.copy()
    # train3 = train3.sample(3_000)
    is_touch_topic = df6['topic_name'].str.lower().isin(['affection','sex'])
    df6['affection_v_sex'] = np.where(is_touch_topic, df6['topic_name'], None)
    ax = sns.swarmplot(data = df6, x = 'affection_v_sex', y = 'date')
    ax.set(title = '\'Affection\' Has Been Replaced By More Explicit \'Sex\' Lyrics')
    plt.ylabel('Date')
    plt.xlabel('Topic')
    return

In [None]:
touch_swarm(df)

In [None]:
# Billboard Colors
def touch_swarm(df):
    '''
    Draws a swarm plot comparing when 'affection' songs and 'sex' songs
    occur, using two Billboard colors and enlarged fonts.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook ('Affection', 'Sex') are recognized. Songs
    from any other topic get None and are left out of the plot.
    '''
    palette = [
    '#ec1c34', #(red)
    '#fc9d1c', #(orange)
#   '#2dace4', #(blue)
#   '#fbdb08', #(yellow)
#   '#69b138' #(green)
              ]
    df6 = df.copy()
    # train3 = train3.sample(3_000)
    is_touch_topic = df6['topic_name'].str.lower().isin(['affection','sex'])
    df6['affection_v_sex'] = np.where(is_touch_topic, df6['topic_name'], None)
    ax = sns.swarmplot(data = df6, x = 'affection_v_sex', y = 'date', palette = palette)
    plt.title('\'Affection\' Has Been Replaced By More Explicit \'Sex\' Lyrics', fontsize = 20)
#     plt.title(fontsize = 20)
    plt.ylabel('Date', fontsize = 18)
    plt.yticks(fontsize = 14)
    plt.xlabel('Topic', fontsize = 18)
    plt.xticks(fontsize = 14)
    return

In [None]:
# Draw the Billboard-colored affection-vs-sex swarm plot defined in the previous cell.
# (The call previously used the name sa_swarm, which is not defined in this notebook.)
touch_swarm(df)

In [None]:
def vice_bar(df):
    '''
    Draws a grouped bar chart of each vice topic's share of vice-topic
    songs within every decade, and shows the figure.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook ('Sex', 'Money', 'Violence') are recognized.
    '''
    # create a variable that stores a list of vice topics
    vices = ['sex', 'money', 'violence']
    # make a copy
    df3 = df.copy()
    # Compare on lowercase names; keep the original spelling for the legend
    is_vice = df3.topic_name.str.lower().isin(vices)
    # add a column to the dataframe where any topic that is a vices topic is gathered and all
    # others are represented by 'other'
    df3['vice_topics'] = np.where(is_vice, df3.topic_name, 'other')
    # drop anything that isn't a vice topic
    df3 = df3.loc[df3['vice_topics'] != 'other']
    # normalize = True: bar heights are shares within each decade, not raw counts
    df3.groupby('decade').topic_name.value_counts(normalize = True).unstack().plot(kind = 'bar', width = 1, ec = 'black')
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.title('Vice Topics\' Prevalence Over the Decades')
    plt.xlabel('Decade of Song')
    plt.xticks(rotation = 25)
    plt.ylabel('Song Topic Count')
    plt.show()
    return

In [None]:
vice_bar(df)

In [None]:
def vice_bar(df):
    '''
    Draws a grouped bar chart of each vice topic's share of vice-topic
    songs within every decade, colored with Billboard colors, and shows
    the figure.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook ('Sex', 'Money', 'Violence') are recognized.
    '''
    # Defined locally: no notebook-level `cmap` exists, so relying on a
    # global raised NameError. One color per vice topic.
    cmap = ListedColormap([
        '#ec1c34', #(red)
        '#2dace4', #(blue)
        '#69b138', #(green)
    ])
    # create a variable that stores a list of vice topics
    vices = ['sex', 'money', 'violence']
    # make a copy
    df3 = df.copy()
    # Compare on lowercase names; keep the original spelling for the legend
    is_vice = df3.topic_name.str.lower().isin(vices)
    # add a column to the dataframe where any topic that is a vices topic is gathered and all
    # others are represented by 'other'
    df3['vice_topics'] = np.where(is_vice, df3.topic_name, 'other')
    # drop anything that isn't a vice topic
    df3 = df3.loc[df3['vice_topics'] != 'other']
    # normalize = True: bar heights are shares within each decade, not raw counts
    df3.groupby('decade').topic_name.value_counts(normalize = True).unstack().plot(kind = 'bar', colormap = cmap, width = 1, ec = 'black')
    plt.legend(bbox_to_anchor = (1.05, 1), loc = 2, borderaxespad=0.)
    plt.title('Vice Topics\' Prevalence Over the Decades')
    plt.xlabel('Decade of Song')
    plt.xticks(rotation = 25)
    plt.ylabel('Song Topic Count')
    plt.show()
    return

In [None]:
vice_bar(df)

In [None]:
#Original
def vice_swarm(df):
    '''
    Draws a swarm plot of song dates for the three vice topics
    (sex, money, violence).

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized. Songs from any other topic
    get None and are left out of the plot.
    '''
    df4 = df.copy()
    # train3 = train3.sample(3_000)
    is_vice = df4['topic_name'].str.lower().isin(['sex', 'money', 'violence'])
    df4['vices'] = np.where(is_vice, df4['topic_name'], None)
    ax = sns.swarmplot(data = df4, x = 'vices', y = 'date')
    plt.title('Vice Topics Have Increased Significantly Beginning In The 90\'s', fontsize = 20)
    plt.ylabel('Decades',fontsize = 18)
    plt.yticks(fontsize = 14)
    plt.xlabel('Top 3 \'Vice\' Topics',fontsize = 18)
    plt.xticks(fontsize = 14)
    return

In [None]:
vice_swarm(df)

In [None]:
# Billboard Colors
def vice_swarm(df):
    '''
    Draws a swarm plot of song dates for the three vice topics
    (sex, money, violence), using three Billboard colors.

    Topic names are matched case-insensitively so the Title-case labels
    assigned in this notebook are recognized. Songs from any other topic
    get None and are left out of the plot.
    '''
    palette = [
    '#ec1c34', #(red)
#   '#fc9d1c', #(orange)
    '#2dace4', #(blue)
#   '#fbdb08', #(yellow)
    '#69b138' #(green)
    ]
    df4 = df.copy()
    # train3 = train3.sample(3_000)
    is_vice = df4['topic_name'].str.lower().isin(['sex', 'money', 'violence'])
    df4['vices'] = np.where(is_vice, df4['topic_name'], None)
    ax = sns.swarmplot(data = df4, x = 'vices', y = 'date', palette = palette)
    plt.title('Vice Topics Have Increased Significantly Beginning In The 90\'s', fontsize = 20)
    plt.ylabel('Decades',fontsize = 18)
    plt.yticks(fontsize = 14)
    plt.xlabel('Top 3 \'Vice\' Topics',fontsize = 18)
    plt.xticks(fontsize = 14)
    return

In [None]:
vice_swarm(df)

In [None]:
# Chi-squared test of independence: is the vice topic of a song related to its decade?
from scipy.stats import chi2_contingency as chi2
df4 = df.copy()
# Match case-insensitively so the Title-case topic labels are recognized;
# songs from non-vice topics get None and drop out of the cross-tabulation.
df4['vices'] = np.where(df4['topic_name'].str.lower().isin(['sex', 'money', 'violence']), df4['topic_name'], None)
alpha = .05
data1 = df4.vices
data2 = df4.decade
# chi2_contingency expects a table of observed counts, not two raw columns
observed = pd.crosstab(data1, data2)
# It returns (statistic, p-value, degrees of freedom, expected counts).
# The p-value is named p_value so it does not overwrite the `p` alias of draft_prepare.
stat, p_value, dof, expected = chi2(observed)
print('stat=%.3f, p=%.3f' % (stat, p_value))
if p_value <= alpha:
    print('Reject NULL HYPOTHESIS')
else:
    # i.e. the data give no grounds to reject the null hypothesis
    print('ACCEPT NULL HYPOTHESIS')