# Analysis of the Age Categories of Activities
This notebook focus on information about the age categories of activities

In [59]:
import sys # never mind these two commands.
sys.path.append('d:/anaconda/lib/site-packages')
import json
import csv
from collections import Counter
import pandas as pd
import re
from nltk.corpus import stopwords

In [2]:
with open('fringe_2019.json') as json_file:
    fringe_2019 = json.load(json_file)

with open('fringe_2018.json') as json_file:
    fringe_2018 = json.load(json_file)
    
with open('fringe_2017.json') as json_file:
    fringe_2017 = json.load(json_file)
    
with open('fringe_2016.json') as json_file:
    fringe_2016 = json.load(json_file)
    
with open('fringe_2015.json') as json_file:
    fringe_2015 = json.load(json_file)
    
with open('fringe_2014.json') as json_file:
    fringe_2014 = json.load(json_file)
    
with open('fringe_2013.json') as json_file:
    fringe_2013 = json.load(json_file)
    
with open('fringe_2012.json') as json_file:
    fringe_2012 = json.load(json_file)

json_file.close()

## Count the age categories of the activities in total

In [3]:
def countAgeCategory(data):
    age = []
    count = []
    for item in data:
        age.append(item['age_category'])

    count = dict(Counter(age))
    count['year'] = data[0]['year']
    
    return count

age_2019 = countAgeCategory(fringe_2019)
age_2019

{'14+': 912,
 '16+': 932,
 '12+': 788,
 '3+': 101,
 '8+': 337,
 '18+': 594,
 '5+': 212,
 '0+': 381,
 'year': 2019}

In [4]:
age_2018 = countAgeCategory(fringe_2018)
age_2017 = countAgeCategory(fringe_2017)
age_2016 = countAgeCategory(fringe_2016)
age_2015 = countAgeCategory(fringe_2015)
age_2014 = countAgeCategory(fringe_2014)
age_2013 = countAgeCategory(fringe_2013)
age_2012 = countAgeCategory(fringe_2012)

In [5]:
data = []
data.append(age_2019)
data.append(age_2018)
data.append(age_2017)
data.append(age_2016)
data.append(age_2015)
data.append(age_2014)
data.append(age_2013)
data.append(age_2012)
len(data)

8

In [6]:
age_category_count = pd.DataFrame(data)
age_category_count = age_category_count.set_index(['year'])
print(age_category_count.columns)
age_category_count

Index(['14+', '16+', '12+', '3+', '8+', '18+', '5+', '0+', 'U', 'PG'], dtype='object')


Unnamed: 0_level_0,14+,16+,12+,3+,8+,18+,5+,0+,U,PG
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019,912,932,788,101.0,337.0,594,212.0,381.0,,
2018,815,870,896,100.0,286.0,548,189.0,281.0,,
2017,690,797,458,,,487,,,806.0,557.0
2016,575,710,464,,,494,,,896.0,479.0
2015,643,659,423,,,408,,,934.0,420.0
2014,581,627,419,,,466,,,850.0,427.0
2013,548,418,386,,,482,,,796.0,355.0
2012,506,343,308,,,547,,,784.0,294.0


In [7]:
age_category_count = age_category_count[['0+', '3+', '5+', '8+', '12+', 'U', 'PG', '14+', '16+', '18+' ]]
age_category_count

Unnamed: 0_level_0,0+,3+,5+,8+,12+,U,PG,14+,16+,18+
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019,381.0,101.0,212.0,337.0,788,,,912,932,594
2018,281.0,100.0,189.0,286.0,896,,,815,870,548
2017,,,,,458,806.0,557.0,690,797,487
2016,,,,,464,896.0,479.0,575,710,494
2015,,,,,423,934.0,420.0,643,659,408
2014,,,,,419,850.0,427.0,581,627,466
2013,,,,,386,796.0,355.0,548,418,482
2012,,,,,308,784.0,294.0,506,343,547


In [8]:
fieldname = []
for item in data:
    fieldname += list(item.keys())
fieldname = list(Counter(fieldname))

for item in data:
    for i in range(0, len(fieldname)):
        if fieldname[i] not in item.keys():
            item[fieldname[i]] = 0

with open('age_category.csv','w',newline='',encoding='utf-8')as f:
    writer = csv.DictWriter(f,fieldnames=fieldname)
    writer.writeheader()
    for i in data:
        writer.writerow(i)

We found that as the number of activities increased year by year, the number of activities of different age categories also increased year by year, and the age categories provided by the activities also changed. Especially in the past two years, the age categories of activities have become more detailed, from the previous U and PG subdivided into 0+, 3+, 5+, 8+ and 12+.
We generate this part of data as csv file to make a clearer stacked bar chart in the infographic. (Please see it in our infographic)

## Deeper analysis of the age categories of activities
Because the age category of activities in the past two years is more detailed, we take the data of 2018 and 2019 for follow analysis.

In [9]:
past2year = fringe_2018 + fringe_2019
seperatedByAge = []

def seperateByAgeCategory(data):
    combine = []
    younger = []
    older = []
    cate1 = ['0+', '3+', '5+', '8+', '12+']
    cate2 = ['14+', '16+', '18+']
    for item in data: 
        if item['age_category'] in cate1:
            younger.append(item)
        if item['age_category'] in cate2:
            older.append(item)
    combine.append(younger)
    combine.append(older)
    return combine

seperatedByAge = seperateByAgeCategory(past2year)
youngerGroup = []
youngerGroup = seperatedByAge[0]
olderGroup = []
olderGroup = seperatedByAge[1]

### 1. Find the differences between age categories under 12 (include"12+") and over 14 (include "14+")
The following codes analyse the differences in the genres of activities of the two groups. The frequency of activity genres of the two groups is calculated respectively.

In [10]:
def countGenres(data):
    genres = []
    count = []
    for item in data:
        genres.append(item['genre'])

    count = dict(Counter(genres))
    
    return count

youngerCounts = countGenres(youngerGroup)
olderCounts = countGenres(olderGroup)
youngerCounts['age'] = 'under 12'
olderCounts['age'] = 'over 14'

In [11]:
ageGenreCount = []
ageGenreCount.append(youngerCounts)
ageGenreCount.append(olderCounts)
ageGenreCount = pd.DataFrame(ageGenreCount)
ageGenreCount = ageGenreCount.set_index('age')
ageGenreCount

Unnamed: 0_level_0,Children's Shows,Theatre,Dance Physical Theatre and Circus,Comedy,Cabaret and Variety,Musicals and Opera,Music,Exhibitions,Events,Spoken Word
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
under 12,322.0,965,213,403,121,191,755,102,335,164
over 14,,1152,60,2566,278,73,305,9,97,131


In [50]:
ageGenreCount.to_csv('age_genre_counts.csv')

As can be seen from the dataframe above, there is a significant difference between the activity genres of age categories under 12 (i.e. 0+, 3+, 5+, 8+, and 12+) and the activity genres of age categories over 14 (i.e. 14+, 16+, and 18+). 

First of all, it's conceivable that there are no activities' genres in over 14 group are "children's shows". And there are interesting differences in other genres of activity. Among "Dance Physical Theatre and Circus", "Musicals and Opera", "Music", "Exhibitions" and "Events", The activities that age categories under 12 are significantly more. Those over 14 are more for "Theatre", "Comedy", "Cabaret and Variety". Finally, there is no obvious difference between the two age groups in "spoken word".

We generate this part of data as csv file to make a clearer tree map in the infographic. (Please see it in our infographic)

### 2. Find the differences among age categories under 12 (i.e. 0+, 3+, 5+, 8+ and 12+)
The following codes analyse the differences among age categories under 12 and the reason to make age categories more nuanced.The word frequency of activities' descriptions is calculated respectively.

In [105]:
def seperateByAgeCategory2(data):
    seperated = []
    zero = []
    three = []
    five = []
    eight = []
    dozen = []
    for item in data: 
        if item['age_category']=='0+':
            zero.append(item)
        if item['age_category']=='3+':
            three.append(item)
        if item['age_category']=='5+':
            five.append(item)
        if item['age_category']=='8+':
            eight.append(item)
        if item['age_category']=='12+':
            dozen.append(item)
    seperated.append(zero)
    seperated.append(three)
    seperated.append(five)
    seperated.append(eight)
    seperated.append(dozen)
    return seperated

seperated = seperateByAgeCategory2(youngerGroup)

In [106]:
def getDescriptions(data):
    descriptions = ''
    for item in data:
        descriptions += item['description']
    return descriptions


def wordCount(data):
    stop_words = stopwords.words('English')
    text = getDescriptions(data)
    
    pattern = r'[\s,\.?!:"]+'
    words = re.split(pattern, text)

    from collections import defaultdict
    result = defaultdict(int)
    for w in words:
        if w not in stop_words:
            result[w] += 1

    return result

In [113]:
zeroCount = wordCount(seperated[0])
zeroDF = pd.DataFrame.from_dict(dict(zeroCount), orient='index', columns=['0+'])
zeroDF = zeroDF.sort_values(by='0+',ascending=False)

threeCount = wordCount(seperated[1])
threeDF = pd.DataFrame.from_dict(dict(threeCount), orient='index', columns=['3+'])
threeDF = threeDF.sort_values(by='3+',ascending=False)

fiveCount = wordCount(seperated[2])
fiveDF = pd.DataFrame.from_dict(dict(fiveCount), orient='index', columns=['5+'])
fiveDF = fiveDF.sort_values(by='5+',ascending=False)

eightCount = wordCount(seperated[3])
eightDF = pd.DataFrame.from_dict(dict(eightCount), orient='index', columns=['8+'])
eightDF = eightDF.sort_values(by='8+',ascending=False)

dozenCount = wordCount(seperated[4])
dozenDF = pd.DataFrame.from_dict(dict(dozenCount), orient='index', columns=['12+'])
dozenDF = dozenDF.sort_values(by='12+',ascending=False)

In [116]:
first25 = pd.concat([zeroDF[:25],threeDF[:25], fiveDF[:25], eightDF[:25], dozenDF[:25]], axis=1)
first25.to_csv('first25.csv')

In [117]:
first25

Unnamed: 0,0+,3+,5+,8+,12+
Fringe,486.0,69.0,106.0,138.0,667.0
The,305.0,141.0,215.0,305.0,728.0
music,224.0,69.0,152.0,185.0,233.0
show,184.0,117.0,168.0,205.0,515.0
Edinburgh,165.0,40.0,81.0,106.0,293.0
A,145.0,61.0,79.0,210.0,479.0
new,137.0,31.0,86.0,120.0,327.0
participants,113.0,,,,
This,113.0,46.0,73.0,100.0,289.0
–,107.0,48.0,96.0,142.0,374.0


From the word frequency data of different age categories, we can see some distinctions in activity content of different age categories. Each age category has its own rich and abundant activities. Generally speaking, they all have lots of "show" and "music". We save this part of data as CSV and generate clearer and more obvious word cloud in infographic. (Please see it in our infographic)