In [79]:
import pandas as pd
import re

In [245]:
# Load the scraped dataset from the CSV file into a DataFrame.
df = pd.read_csv('autodeskuni.csv')

## Initial Data Check
First of all, I'd like to check the quality of the data I scraped and figure out whether it's OK to work with.

In [191]:
# Number of non-null values in each column.
df.count()

city             5633
description      5619
key_learnings    5371
tags_industry    5433
tags_product     5192
tags_topics      5595
title            5633
year             5633
dtype: int64

In [192]:
# Number of missing values in each column.
df.isna().sum()

city               0
description       14
key_learnings    262
tags_industry    200
tags_product     441
tags_topics       38
title              0
year               0
dtype: int64

In [193]:
# How many sessions were recorded for each location.
df['city'].value_counts()

Las Vegas                       5150
London                           268
Connect and Construct Summit      83
Middle East                       57
Australia                         44
Forge DevCon                      31
Name: city, dtype: int64

In [194]:
# Non-null counts of every remaining column for each (year, city) pair.
df.groupby(['year', 'city']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,description,key_learnings,tags_industry,tags_product,tags_topics,title
year,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,Las Vegas,23,23,23,23,23,23
2012,Las Vegas,362,358,350,323,360,362
2013,Las Vegas,576,572,513,490,575,576
2014,Las Vegas,639,592,586,599,637,639
2015,Australia,43,2,43,24,36,43
2015,Las Vegas,714,700,692,675,711,714
2016,Las Vegas,683,669,674,615,676,683
2017,Australia,1,0,1,0,0,1
2017,Las Vegas,724,687,701,694,721,725
2017,London,77,71,77,72,76,77


I can see that some locations are not geographic: both Forge DevCon and Connect and Construct Summit took place only once, in 2019. I googled them and found out that they were part of Autodesk University Las Vegas 2019, so I'll simply replace these values with "Las Vegas".

In [246]:
# Fold the two non-geographic event names into the "Las Vegas" location.
df['city'] = df['city'].replace({
    'Forge DevCon': 'Las Vegas',
    'Connect and Construct Summit': 'Las Vegas',
})

In [247]:
# Re-check the per-(year, city) non-null counts after relabelling the locations.
df.groupby(['year', 'city']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,description,key_learnings,tags_industry,tags_product,tags_topics,title
year,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,Las Vegas,23,23,23,23,23,23
2012,Las Vegas,362,358,350,323,360,362
2013,Las Vegas,576,572,513,490,575,576
2014,Las Vegas,639,592,586,599,637,639
2015,Australia,43,2,43,24,36,43
2015,Las Vegas,714,700,692,675,711,714
2016,Las Vegas,683,669,674,615,676,683
2017,Australia,1,0,1,0,0,1
2017,Las Vegas,724,687,701,694,721,725
2017,London,77,71,77,72,76,77


I will also drop Australia 2017 and Middle East 2018, because they have only 1 record each and wouldn't give any relevant information (I checked them on the website).

In [248]:
# Drop the Australia 2017 record(s).
# Selecting by mask instead of `.index[0]` removes every matching row (the notebook
# states there is exactly one) and is a no-op when nothing matches, so re-running
# this cell no longer raises IndexError.
is_australia_2017 = (df['year'] == 2017) & (df['city'] == 'Australia')
df = df.drop(index=df.index[is_australia_2017])

In [249]:
# Drop the Middle East 2018 record(s).
# Selecting by mask instead of `.index[0]` removes every matching row (the notebook
# states there is exactly one) and is a no-op when nothing matches, so re-running
# this cell no longer raises IndexError.
is_middle_east_2018 = (df['year'] == 2018) & (df['city'] == 'Middle East')
df = df.drop(index=df.index[is_middle_east_2018])

In [250]:
# Re-check the per-(year, city) non-null counts after dropping the sparse groups.
df.groupby(['year', 'city']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,description,key_learnings,tags_industry,tags_product,tags_topics,title
year,city,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2011,Las Vegas,23,23,23,23,23,23
2012,Las Vegas,362,358,350,323,360,362
2013,Las Vegas,576,572,513,490,575,576
2014,Las Vegas,639,592,586,599,637,639
2015,Australia,43,2,43,24,36,43
2015,Las Vegas,714,700,692,675,711,714
2016,Las Vegas,683,669,674,615,676,683
2017,Las Vegas,724,687,701,694,721,725
2017,London,77,71,77,72,76,77
2018,Las Vegas,738,724,734,710,736,738


## Extract tags
All tags I collected came in lists delimited by "|", so I want to extract lists of unique tags grouped by product, topic and industry.

In [251]:
# Pull each tag column out of the frame as a plain Python list.
topics, industry, product = (
    df[column].tolist()
    for column in ('tags_topics', 'tags_industry', 'tags_product')
)

In [252]:
def unique_tags(lst):
    """Return the set of distinct tags found in a list of "|"-delimited tag strings.

    Parameters
    ----------
    lst : iterable
        One entry per record; each entry is a string such as ``"a|b|c"`` or a
        missing value (``NaN``, ``None``, ``pd.NA``).

    Returns
    -------
    set of str
        Every distinct tag. Missing entries and empty tokens are skipped.
    """
    tags = set()
    for entry in lst:
        # Skip real missing values up front instead of relying on their string
        # form, which would let None / pd.NA through as 'None' / '<NA>' tags.
        if pd.isna(entry):
            continue
        # The literal "nan" is still excluded, as before, for backward compatibility.
        tags.update(tag for tag in str(entry).split("|") if tag and tag != "nan")
    return tags

In [253]:
# Distinct tags per category, computed in the same order as the source lists.
tags_topics_unique, tags_industry_unique, tags_product_unique = map(
    unique_tags, (topics, industry, product)
)

In [254]:
# Print how many distinct tags each category has: topics, industry, product.
for tag_set in (tags_topics_unique, tags_industry_unique, tags_product_unique):
    print(len(tag_set))

201
29
109


In [255]:
# Display the full set of distinct product tags.
tags_product_unique

{'123d',
 '3ds max',
 'a360',
 'advance steel',
 'alias autostudio',
 'alias products',
 'alias speedform',
 'arnold',
 'artcam',
 'assemble products',
 'autocad',
 'autocad 360',
 'autocad architecture',
 'autocad electrical',
 'autocad for mac',
 'autocad lt',
 'autocad lt for mac',
 'autocad map 3d',
 'autocad mechanical',
 'autocad mep',
 'autocad mobile app',
 'autocad p&id',
 'autocad plant 3d',
 'autocad raster design',
 'autocad structural detailing',
 'autocad utility design',
 'autodesk nastran',
 'bim 360',
 'bim 360 account administration',
 'bim 360 design',
 'bim 360 field',
 'bim 360 glue',
 'bim 360 ops',
 'bim 360 plan',
 'bim 360 team',
 'buzzsaw',
 'cfd',
 'civil 3d',
 'configurator 360',
 'dwg trueview',
 'dynamo studio',
 'eagle',
 'ecotect analysis',
 'fabrication cadmep',
 'fabrication camduct',
 'fabrication estmep',
 'fabrication fabmep',
 'fabrication remoteentry',
 'factory design utilities',
 'fbx',
 'featurecam',
 'flame',
 'forge',
 'formit pro',
 'fusion 

In product tags, there are groups of tags that refer to the same product, so I will merge them all together in the data frame (only autocad, bim 360 and revit).

In [256]:
# Collect every product tag that mentions each product family by name.
list_autocad = [tag for tag in tags_product_unique if 'autocad' in tag]
list_bim360 = [tag for tag in tags_product_unique if 'bim 360' in tag]
list_revit = [tag for tag in tags_product_unique if 'revit' in tag]

In [257]:
# Collapse every product-tag variant onto its product family name.
# Each pattern matches one whole "|"-delimited tag that contains the family name,
# so the result no longer depends on the arbitrary iteration order of a set:
# replacing 'autocad lt' before 'autocad lt for mac' could previously leave
# 'autocad for mac' behind. regex=True is passed explicitly because the default
# of Series.str.replace differs between pandas versions.
for family in ('autocad', 'bim 360', 'revit'):
    whole_tag = r'[^|]*' + re.escape(family) + r'[^|]*'
    df['tags_product'] = df['tags_product'].str.replace(whole_tag, family, regex=True)

I'll also merge 'ar/vr', 'augmented reality' and 'virtual reality' in topics tags.

In [258]:
# Rewrite both reality-related topic tags to the shared 'ar/vr' tag in one pass.
list_ar_vr = ['augmented reality', 'virtual reality']
ar_vr_pattern = '|'.join(re.escape(alias) for alias in list_ar_vr)
df['tags_topics'] = df['tags_topics'].str.replace(ar_vr_pattern, 'ar/vr', regex=True)

In [259]:
# Rebuild the raw tag lists and their unique sets now that the tags were merged.
product, topics = (df[column].tolist() for column in ('tags_product', 'tags_topics'))
tags_product_unique, tags_topics_unique = unique_tags(product), unique_tags(topics)

## Selecting desired tags

There are a lot of tags, but there are only a few that I'm interested in, so I'll cherry-pick them:

In [261]:
# Product tags of interest.
product_selection = [
    'autocad',
    'bim 360',
    'civil 3d',
    'dynamo studio',
    'revit',
]

# Topic tags of interest.
topics_selection = [
    '3d printing',
    'ar/vr',
    'artificial intelligence',
    'automation, integration, & customization',
    'big data',
    'laser scanning',
    'machine learning',
    'robotics',
    'robots',
    'software development',
]

In [278]:
# One subset per topic of interest; rows without topic tags never match (na=False).
data_ml = df[df['tags_topics'].str.contains('machine learning', na=False)]
data_ai = df[df['tags_topics'].str.contains('artificial intelligence', na=False)]
data_bigdata = df[df['tags_topics'].str.contains('big data', na=False)]

city             120
description      120
key_learnings    114
tags_industry    119
tags_product     110
tags_topics      120
title            120
year             120
dtype: int64