In [66]:
import os
directory = os.path.dirname(os.getcwd())
import sys
sys.path.append(directory)
import lxml
import datetime
import hashlib
import pandas as pd
import statistics
import numpy as np
from typing import Dict, List
from collections import defaultdict
from ipywidgets import interact

import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.util import ngrams

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
import string

import matplotlib.pyplot as plt
%matplotlib inline

import src.data as data

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Variables

In [2]:
# Both paths hang off `directory`, which the import cell sets to the parent of the
# current working directory.
# DIRECTORY_DATA is the 'data' folder; DIRECTORY_OUTPUT is its '01' subfolder.
DIRECTORY_DATA = os.path.join(directory, 'data')
DIRECTORY_OUTPUT = os.path.join(DIRECTORY_DATA,'01')

# Parsing Dataset

In [70]:
# Load the DBLP XML dump ('dblp.xml' inside DIRECTORY_DATA) through the project's
# dataset wrapper.
# NOTE(review): DatasetDBLP is defined in src.data, which is not visible here.
# from_file presumably prepares `dblp.data`, which the parsing cell iterates as
# (event, element) pairs -- confirm against src/data.
dblp = data.DatasetDBLP(directory_output=DIRECTORY_OUTPUT)
filepath = os.path.join(DIRECTORY_DATA, 'dblp.xml')
dblp.from_file(filepath)

In [60]:
# Nested mapping filled by the parsing loop:
#   conference name -> year (string, as read from the XML) -> list of paper titles.
# Both levels are defaultdicts, so indexing with a missing key creates an empty entry.
inproceedings = defaultdict(lambda: defaultdict(list))

In [61]:
# Group the titles of all <inproceedings> records by conference and year.
all_elements = ['inproceedings']
for event, element in dblp.data:
    if element.tag in all_elements:
        # The value of the record's second attribute is split on '/', and its
        # second segment is used as the conference name.
        # NOTE(review): this relies on attribute order and on the value containing
        # a '/' -- confirm against the XML.
        conference_name = element.items()[1][1].split('/')[1]

        # Reset per record: without this, a record lacking <title> or <year> would
        # silently reuse the values left over from the previous record (or raise
        # NameError if it happens to be the first record).
        title = None
        year = None
        for sub in element:
            if sub.tag == 'title':
                title = sub.text
            elif sub.tag == 'year':
                year = sub.text

        # A record without a usable year cannot be assigned to a year bucket.
        if not year:
            continue

        # `title` may legitimately be None (element text can be empty); the
        # plotting cells already skip None titles.
        inproceedings[conference_name][year].append(title)

# Inproceedings Count By Year

In [62]:
def plot_inproceedings_count_by_year(conference_name: str):
    """Draw a bar chart of the number of papers per year for one conference.

    Args:
        conference_name: Key into the module-level ``inproceedings`` mapping
            (conference name -> year -> list of titles).

    Returns:
        None. The chart is rendered as a side effect.
    """
    # .get() avoids inserting an empty entry into the defaultdict when the
    # conference is unknown.
    titles_by_year = inproceedings.get(conference_name, {})
    rows = [[year, len(titles)] for year, titles in titles_by_year.items()]
    if not rows:
        # DataFrame.plot.bar fails on an empty frame; report instead of raising.
        print(f'No inproceedings found for conference {conference_name!r}')
        return

    # Years are strings; lexicographic order matches chronological order for
    # four-digit years.
    df = pd.DataFrame(rows, columns=['year', 'count']).sort_values('year')
    ax = df.plot.bar('year', 'count')
    ax.set_ylabel('count')
    ax.set_title(f'Inproceedings per year: {conference_name}')

# Dropdown of all non-empty conference names, in alphabetical order.
interact(
    plot_inproceedings_count_by_year,
    conference_name=sorted(filter(None, inproceedings.keys())),
)

interactive(children=(Dropdown(description='conference_name', options=('3dgis', '3dic', '3dica', '3dim', '3dor…

<function __main__.plot_inproceedings_count_by_year(conference_name: str)>

# Title Length by Year

In [65]:
def plot_title_length_by_year(conference_name: str):
    """Box-plot the distribution of title lengths (in words) per year.

    For every year of the given conference, the title lengths are summarised as
    min / 25th percentile / median / 75th percentile / max and drawn as one box.

    Args:
        conference_name: Key into the module-level ``inproceedings`` mapping
            (conference name -> year -> list of titles).

    Returns:
        None. The chart is rendered as a side effect.
    """
    box_stats = []
    # .get() avoids inserting an empty entry into the defaultdict when the
    # conference is unknown.
    for year, titles in inproceedings.get(conference_name, {}).items():
        # Length = number of space-separated chunks; None titles are ignored.
        lengths = [len(title.split(' ')) for title in titles if title is not None]

        # Skip years with a falsy key (they cannot be sorted alongside string
        # labels) and years where every title is None (min()/max() would raise
        # ValueError on an empty list).
        if not year or not lengths:
            continue

        box_stats.append({
            'label': year,
            'whislo': min(lengths),
            'whishi': max(lengths),
            'med': statistics.median(lengths),
            'q1': np.percentile(lengths, 25),
            'q3': np.percentile(lengths, 75),
        })

    if not box_stats:
        print(f'No titles to plot for conference {conference_name!r}')
        return

    # One box per year, left to right in ascending year order.
    box_stats.sort(key=lambda stats: stats['label'])
    labels = [stats['label'] for stats in box_stats]

    fig, ax = plt.subplots(1, 1)
    ax.bxp(box_stats, showfliers=False)
    # Boxes are positioned at 1..N, so the tick positions start at 1.
    ax.set_xticks(ticks=range(1, len(labels) + 1), labels=labels, rotation=90)
    ax.set_xlabel('year')
    ax.set_ylabel('title length (words)')
    ax.set_title(f'Title length by year: {conference_name}')

# Dropdown of all non-empty conference names, in alphabetical order.
interact(
    plot_title_length_by_year,
    conference_name=sorted(filter(None, inproceedings.keys())),
)

interactive(children=(Dropdown(description='conference_name', options=('3dgis', '3dic', '3dica', '3dim', '3dor…

<function __main__.plot_title_length_by_year(conference_name: str)>

# Top 10s

In [69]:
def generate_top_10_unigrams(conference_name: str):
    """Show a year selector that lists the ten most frequent title tokens.

    Args:
        conference_name: Key into the module-level ``inproceedings`` mapping
            (conference name -> year -> list of titles).

    Returns:
        None. A nested ``interact`` widget over the conference's years is
        displayed as a side effect.
    """
    years = sorted(int(y) for y in inproceedings.get(conference_name, {}) if y)
    # Built once per conference instead of on every year change.
    sw = set(stopwords.words("english") + list(string.punctuation))

    def get_titles(year: int):
        """Return ``{year: [[token, count], ...]}`` with at most ten entries.

        An empty dict is returned when the year has no countable tokens.
        """
        counts = defaultdict(int)
        # .get() avoids inserting empty entries into the defaultdicts on lookup.
        titles = inproceedings.get(conference_name, {}).get(str(year), [])
        for t in titles:
            if not t:
                continue
            for tok in word_tokenize(t):
                # The stopword list is lowercase while titles are usually
                # capitalised, so compare case-insensitively; otherwise words
                # such as "The" or "For" slip through the filter.
                if tok.lower() not in sw:
                    counts[tok] += 1

        if not counts:
            return {}

        # Stable sort: tokens with equal counts keep first-seen order.
        ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
        return {year: [[tok, n] for tok, n in ranked[:10]]}

    interact(get_titles, year=years)
    
# Dropdown of all non-empty conference names, in alphabetical order.
interact(
    generate_top_10_unigrams,
    conference_name=sorted(filter(None, inproceedings.keys())),
)

interactive(children=(Dropdown(description='conference_name', options=('3dgis', '3dic', '3dica', '3dim', '3dor…

<function __main__.generate_top_10_unigrams(conference_name: str)>

In [47]:
def get_years(conference_name: str):
    """Display a year selector that returns the raw title list for that year.

    Args:
        conference_name: Key into the module-level ``inproceedings`` mapping.
    """
    def get_titles(year: int):
        """Return the titles stored for ``year`` (looked up by its string form)."""
        titles_by_year = inproceedings[conference_name]
        return titles_by_year[str(year)]

    # Year keys are strings; falsy keys are dropped before converting to int.
    year_options = [int(key) for key in inproceedings[conference_name].keys() if key]
    year_options.sort()
    interact(get_titles, year=year_options)
    



In [48]:
# Dropdown of all non-empty conference names, in alphabetical order.
interact(
    get_years,
    conference_name=sorted(filter(None, inproceedings.keys())),
)

interactive(children=(Dropdown(description='conference_name', options=('1970', '2006', '3dgis', '3dic', '3dica…

<function __main__.get_years(conference_name: List)>