## Data Cleaning
Cleaning the raw data collected by the Wikipedia scraper.

In [1]:
import numpy as np

In [6]:
# Load the scraper output into `node_info`, keyed by integer node id.
# Despite the .csv extension every line is split on tabs: field 0 is the id
# and field 1 is a Python literal describing that node.
# NOTE(review): eval() executes whatever the file contains -- only run this
# on a file you produced yourself.
node_info = {}
with open("Data/metadata_final.csv") as metadata_file:
    for record_line in metadata_file:
        fields = record_line.split("\t")
        node_id = int(fields[0])
        node_info[node_id] = eval(fields[1])

In [10]:
# Eyeball the scraped titles, one per line, in file order.
for record in node_info.values():
    print(record["title"])

1c:enterprise programming language
a#
a-0 system
a+
a++
abap/4
abc
abc algol
acc (programming language)
accent (programming language)
distributed application specification language
action!
actionscript
actor (programming language)
ada
adenine (programming language)
agda (theorem prover)
agilent vee
agora (programming language)
aimms
aldor
alef
alf
algol 58
algol 60
algol 68
algol w
alice
alma-0
ambienttalk
amiga e
amos
ampl
angelscript
apex (programming language)
apl
app inventor for android (programming language)
applescript
apt (programming language)
arc
arexx
argus (programming language)
assembly language
autohotkey
autolisp
averest
awk
axum
active server pages (asp)
b
babbage
ballerina
bash
basic
bc
bcpl
beanshell
batch file
bertrand (programming language)
beta
bliss (programming language)
blockly
bloop and floop
boo
boomerang
bourne shell
c
c--
c++
c*
c#
c/al
caché objectscript
c shell
caml
cayenne (programming language)
cduce
cecil (programming language)
cesil
céu (programming la

In [17]:
# Build the directed edge set as (influencer, influenced) pairs.
# Both fields are used so that a relation recorded on only one of the two
# nodes is still captured; the set removes pairs recorded on both.
# NOTE(review): presumably both fields iterate over node ids -- confirm
# against the scraper output.
edges = set()

for node_id, record in node_info.items():
    edges.update((node_id, target) for target in record['influenced'])
    edges.update((source, node_id) for source in record['influenced by'])
    

In [20]:
# Persist the edge list: one "parent child" pair per line, space separated.
# Line order follows set iteration order and is therefore arbitrary.
with open("Data/edges.txt", "w") as edge_file:
    edge_file.writelines(f"{parent} {child}\n" for parent, child in edges)

## Node info file

In [37]:
import re

In [48]:
# Sanity-check the year-extraction regex on a sample value.
# The pattern is a raw string so that `\d` reaches the regex engine as
# written instead of being an invalid string escape.
# Because the leading `.*` is greedy, group 1 is the LAST run of four
# digits in the string.
pat = re.compile(r".*(\d{4}).*")
pat.match("December 1234").group(1)

'1234'

In [51]:
# Add a "clean year" entry to every node.
# String years are reduced to a four-digit run (the last one in the string,
# since the leading `.*` is greedy); strings without four consecutive digits
# are reported and kept unchanged, as are non-string values.
year_finder = re.compile(r".*(\d{4}).*")

for key in node_info:
    year = node_info[key]["year"]

    if isinstance(year, str):
        new_year = year_finder.match(year)

        if new_year is None:
            print(f"Year not found in pattern: {year}")
        else:
            # group(1) is already a str, so no conversion is needed.
            year = new_year.group(1)

    node_info[key]["clean year"] = year

In [57]:
# Write one tab-separated row per node:
#   id, title, clean year, paradigm(s), typing
# The last two columns are written with their Python repr and are parsed
# back with eval() by the loading cells.
with open("Data/node_info.tsv", "w") as f:
    for key, d in node_info.items():
        f.write(f"{key}\t{d['title']}\t{d['clean year']}\t{d['paradigm(s)']}\t{d['typing']}\n")

### Joining multiple entries for the same programming language

In [1]:
# Reload the node table into `nodes`, keyed by integer id.
# Columns: id, name, year, paradigm list, typing list.
# NOTE(review): eval() executes whatever is stored in the list columns --
# only run this on a file written by this notebook.
nodes = {}

with open("Data/node_info.tsv") as f:
    for row in f:
        fields = row.split('\t')
        lang_id = int(fields[0])
        nodes[lang_id] = {
            "name": fields[1],
            "year": int(fields[2]),
            "paradigm": eval(fields[3]),
            "typing": eval(fields[4]),
        }

In [20]:
# Collect every name that occurs on more than one node, then print the
# records sharing each such name, one blank-line-separated group per name.
repeats = set()
names = set()

for record in nodes.values():
    title = record["name"]
    # A name seen before is a repeat; otherwise remember it.
    (repeats if title in names else names).add(title)

for repeated in repeats:
    for record in nodes.values():
        if record["name"] == repeated:
            print(record)
    print()

{'name': 'interlisp', 'year': 1968, 'paradigm': [], 'typing': []}
{'name': 'interlisp', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'qbasic', 'year': 0, 'paradigm': [], 'typing': []}
{'name': 'qbasic', 'year': 1991, 'paradigm': ['Procedural'], 'typing': []}
{'name': 'qbasic', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'korn shell', 'year': 0, 'paradigm': [], 'typing': []}
{'name': 'korn shell', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'fortran', 'year': 1957, 'paradigm': ['multi-paradigm', 'structured', 'imperative', 'procedural', 'object-oriented', 'generic'], 'typing': ['strong', 'static', 'manifest']}
{'name': 'fortran', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'rebol', 'year': 0, 'paradigm': [], 'typing': []}
{'name': 'rebol', 'year': 1997, 'paradigm': ['language oriented programming', 'data exchange', 'functional', 'prototype-based', 'imperative'], 'typing': ['dynamic', 'strong']}

{'name': 'quickbasic', 'year': 0, 'paradigm': [], 'typing': []}
{

In [16]:
# Print the id of every node whose name is exactly 'swift'.
for key, record in nodes.items():
    if record["name"] == 'swift':
        print(key)

42946389
42946796


In [18]:
# Give one of the two 'swift' ids printed above a distinct name so the two
# entries no longer share a name.
nodes[42946796]["name"] = "swift (parallel scripting language)"

In [28]:
# Group the duplicated records by name: each repeated name maps to a list of
# copies of its node records, with the node id added under "id".
double_entries = {repeat: [] for repeat in repeats}

for key, lang in nodes.items():
    bucket = double_entries.get(lang["name"])
    if bucket is not None:
        bucket.append({**lang, "id": key})

In [48]:
def merge_dicts(ds):
    """Merge the duplicate records of one language into a single record.

    Args:
        ds: non-empty list of dicts, each with the keys 'name', 'year',
            'paradigm', 'typing' and 'id'. A year of 0 and an empty list
            are treated as "no information".

    Returns:
        A pair ``(acc, mapping)``.

        ``acc`` is the merged record: the name of the first entry, the last
        non-zero year seen, the paradigm and typing values of all entries
        (first-seen order, repeated values dropped) and, under 'id', the id
        of the entry that carried the most information (the earliest entry
        wins ties).

        ``mapping`` maps the id of every other entry to that surviving id.

    Raises:
        ValueError: if ``ds`` is empty.
    """
    if not ds:
        raise ValueError("merge_dicts needs at least one record to merge")

    acc = {'year': 0, 'paradigm': [], 'typing': []}
    acc["name"] = ds[0]['name']

    # One point per populated field; the highest score keeps its id.
    d_scores = []

    for d in ds:
        score = 0

        if d["year"] != 0:
            acc["year"] = d["year"]
            score += 1

        for field in ('paradigm', 'typing'):
            values = d[field]
            if len(values) != 0:
                score += 1
                # Several duplicates may list the same value; keep it once.
                for value in values:
                    if value not in acc[field]:
                        acc[field].append(value)

        d_scores.append(score)

    # list.index returns the first maximum, so ties go to the earliest entry.
    max_index = d_scores.index(max(d_scores))
    new_id = ds[max_index]['id']
    acc['id'] = new_id

    mapping = {d['id']: new_id for d in ds if d['id'] != new_id}

    return acc, mapping

In [52]:
# Try the merge on a single repeated name ('fortran') before applying it to all.
acc, mapping = merge_dicts(double_entries['fortran'])

In [53]:
# Display the merged record returned for 'fortran'.
acc

{'year': 1957,
 'paradigm': ['multi-paradigm',
  'structured',
  'imperative',
  'procedural',
  'object-oriented',
  'generic'],
 'typing': ['strong', 'static', 'manifest'],
 'name': 'fortran',
 'id': 11168}

In [54]:
# Display the old-id -> surviving-id mapping returned for 'fortran'.
mapping

{10942: 11168}

In [58]:
# Merge every group of duplicates: the surviving id gets the merged record,
# the other ids are dropped from `nodes`, and `mapping` records
# old id -> surviving id so that the edge list can be rewritten afterwards.
mapping = {}

for entries in double_entries.values():
    merged, replaced_ids = merge_dicts(entries)
    nodes[merged['id']] = merged

    for old_id, kept_id in replaced_ids.items():
        del nodes[old_id]
        mapping[old_id] = kept_id

Applying new mapping to the edge list

In [65]:
# Read the edge list back as a list of integer tuples, one per line.
with open("Data/edges.txt") as edge_file:
    processed_lines = [
        tuple(int(token) for token in row.strip().split(" "))
        for row in edge_file
    ]

In [69]:
# Rewrite every endpoint through `mapping`; ids without an entry stay as they are.
mapped = [tuple(mapping.get(node, node) for node in edge) for edge in processed_lines]

In [71]:
# Write the remapped edges, one "i j" pair per line.
# NOTE(review): this goes to "edges.txt" in the working directory, not to
# "Data/edges.txt"; the removal step further down reads its edges from this
# same "edges.txt", so the two paths have to be changed together.
with open("edges.txt", "w") as edge_file:
    edge_file.writelines(f"{i} {j}\n" for i, j in mapped)

Write new node info to file

In [74]:
# Spot-check a single record before writing the table back to disk.
nodes[60614939]

{'name': '1c:enterprise programming language',
 'year': 0,
 'paradigm': [],
 'typing': []}

In [75]:
# Overwrite the node table with the merged records.
# Columns: id, name, year, paradigm list, typing list (lists as Python repr).
with open("Data/node_info.tsv", "w") as f:
    for key, d in nodes.items():
        f.write(f"{key}\t{d['name']}\t{d['year']}\t{d['paradigm']}\t{d['typing']}\n")

### Getting rid of entries that aren't programming languages

In [1]:
# Reload the node table into `nodes`, keyed by integer id.
# Columns: id, name, year, paradigm list, typing list.
# NOTE(review): eval() executes whatever is stored in the list columns --
# only run this on a file written by this notebook.
nodes = {}

with open("Data/node_info.tsv") as f:
    for row in f:
        fields = row.split('\t')
        lang_id = int(fields[0])
        nodes[lang_id] = {
            "name": fields[1],
            "year": int(fields[2]),
            "paradigm": eval(fields[3]),
            "typing": eval(fields[4]),
        }

In [3]:
# First ten node ids; `a` is not referenced by any later cell shown in this notebook.
a = list(nodes.keys())[:10]

In [9]:
# Manual review: every node name is shown as an input prompt; answering "y"
# marks that node for removal, any other answer keeps it.
# NOTE(review): the result depends on interactive input, so this cell cannot
# be reproduced by simply re-running the notebook.
remove_ls = [key for key in nodes if input(nodes[key]["name"]) == "y"]

1c:enterprise programming language
a#
a-0 system
a+
a++
abap/4
abc
abc algol
acc (programming language)
accent (programming language)
distributed application specification language
action!
actionscript
actor (programming language)
ada
adenine (programming language)
agda (theorem prover)
agilent vee
agora (programming language)
aimms
aldor
alef
alf
algol 58
algol 60
algol 68
algol w
alice
alma-0
ambienttalk
amiga e
amos
ampl
angelscript
apex (programming language)
apl
app inventor for android (programming language)
applescript
apt (programming language)
arc
arexx
argus (programming language)
assembly language
autohotkey
autolisp
averest
awk
axum
active server pages (asp)
b
babbage
ballerina
bash
basic
bc
bcpl
beanshell
batch file
bertrand (programming language)
beta
bliss (programming language)
blockly
bloop and floop
boo
boomerang
bourne shell
c
c--
c++
c*
c#
c/al
caché objectscript
c shell
caml
cayenne (programming language)
cduce
cecil (programming language)
cesil
céu (programming la

stos basic
coopr
mathematical notationy
natural language programming
javascript (programming language)
autoit
xslt
category:c programming language familyy
c99y
dataparallel-c
ansi cy
*lisp (starlisp)
typescript
cω
high performance fortrany
cray mta
cray xmt
pl/1
openmp
hy
lfe
aimaco
fact computer language
yaml
eulisp
islisp
moose (perl)
skill
subl
lisp machine lisp
maclisp
interlisp
oberon-2
algol
inform
html(hypertext markup language)
minid
qore
foxpro
vp-info[1]
windows powershelly
common lisp object systemy
redux
vue.jsy
agda
alf (theorem prover)
akka
reia (programming language)
simple theory of types
croquet
hypercard
starlogo
agentsheets
concurrent euclid (programming language)
turing
fortran 95
dafny (programming language)
arith-matic
basic-plus
burroughs large systems
pact i
allegro common lisp
bluespec, inc.y
c++11y
concepts (c++)y
language integrated queryy
isabelle theorem prover
generics in javay
omega
purescript
raku
gofer (programming language)
id (programming language)
is

In [10]:
# Show the ids marked for removal during the manual review.
print(remove_ls)

[1320860, 92577, 840451, 390263, 34180789, 6761437, 41754003, 313216, 307436, 34739391, 208996, 7528520, 25432026, 277184, 945831, 607497, 3172, 1081482, 4895712, 191414, 50978621, 50462918, 5481447, 23708477, 13706337, 7955681, 381782, 10611640, 1637868, 28005, 14789, 23454753, 4086, 53891883, 29245633, 57131254, 10933, 733576, 22758, 292961, 4230, 1800920, 60675294, 20607025, 185449, 40659930, 53398, 48718969, 1774611]


In [11]:
# Additional entries to drop, selected by name rather than by prompt.
# Re-running this cell appends the same ids to `remove_ls` again.
remove = {"ch c/c++ interpreter", "templeos", "javafx", "pig (programming tool)", "algol-like"}

remove_ls.extend(key for key in nodes if nodes[key]['name'] in remove)

In [12]:
# Print the id of every node whose name is exactly "ethereum".
for key, record in nodes.items():
    if record['name'] == "ethereum":
        print(key)

41754003


In [13]:
# Show the removal list again, now including the ids added by name.
print(remove_ls)

[1320860, 92577, 840451, 390263, 34180789, 6761437, 41754003, 313216, 307436, 34739391, 208996, 7528520, 25432026, 277184, 945831, 607497, 3172, 1081482, 4895712, 191414, 50978621, 50462918, 5481447, 23708477, 13706337, 7955681, 381782, 10611640, 1637868, 28005, 14789, 23454753, 4086, 53891883, 29245633, 57131254, 10933, 733576, 22758, 292961, 4230, 1800920, 60675294, 20607025, 185449, 40659930, 53398, 48718969, 1774611, 31643142, 46478098, 11117691, 57360209, 15986053]


In [14]:
# Final removal list, hard-coded from the printed list above with 41754003
# (the id printed for "ethereum") taken out, so that node is kept.
remove_ls = [1320860, 92577, 840451, 390263, 34180789, 6761437, 313216, 307436, 34739391, 208996, 7528520, 25432026, 277184, 945831, 607497, 3172, 1081482, 4895712, 191414, 50978621, 50462918, 5481447, 23708477, 13706337, 7955681, 381782, 10611640, 1637868, 28005, 14789, 23454753, 4086, 53891883, 29245633, 57131254, 10933, 733576, 22758, 292961, 4230, 1800920, 60675294, 20607025, 185449, 40659930, 53398, 48718969, 1774611, 31643142, 46478098, 11117691, 57360209, 15986053]

In [15]:
# Drop every listed node; an id that is not (or no longer) in `nodes`
# raises KeyError, so this cell cannot be run twice on the same `nodes`.
for lid in remove_ls:
    del nodes[lid]

In [16]:
# Overwrite the node table with the remaining records.
# Columns: id, name, year, paradigm list, typing list (lists as Python repr).
with open("Data/node_info.tsv", "w") as f:
    for key, d in nodes.items():
        f.write(f"{key}\t{d['name']}\t{d['year']}\t{d['paradigm']}\t{d['typing']}\n")

In [4]:
# Read the edge list as a list of integer tuples, one per line.
# NOTE(review): the source is "edges.txt" in the working directory (the file
# written after the duplicate merge), not "Data/edges.txt".
with open("edges.txt") as edge_file:
    processed_lines = [
        tuple(int(token) for token in row.strip().split(" "))
        for row in edge_file
    ]

In [5]:
# Same ids as the final `remove_ls`, repeated here as a set because this
# cell ran in a fresh kernel (see its execution count).
remove_set = {1320860, 92577, 840451, 390263, 34180789, 6761437, 313216, 307436, 34739391, 208996, 7528520, 25432026, 277184, 945831, 607497, 3172, 1081482, 4895712, 191414, 50978621, 50462918, 5481447, 23708477, 13706337, 7955681, 381782, 10611640, 1637868, 28005, 14789, 23454753, 4086, 53891883, 29245633, 57131254, 10933, 733576, 22758, 292961, 4230, 1800920, 60675294, 20607025, 185449, 40659930, 53398, 48718969, 1774611, 31643142, 46478098, 11117691, 57360209, 15986053}

# Keep only the edges whose two endpoints both survive the removal.
with open("Data/edges.txt", "w") as edge_file:
    for i, j in processed_lines:
        if i in remove_set or j in remove_set:
            continue
        edge_file.write(f"{i} {j}\n")

### There were some entries with `|name| programming language`

In [1]:
# Reload the node table into `nodes`, keyed by integer id.
# Columns: id, name, year, paradigm list, typing list.
# NOTE(review): eval() executes whatever is stored in the list columns --
# only run this on a file written by this notebook.
nodes = {}

with open("Data/node_info.tsv") as f:
    for row in f:
        fields = row.split('\t')
        lang_id = int(fields[0])
        nodes[lang_id] = {
            "name": fields[1],
            "year": int(fields[2]),
            "paradigm": eval(fields[3]),
            "typing": eval(fields[4]),
        }

In [2]:
# Ids of all nodes whose name contains the phrase "programming language".
suspect = [key for key in nodes if "programming language" in nodes[key]['name']]

In [3]:
# List the suspect names, one per line, to see which forms the suffix takes.
for name in (nodes[lid]['name'] for lid in suspect):
    print(name)

1c:enterprise programming language
acc (programming language)
accent (programming language)
actor (programming language)
adenine (programming language)
agora (programming language)
apex (programming language)
app inventor for android (programming language)
apt (programming language)
argus (programming language)
bertrand (programming language)
bliss (programming language)
cayenne (programming language)
cecil (programming language)
céu (programming language)
cg (programming language)
clips (programming language)
combined programming language
cybil (programming language)
d programming language
dog (programming language)
dynamo (programming language)
egl (programming language)
easy programming language
eltron programming language
euslisp robot programming language
ffp (programming language)
flavors (programming language)
foil (programming language)
formac (programming language)
george (programming language)
grass (programming language)
groovy (programming language)
hermes (programming lang

In [4]:
import re

In [5]:
# Sanity-check a pattern that strips a trailing "programming language",
# with or without parentheses, and captures the bare name in group 1.
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# as written instead of being invalid string escapes.
pat = re.compile(r"(.+) \(?programming language\)?")
pat.match("acc (programming language)").group(1)

'acc'

In [6]:
# Snapshot of all node names as they are when this cell runs.
lang_names = [record['name'] for record in nodes.values()]

In [7]:
# Display the ids collected in `suspect`.
suspect

[60614939,
 928669,
 85065,
 26849115,
 11517646,
 933477,
 49183653,
 43500036,
 3673047,
 27580389,
 21119400,
 24740516,
 6528823,
 527946,
 52350190,
 390212,
 7755022,
 31613960,
 32669131,
 243881,
 37830261,
 26064582,
 1205107,
 42016798,
 8664562,
 7416368,
 934157,
 1332640,
 602746,
 55755598,
 36133392,
 144766,
 57417762,
 24136948,
 39338454,
 933188,
 1151504,
 40650677,
 5887624,
 4400159,
 52836480,
 1982671,
 493076,
 12073324,
 908572,
 11730351,
 2211835,
 18530,
 51974468,
 2988758,
 2232731,
 3359079,
 51603617,
 948998,
 6378343,
 928466,
 59627681,
 294856,
 37469066,
 16840885,
 3304684,
 1936835,
 832032,
 31005079,
 2603123,
 350323,
 25663569,
 5005125,
 931356,
 356693,
 8787221,
 481275,
 12640293,
 32475185,
 4449554,
 30772911,
 485875,
 485828,
 7837169,
 2199610,
 1719114,
 3092830,
 31410310,
 43528524,
 39153713,
 26657733,
 31495398,
 928636,
 25417347,
 12572874,
 20217814,
 23153417,
 57798059,
 9019708,
 15127771,
 7239022,
 6135484,
 42302403,
 

In [8]:
# Shorten names of the form "<name> (programming language)" to "<name>",
# printing every id together with its new name.
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# as written instead of being invalid string escapes.
pat = re.compile(r"(.+) \(programming language\)")

for lid in suspect:
    re_pat = pat.match(nodes[lid]['name'])
    if re_pat:
        short_name = re_pat.group(1)
        print(lid, short_name)
        nodes[lid]['name'] = short_name

928669 acc
85065 accent
26849115 actor
11517646 adenine
933477 agora
49183653 apex
43500036 app inventor for android
3673047 apt
27580389 argus
21119400 bertrand
24740516 bliss
6528823 cayenne
527946 cecil
52350190 céu
390212 cg
7755022 clips
32669131 cybil
37830261 dog
26064582 dynamo
1205107 egl
934157 ffp
1332640 flavors
602746 foil
55755598 formac
36133392 george
144766 grass
57417762 groovy
24136948 hermes
39338454 hopscotch
933188 hope
1151504 hugo
40650677 inform
5887624 jade
4400159 kaleidoscope
52836480 kojo
1982671 lava
493076 lingo
12073324 lis
908572 lithe
11730351 lse
2211835 lustre
18530 lynx
51974468 m sharp
2988758 magik
2232731 microscript
3359079 miis
51603617 milk
6378343 mouse
928466 mpd
59627681 neko
294856 nice
16840885 oak
3304684 onyx
1936835 opal
2603123 pearl
25663569 powerhouse
931356 roop
356693 s-lang
8787221 sa-c
481275 sail
12640293 sawzall
32475185 signal
4449554 slip
485875 sr
7837169 strand
2199610 subtext
1719114 tacpol
3092830 unity
31410310 watfiv
4

In [9]:
# Handle the un-parenthesised form "<name> programming language".
# The name is shortened only when "<name>" already appears in `lang_names`,
# i.e. when another node carries the short name.
pat = re.compile("(.+) programming language")

for lid in suspect:
    candidate = pat.match(nodes[lid]['name'])
    if not candidate:
        continue

    short_name = candidate.group(1)
    if short_name not in lang_names:
        continue

    print(lid, short_name)
    nodes[lid]['name'] = short_name

25417347 ada
7239022 unicon
7238799 miranda
9123867 sasl
261121 algol
7238785 lisp
7071680 io
7238578 dylan
7238495 apl
7190667 scala
7389219 ml
7238710 icon
7239012 t


Repeat the duplicate-merging step from before, since shortening the names has produced new entries that share a name.

In [10]:
# Collect every name that occurs on more than one node, then print the
# records sharing each such name, one blank-line-separated group per name.
repeats = set()
names = set()

for record in nodes.values():
    title = record["name"]
    # A name seen before is a repeat; otherwise remember it.
    (repeats if title in names else names).add(title)

for repeated in repeats:
    for record in nodes.values():
        if record["name"] == repeated:
            print(record)
    print()

{'name': 'ada', 'year': 1980, 'paradigm': ['Multi-paradigm'], 'typing': ['static', 'strong', 'safe', 'nominative']}
{'name': 'ada', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'apl', 'year': 1966, 'paradigm': ['Array', 'functional', 'structured', 'modular'], 'typing': ['dynamic']}
{'name': 'apl', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'oberon-2', 'year': 1991, 'paradigm': ['imperative', 'structured', 'modular', 'object-oriented'], 'typing': ['strong', 'static']}
{'name': 'oberon-2', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'io', 'year': 2002, 'paradigm': ['object-oriented', 'prototype-based'], 'typing': ['dynamic', 'strong']}
{'name': 'io', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'icon', 'year': 1977, 'paradigm': ['multi-paradigm', 'structured'], 'typing': ['dynamic']}
{'name': 'icon', 'year': 0, 'paradigm': [], 'typing': []}

{'name': 'groovy', 'year': 0, 'paradigm': [], 'typing': []}
{'name': 'groovy', 'year': 2003, 'paradigm': ['Object-oriente

In [11]:
# Group the duplicated records by name: each repeated name maps to a list of
# copies of its node records, with the node id added under "id".
double_entries = {repeat: [] for repeat in repeats}

for key, lang in nodes.items():
    bucket = double_entries.get(lang["name"])
    if bucket is not None:
        bucket.append({**lang, "id": key})

In [12]:
def merge_dicts(ds):
    """Merge the duplicate records of one language into a single record.

    Args:
        ds: non-empty list of dicts, each with the keys 'name', 'year',
            'paradigm', 'typing' and 'id'. A year of 0 and an empty list
            are treated as "no information".

    Returns:
        A pair ``(acc, mapping)``.

        ``acc`` is the merged record: the name of the first entry, the last
        non-zero year seen, the paradigm and typing values of all entries
        (first-seen order, repeated values dropped) and, under 'id', the id
        of the entry that carried the most information (the earliest entry
        wins ties).

        ``mapping`` maps the id of every other entry to that surviving id.

    Raises:
        ValueError: if ``ds`` is empty.
    """
    if not ds:
        raise ValueError("merge_dicts needs at least one record to merge")

    acc = {'year': 0, 'paradigm': [], 'typing': []}
    acc["name"] = ds[0]['name']

    # One point per populated field; the highest score keeps its id.
    d_scores = []

    for d in ds:
        score = 0

        if d["year"] != 0:
            acc["year"] = d["year"]
            score += 1

        for field in ('paradigm', 'typing'):
            values = d[field]
            if len(values) != 0:
                score += 1
                # Several duplicates may list the same value; keep it once.
                for value in values:
                    if value not in acc[field]:
                        acc[field].append(value)

        d_scores.append(score)

    # list.index returns the first maximum, so ties go to the earliest entry.
    max_index = d_scores.index(max(d_scores))
    new_id = ds[max_index]['id']
    acc['id'] = new_id

    mapping = {d['id']: new_id for d in ds if d['id'] != new_id}

    return acc, mapping

In [13]:
# Merge every group of duplicates: the surviving id gets the merged record,
# the other ids are dropped from `nodes`, and `mapping` records
# old id -> surviving id so that the edge list can be rewritten afterwards.
mapping = {}

for entries in double_entries.values():
    merged, replaced_ids = merge_dicts(entries)
    nodes[merged['id']] = merged

    for old_id, kept_id in replaced_ids.items():
        del nodes[old_id]
        mapping[old_id] = kept_id

In [14]:
# Read the edge list back as a list of integer tuples, one per line.
with open("Data/edges.txt") as edge_file:
    processed_lines = [
        tuple(int(token) for token in row.strip().split(" "))
        for row in edge_file
    ]

In [15]:
# Rewrite every endpoint through `mapping` (ids without an entry stay as
# they are) and overwrite the edge file with the result.
mapped = [tuple(mapping.get(node, node) for node in edge) for edge in processed_lines]

with open("Data/edges.txt", "w") as edge_file:
    edge_file.writelines(f"{i} {j}\n" for i, j in mapped)

In [16]:
# Overwrite the node table with the merged records.
# Columns: id, name, year, paradigm list, typing list (lists as Python repr).
with open("Data/node_info.tsv", "w") as f:
    for key, d in nodes.items():
        f.write(f"{key}\t{d['name']}\t{d['year']}\t{d['paradigm']}\t{d['typing']}\n")

In [None]:
# Reverse the direction of every edge because this is a citation network.
# The file is rewritten in place, so running this cell a second time
# restores the original direction.
with open("Data/edges.txt") as edge_file:
    processed_lines = [
        tuple(int(token) for token in row.strip().split(" "))
        for row in edge_file
    ]

with open("Data/edges.txt", "w") as edge_file:
    edge_file.writelines(f"{j} {i}\n" for i, j in processed_lines)