# Clean AI Wiki Links

In [4]:
import pandas as pd
import numpy as np
import json
import glob
import time


# web scrapping
import requests as r
from bs4 import BeautifulSoup
import re
import bs4 as bs
import urllib

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

#spacy
import spacy
import nltk
from nltk.corpus import stopwords
#spacy.load("en_core_web_sm")

#visualization
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import warnings
#warnings.simplefilter('always')
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Read links data

In [53]:
links = pd.read_csv("/home/zz3hs/git/dspg21RnD/data/dspg21RnD/wiki_ai_links.csv") #import csv
links = links[links['title'].notnull()] #exclude rows that are na for title
links.title= links.title.str.lower() #lowercase title

links = links.rename(columns={'title': 'text'})
len(links)

1576

In [35]:
links.head()

Unnamed: 0,url,text
1,/wiki/AI_(disambiguation),ai (disambiguation)
2,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation)
3,/wiki/Outline_of_artificial_intelligence,artificial intelligence
4,/wiki/Artificial_intelligence#Goals,major goals
5,/wiki/Artificial_general_intelligence,artificial general intelligence


## De-duplicate Links

TODO: There are some same text but different links ones.

In [55]:
# no entry that have dupilicates on both url and text
links_dedup = links.drop_duplicates()
len(links_dedup)

1576

## Read named entity data

In [42]:
named_entity = pd.read_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/ai_wiki_text_entity.csv') 
named_entity = named_entity[["text", "type"]]
named_entity.text= named_entity.text.str.lower() #lowercase title
named_entity = named_entity.rename(columns={'type': 'ner'})

In [43]:
named_entity

Unnamed: 0,text,ner
0,tesler,PERSON
1,1955,DATE
2,alpha,PERSON
3,2015,DATE
4,agi,ORG
...,...,...
390,karel čapek's r.u.r.,WORK_OF_ART
391,a.i. artificial intelligence and ex machina,WORK_OF_ART
392,well as the novel do androids dream of electri...,WORK_OF_ART
393,philip k. dick,PERSON


## Join two dataframes

In [44]:
links_ne = pd.merge(links,named_entity,on='text',how='left')

In [45]:
links_ne

Unnamed: 0,url,text,ner
0,/wiki/AI_(disambiguation),ai (disambiguation),
1,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation),
2,/wiki/Outline_of_artificial_intelligence,artificial intelligence,
3,/wiki/Artificial_intelligence#Goals,major goals,
4,/wiki/Artificial_general_intelligence,artificial general intelligence,
...,...,...,...
1593,/wiki/Wikipedia:File_Upload_Wizard,upload file,
1594,/wiki/Special:WhatLinksHere/Artificial_intelli...,what links here,
1595,/wiki/Special:RecentChangesLinked/Artificial_i...,related changes,
1596,/wiki/Special:SpecialPages,special pages,


## Write Links data with NER

In [46]:
#links_ne.to_csv(r'/home/zz3hs/git/dspg21RnD/data/dspg21RnD/wiki_ai_links_ner.csv', index = False)   

## Examine NER text

In [40]:
links_matched = links_ne[links_ne['type'].notnull()] #exclude rows that are na for title

In [48]:
links_matched.text.to_list()

['ethics',
 'karel čapek',
 'mccullouch',
 'dartmouth college',
 'john mccarthy',
 'john mccarthy',
 'john mccarthy',
 'norbert wiener',
 'allen newell',
 'cmu',
 'herbert simon',
 'herbert simon',
 'herbert simon',
 'mit',
 'mit',
 'marvin minsky',
 'marvin minsky',
 'marvin minsky',
 'arthur samuel',
 'ibm',
 'ibm',
 'garry kasparov',
 'watson',
 'brad rutter',
 'ken jennings',
 'kinect',
 'xbox 360',
 'lee sedol',
 'future of go summit',
 'ke jie',
 'murray campbell',
 'alphazero',
 'muzero',
 'china',
 'china',
 'china',
 'denver',
 'san francisco',
 'svm',
 'cyc',
 'hans moravec',
 'hans moravec',
 'deepmind',
 'deepmind',
 'atari',
 'princeton university',
 'stanford',
 'stanford',
 'mit',
 'mit',
 'john haugeland',
 'gofai',
 'gofai',
 'gofai',
 'gofai',
 'stanford',
 'stanford',
 'seymour papert',
 'roger schank',
 'rodney brooks',
 'rodney brooks',
 'david rumelhart',
 'google search',
 'siri',
 'deepfakes',
 'ray kurzweil',
 'ray kurzweil',
 'joseph weizenbaum',
 'computer po

## Cleaned links

In [63]:
links_valid = links_ne[links_ne['ner'].isna()] 

In [64]:
links_valid

Unnamed: 0,url,text,ner
0,/wiki/AI_(disambiguation),ai (disambiguation),
1,/wiki/Artificial_intelligence_(disambiguation),artificial intelligence (disambiguation),
2,/wiki/Outline_of_artificial_intelligence,artificial intelligence,
3,/wiki/Artificial_intelligence#Goals,major goals,
4,/wiki/Artificial_general_intelligence,artificial general intelligence,
...,...,...,...
1593,/wiki/Wikipedia:File_Upload_Wizard,upload file,
1594,/wiki/Special:WhatLinksHere/Artificial_intelli...,what links here,
1595,/wiki/Special:RecentChangesLinked/Artificial_i...,related changes,
1596,/wiki/Special:SpecialPages,special pages,
