In [1]:
# Presumably the notebook author's name (assumption) - no visible cell reads NAME.
NAME = "Dustin Seltz"

Purpose:

Scrape kanji frequency data for several sources (Aozora, News, Twitter, and Wikipedia) from the data published at https://scriptin.github.io/kanji-frequency/

Input:

None. 


Output:

KanjiFrequencyOnWikipedia.csv

KanjiFrequencyOnNews.csv

KanjiFrequencyOnTwitter.csv

KanjiFrequencyOnAozora.csv


In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen

import re

import numpy as np
import pandas as pd

In [3]:
#A JSON parser would likely be the simpler route here,
#    but scraping the pages is good web-scraping practice.
#(This data was scraped before JSON had been covered.)
sourceNames = ["Aozora", "News", "Twitter", "Wikipedia"]
#Every source lives under the same folder, in a file named after the lower-cased source
baseUrl = "https://raw.githubusercontent.com/scriptin/kanji-frequency/master/data/"
urlsToScrape = [baseUrl + sourceName.lower() + ".json" for sourceName in sourceNames]

In [4]:
#Download every page, closing each connection as soon as its body has been read
htmls = []
for urlToScrape in urlsToScrape:
    with urlopen(urlToScrape) as response:
        htmls.append(response.read())
#Parse the downloaded bytes; lxml wraps the JSON text in <html><body><p>...</p></body></html>
soups = [BeautifulSoup(html, 'lxml') for html in htmls]

In [5]:
#Pick the Wikipedia soup by name rather than by position:
#    soups is built in the same order as sourceNames, so this keeps working if that list is reordered
wikiSoup = soups[sourceNames.index("Wikipedia")]
wikiSoup

<html><body><p>[["all",784564523,1],
["年",21066593,0.02685131991368414],
["日",14316719,0.01824798162585284],
["月",11002406,0.014023583373269607],
["大",7076058,0.00901908994424414],
["本",6655271,0.008482758020400574],
["学",6065948,0.0077316113871745895],
["人",5767744,0.007351522826886731],
["国",5088612,0.006485906322328061],
["中",4464258,0.005690109441769902],
["一",4305603,0.0054878889801648605],
["会",4165137,0.005308852080225911],
["出",4161682,0.005304448363388463],
["市",4008259,0.005108896569364761],
["者",3980752,0.005073836355458046],
["作",3943028,0.005025753630718273],
["名",3806693,0.004851982072097848],
["部",3787829,0.004827938160543107],
["用",3498792,0.004459533789039299],
["行",3482563,0.004438848428531352],
["地",3466852,0.004418823306900917],
["道",3302362,0.004209165598480675],
["場",3293712,0.004198140373981708],
["上",3282573,0.004183942688930378],
["合",3280116,0.004180811015335701],
["生",3196970,0.004074833753348289],
["田",3187908,0.00406328339676914],
["県",3152929,0.00401869942

In [6]:
#Testing regex on this stuff
str1 = """["年",21066593,0.02685131991368414],"""
#Raw string so the backslash escapes reach the regex engine untouched.
#The count is digits only and the frequency stops at the first "]" or ",",
#    so an entry followed by an extra bracket (a line ending in "]]") is still captured cleanly.
expr = r"""\["(.)",(\d+),([^\],]+)\]"""
match = re.match(expr, str1)
if match:
    print(match.group())
    print(match.group(1))
    print(match.group(2))
    print(match.group(3))

["年",21066593,0.02685131991368414]
年
21066593
0.02685131991368414


In [7]:
#Scrape from the soup
#Raw-string pattern: one quoted character, a digits-only count, then a frequency that
#    stops at the first "]" or "," so a line ending in "]]" doesn't leak a bracket into the value
expr = r"""\["(.)",(\d+),([^\],]+)\]"""
paragraph = wikiSoup.find_all("p")[0].text
paragraphLines = paragraph.splitlines()

In [8]:
#Run the pattern over every line; each result is a (possibly empty) list of captured tuples
linePattern = re.compile(expr)
matches = list(map(linePattern.findall, paragraphLines))
matches

[[],
 [('年', '21066593', '0.02685131991368414')],
 [('日', '14316719', '0.01824798162585284')],
 [('月', '11002406', '0.014023583373269607')],
 [('大', '7076058', '0.00901908994424414')],
 [('本', '6655271', '0.008482758020400574')],
 [('学', '6065948', '0.0077316113871745895')],
 [('人', '5767744', '0.007351522826886731')],
 [('国', '5088612', '0.006485906322328061')],
 [('中', '4464258', '0.005690109441769902')],
 [('一', '4305603', '0.0054878889801648605')],
 [('会', '4165137', '0.005308852080225911')],
 [('出', '4161682', '0.005304448363388463')],
 [('市', '4008259', '0.005108896569364761')],
 [('者', '3980752', '0.005073836355458046')],
 [('作', '3943028', '0.005025753630718273')],
 [('名', '3806693', '0.004851982072097848')],
 [('部', '3787829', '0.004827938160543107')],
 [('用', '3498792', '0.004459533789039299')],
 [('行', '3482563', '0.004438848428531352')],
 [('地', '3466852', '0.004418823306900917')],
 [('道', '3302362', '0.004209165598480675')],
 [('場', '3293712', '0.004198140373981708')],
 [(

In [9]:
#Flatten the per-line findall results into one list of [character, count, percentage] rows
matchesList = []
for lineMatches in matches:
    #Each line contributes a list of captured tuples (empty when the line did not match)
    for captured in lineMatches:
        if len(captured) != 3:
            continue
        character, numberOfAppearances, percentage = captured
        matchesList.append([character, numberOfAppearances, percentage])
matchesList

[['年', '21066593', '0.02685131991368414'],
 ['日', '14316719', '0.01824798162585284'],
 ['月', '11002406', '0.014023583373269607'],
 ['大', '7076058', '0.00901908994424414'],
 ['本', '6655271', '0.008482758020400574'],
 ['学', '6065948', '0.0077316113871745895'],
 ['人', '5767744', '0.007351522826886731'],
 ['国', '5088612', '0.006485906322328061'],
 ['中', '4464258', '0.005690109441769902'],
 ['一', '4305603', '0.0054878889801648605'],
 ['会', '4165137', '0.005308852080225911'],
 ['出', '4161682', '0.005304448363388463'],
 ['市', '4008259', '0.005108896569364761'],
 ['者', '3980752', '0.005073836355458046'],
 ['作', '3943028', '0.005025753630718273'],
 ['名', '3806693', '0.004851982072097848'],
 ['部', '3787829', '0.004827938160543107'],
 ['用', '3498792', '0.004459533789039299'],
 ['行', '3482563', '0.004438848428531352'],
 ['地', '3466852', '0.004418823306900917'],
 ['道', '3302362', '0.004209165598480675'],
 ['場', '3293712', '0.004198140373981708'],
 ['上', '3282573', '0.004183942688930378'],
 ['合', '3

In [10]:
#Build the dataframe: one row per scraped character, labelled with the column names below
colNames = ["Character", "Number of Appearances", "%"]
df = pd.DataFrame(data=matchesList, columns=colNames)
df

Unnamed: 0,Character,Number of Appearances,%
0,年,21066593,0.02685131991368414
1,日,14316719,0.01824798162585284
2,月,11002406,0.014023583373269607
3,大,7076058,0.00901908994424414
4,本,6655271,0.008482758020400574
5,学,6065948,0.0077316113871745895
6,人,5767744,0.007351522826886731
7,国,5088612,0.006485906322328061
8,中,4464258,0.005690109441769902
9,一,4305603,0.0054878889801648605


In [11]:
#Store the dataframe in a file.
#The ".csv" extension matches the output file names listed at the top of the notebook.
file_name = "KanjiFrequencyOnWikipedia.csv"
df.to_csv(file_name, index=False)

In [12]:
#Test that it worked: read the file back and compare its shape with what was written
writtenColumns = list(df.columns)
writtenRowCount = len(df)
df = pd.read_csv(file_name)
assert list(df.columns) == writtenColumns, "columns changed in the CSV round trip"
assert len(df) == writtenRowCount, "row count changed in the CSV round trip"
df

Unnamed: 0,Character,Number of Appearances,%
0,年,21066593,0.02685131991368414
1,日,14316719,0.01824798162585284
2,月,11002406,0.014023583373269607
3,大,7076058,0.00901908994424414
4,本,6655271,0.008482758020400574
5,学,6065948,0.0077316113871745895
6,人,5767744,0.007351522826886731
7,国,5088612,0.006485906322328061
8,中,4464258,0.005690109441769902
9,一,4305603,0.0054878889801648605


In [13]:
#Looks good, but I want to make all four output files at once. Let's do all that stuff but in a loop.

#Raw string so the regex escapes reach `re` untouched. The count is digits only and the
#    frequency stops at the first "]" or ",", so a line ending in "]]" doesn't leak a
#    bracket into the last captured value.
expr = r"""\["(.)",(\d+),([^\],]+)\]"""
colNames = ["Character", "Number of Appearances", "%"]


def _soupToFrequencyFrame(soup):
    """Build a dataframe of (character, count, frequency) rows from one scraped page.

    soup: parsed page whose first <p> element holds the frequency data, with one
        ["character",count,frequency] entry per line.
    Returns a DataFrame with the columns in colNames; every value is kept as the
        string captured by the regex. Entries whose label is not exactly one
        character (such as the "all" total) do not match and are skipped.
    """
    paragraphLines = soup.find_all("p")[0].text.splitlines()
    #findall returns a list of 3-tuples per line; flatten them into one list of rows
    rows = [list(tup) for line in paragraphLines for tup in re.findall(expr, line)]
    return pd.DataFrame(rows, columns=colNames)


for (soup, name) in zip(soups, sourceNames):
    df = _soupToFrequencyFrame(soup)
    #Store the dataframe in a file; the ".csv" extension matches the
    #    output file names listed at the top of the notebook
    file_name = "KanjiFrequencyOn" + name + ".csv"
    df.to_csv(file_name, index=False)