<a href="https://colab.research.google.com/github/DestructionCatalyst/TouhouDialogueGenerator/blob/main/THWiki_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from bs4 import BeautifulSoup
import requests
import re

# Sample parsing

In [9]:
# Download the sample page (Reimu's script from EoSD) as HTML text
sample_url = "https://raw.githubusercontent.com/DestructionCatalyst/TouhouWikiPages/main/TH60_1.htm"
html_code = requests.get(sample_url).text

In [47]:
# Parse the downloaded HTML into a navigable tree.
# NOTE(review): no parser is named here, so BeautifulSoup chooses one from
# whatever is installed - confirm which parser the results were produced with.
soup = BeautifulSoup(html_code)

In [4]:
# Drop every <sup class="reference"> element together with its contents
for footnote in soup.find_all('sup', attrs={'class': 'reference'}):
    footnote.decompose()

In [5]:
# Names of the tags that are removed while their contents are kept
tags_to_replace_with_children = ['a', 'p', 'b', 'i'] # All links, paragraphs and font types

In [6]:
# Unwrap each listed tag: the element itself goes away, its children stay in place
for tag_name in tags_to_replace_with_children:
  for element in soup.find_all(tag_name):
    element.unwrap()

In [7]:
# Turn every <br> into a single space so that the pieces around it are merged
for line_break in soup.find_all('br'):
    line_break.replace_with(' ')

In [8]:
# Collect every <table class="wikitable"> element of the page
tables = soup.find_all('table', {'class': 'wikitable'})

In [9]:
# One list of <tr> elements per table
rows = [table.find_all('tr') for table in tables]

In [10]:
# Show the first row of the first table
print(rows[0][0])

<tr>
<th>
</th>
<th lang="ja" style="width:45%">
夢幻夜行絵巻　～ <span lang="en">Mystic Flier</span>

</th>
<th style="width:55%">
Fantastic Night Parade Scroll ~ Mystic Flier

</th></tr>


In [11]:
# Show the fourth row of the first table
print(rows[0][3])

<tr>
<th style="word-wrap: nowrap">
Reimu

</th>
<td lang="ja" width="45%">
気持ちいいわね
毎回、昼間に出発して悪霊が少ない から、夜に出てみたんだけど．．．
どこに行っていいかわからないわ 暗くて
でも．．．
夜の境内裏はロマンティックね （←のんき）

</td>
<td width="55%">
It sure feels great out.
There aren't many evil spirits about during the day, so I'm trying my luck at night...
But it's dark out, and I'm not sure where to go.
Still...
It's so romantic out behind the shrine at night. (← carefree)

</td></tr>


In [None]:
# Open the output file for writing.
# The encoding is set explicitly because the dialogue contains non-ASCII
# characters (e.g. "←") that a platform-default encoding may not be able to write.
out_file = open('output.txt', mode='w', encoding='utf-8')

In [None]:
# Write every dialogue row of every table: the speaker's name and a colon on
# one line, the English text below it, then a blank line
for stage, stage_rows in enumerate(rows):
  for tr in stage_rows:
    speaker_cell = tr.find('th', {'style': 'word-wrap: nowrap'})
    japanese_cell = tr.find('td', {'lang': 'ja', 'width': '45%'})
    english_cell = tr.find('td', {'width': '55%'})

    # Rows that lack any of the three cells (e.g. the title row) are skipped
    if not (speaker_cell and japanese_cell and english_cell):
      continue

    speaker = speaker_cell.text.strip()
    english_text = english_cell.text.strip()
    out_file.write(f"{speaker}:\n{english_text}\n\n")

In [None]:
# Close the output file so that everything written reaches the disk
out_file.close()

In [None]:
!cat output.txt | head

Reimu:
It's been a while since my last job.

Reimu:
It sure feels great out.
There aren't many evil spirits about during the day, so I'm trying my luck at night...
But it's dark out, and I'm not sure where to go.
Still...
It's so romantic out behind the shrine at night. (← carefree)



# The same process, wrapped in functions:

In [5]:
def extract_dialogue_from_row_classic(row): # for normal page format
  """
  Pull the speaker name and the English line out of one table row.

  row is an object with a find(name, attrs) method (a BeautifulSoup <tr> tag).

  A row counts as dialogue only when it holds all three of:
  a <th style="word-wrap: nowrap"> (speaker), a <td lang="ja" width="45%">
  and a <td width="55%"> (the text that is returned).

  Returns a (char_name, text) tuple, or None when any of the cells is missing.
  """
  speaker_cell = row.find('th', {'style': 'word-wrap: nowrap'})
  japanese_cell = row.find('td', {'lang': 'ja', 'width': '45%'})
  english_cell = row.find('td', {'width': '55%'})

  # Not a dialogue row: one of the required cells is absent
  if not (speaker_cell and japanese_cell and english_cell):
    return None

  speaker = speaker_cell.text.strip()
  # Collapse every run of spaces into a single space
  spoken = re.sub(r' +', ' ', english_cell.text.strip())

  return speaker, spoken

In [14]:
def extract_dialogue_from_row_different(row): # for deprecated page format
  """
  Pull the speaker name and the dialogue line out of one table row.

  row is an object with a find_all(name) method (a BeautifulSoup <tr> tag).

  A row counts as dialogue only when it has exactly one <th> cell and exactly
  two <td> cells. The <th> text is the character name; the text of the second
  <td> is the returned line (presumably the English translation).

  Returns a (char_name, text) tuple, or None for any other row layout.
  """
  headers = row.find_all('th')
  cells = row.find_all('td')

  # Any other combination of cells is not a dialogue row
  if len(headers) != 1 or len(cells) != 2:
    return None

  speaker = headers[0].text.strip()
  # Collapse every run of spaces into a single space
  spoken = re.sub(r' +', ' ', cells[1].text.strip())

  return speaker, spoken

In [7]:
def parse_pages(input_page, output_file,
                tags_to_decompose=(('sup', {'class': 'reference'}),),
                tags_to_replace_with_children=(('a',), ('p',), ('b',), ('i',), ('span', )),
                tags_to_replace={('br',): ' '},
                ):
  """
  Extract the dialogue from one Touhou wiki page and write it to a text file.

  Every dialogue row found is written as the character name followed by a
  colon on one line, the dialogue text below it, and then a blank line.

  input_page is a string containing the HTML code of the page to parse.

  output_file is a file stream the dialogue is written to. It must be opened
  for writing before the call and closed by the caller afterwards.

  tags_to_decompose is a tuple of argument tuples, each in the format accepted
  by BeautifulSoup.find_all(). Matching tags are decomposed, i.e. deleted
  together with all their children.

  tags_to_replace_with_children is a tuple of argument tuples, each in the
  format accepted by BeautifulSoup.find_all(). Matching tags are replaced with
  their children.

  tags_to_replace is a dictionary whose keys are argument tuples in the format
  accepted by BeautifulSoup.find_all(). Matching tags are replaced with the
  corresponding dictionary value. The default dictionary is only read, never
  modified.
  """

  # Read data.
  # NOTE(review): no parser is named, so BeautifulSoup picks whichever one is
  # installed; left unchanged so that the parsed tree stays the same.
  soup = BeautifulSoup(input_page)

  # Preprocess: each entry is unpacked into the arguments of find_all()
  for tag in tags_to_decompose:
    for match in soup.find_all(*tag):
      match.decompose()

  for tag in tags_to_replace_with_children:
    for match in soup.find_all(*tag):
      match.unwrap()

  for tag, replacement in tags_to_replace.items():
    for match in soup.find_all(*tag):
      match.replace_with(replacement)

  # Extract tables
  tables = soup.find_all('table', {'class': 'wikitable'})

  if tables: # Page is in the classic format
    extract_dialogue = extract_dialogue_from_row_classic
  else:
    # Fall back to the tables of the deprecated format
    tables = soup.find_all('table', {'style': 'border-collapse: collapse'})
    extract_dialogue = extract_dialogue_from_row_different

  for table in tables:
    for row in table.find_all('tr'):
      dialogue = extract_dialogue(row)

      # None means the row held no dialogue
      if dialogue is None:
        continue

      char_name, text = dialogue
      if char_name != '': # Rows without a speaker name are skipped
        # Write to the stream passed in by the caller (not to a global)
        output_file.write(f"{char_name}:\n{text}\n\n")


In [16]:
# Open the output file for writing.
# The encoding is set explicitly because the dialogue contains non-ASCII
# characters (e.g. "←") that a platform-default encoding may not be able to write.
out_file = open('output1.txt', mode='w', encoding='utf-8')

In [17]:
# Classic page sample: Reimu's script from EoSD
classic_sample_url = "https://raw.githubusercontent.com/DestructionCatalyst/TouhouWikiPages/main/TH60_1.htm"
html_code = requests.get(classic_sample_url).text

In [18]:
# Parse the first sample page and write its dialogue to the open output file
parse_pages(html_code, out_file)

In [19]:
# Second sample page (TH120_6)
second_sample_url = "https://raw.githubusercontent.com/DestructionCatalyst/TouhouWikiPages/main/TH120_6.htm"
html_code_2 = requests.get(second_sample_url).text

In [20]:
# Parse the second sample page; its dialogue is written after the first page's
parse_pages(html_code_2, out_file)

In [21]:
# Close the output file so that everything written reaches the disk
out_file.close()

In [22]:
!cat output1.txt | head

Reimu:
It's been a while since my last job.

Reimu:
It sure feels great out.
There aren't many evil spirits about during the day, so I'm trying my luck at night...
But it's dark out, and I'm not sure where to go.
Still...
It's so romantic out behind the shrine at night. (← carefree)



In [23]:
!cat output1.txt | tail

However, I don't distinguish between youkai and gods.

Sanae:
The wholesome gods hunt the wicked youkai!

Byakuren:
Humans haven't changed since my days in the temple.
How self-righteous; you're an evil drag on us all! 
Now, namusan -- !



In [24]:
# Main games
games_numbers = list(range(60, 190, 10))

#Fightings and spinoffs
games_numbers.append(75)
games_numbers.append(105)
games_numbers.append(123)
games_numbers.append(125)
games_numbers.append(128)
games_numbers.append(135)
games_numbers.append(143)
games_numbers.append(145)
games_numbers.append(155)
games_numbers.append(165)
games_numbers.append(175)

games_numbers.sort()

print(games_numbers)

[60, 70, 75, 80, 90, 100, 105, 110, 120, 123, 125, 128, 130, 135, 140, 143, 145, 150, 155, 160, 165, 170, 175, 180]


In [25]:
def fetch_page(verbose=1):
  """
  Yield the HTML text of every stored wiki page, game by game.

  For each number in games_numbers, the files TH<game>_<i>.htm are requested
  from the TouhouWikiPages repository with i = 1, 2, 3, ... until the server
  answers with a 404; the generator then moves on to the next game.
  The body of the 404 answer is not a page and is therefore not yielded.

  verbose: when truthy, a progress message is printed before each game.

  Raises requests.HTTPError when a request fails with a status other than 404,
  instead of silently yielding an error page.
  """
  for game in games_numbers:
    if verbose:
      print(f"Now fetching Touhou {game / 10} dialogues")

    i = 1

    # Fetch files with consecutive numbers until the first missing one
    while True:
      response = requests.get(
          f"https://raw.githubusercontent.com/DestructionCatalyst/TouhouWikiPages/main/TH{game}_{i}.htm"
      )

      # No file with this number: this game is done
      if response.status_code == 404 or response.text == "404: Not Found":
        break

      # Any other HTTP error would otherwise be yielded as if it were a page
      response.raise_for_status()

      yield response.text

      i += 1

In [26]:
# Create the page generator; pages are requested only as it is iterated
page_fetcher = fetch_page()

In [27]:
# Open the dataset file for writing.
# The encoding is set explicitly because the dialogue contains non-ASCII
# characters (e.g. "←") that a platform-default encoding may not be able to write.
out_file = open('dataset.txt', mode='w', encoding='utf-8')

In [28]:
# Parse each fetched page and write its dialogue to the dataset file
for page_html in page_fetcher:
  parse_pages(page_html, out_file)

Now fetching Touhou 6.0 dialogues
Now fetching Touhou 7.0 dialogues
Now fetching Touhou 7.5 dialogues
Now fetching Touhou 8.0 dialogues
Now fetching Touhou 9.0 dialogues
Now fetching Touhou 10.0 dialogues
Now fetching Touhou 10.5 dialogues
Now fetching Touhou 11.0 dialogues
Now fetching Touhou 12.0 dialogues
Now fetching Touhou 12.3 dialogues
Now fetching Touhou 12.5 dialogues
Now fetching Touhou 12.8 dialogues
Now fetching Touhou 13.0 dialogues
Now fetching Touhou 13.5 dialogues
Now fetching Touhou 14.0 dialogues
Now fetching Touhou 14.3 dialogues
Now fetching Touhou 14.5 dialogues
Now fetching Touhou 15.0 dialogues
Now fetching Touhou 15.5 dialogues
Now fetching Touhou 16.0 dialogues
Now fetching Touhou 16.5 dialogues
Now fetching Touhou 17.0 dialogues
Now fetching Touhou 17.5 dialogues
Now fetching Touhou 18.0 dialogues


In [29]:
# Close the dataset file so that everything written reaches the disk
out_file.close()

In [30]:
!cat dataset.txt | head

Reimu:
It's been a while since my last job.

Reimu:
It sure feels great out.
There aren't many evil spirits about during the day, so I'm trying my luck at night...
But it's dark out, and I'm not sure where to go.
Still...
It's so romantic out behind the shrine at night. (← carefree)



In [31]:
!cat dataset.txt | tail

You're much more kind than your appearance lets on, Miss Momoyo.
I didn't think you'd honor your promise, judging from how you look.

Momoyo:
You pickin' a fight with me?
Perfect! I'll take you up on that offer! Time for a rematch, right now!

Sanae:
Ah, I'm sorry. Your looks weren't deceiving, after all.



In [32]:
# Open the dataset for reading.
# The encoding is set explicitly so that the non-ASCII characters in the
# dialogue are decoded the same way on every platform.
ds = open("dataset.txt", encoding="utf-8")

In [33]:
# Read the whole dataset into a single string
script = ds.read()

In [34]:
# The contents are in memory now; release the file handle
ds.close()

In [35]:
# Size of the dataset in characters
len(script)

1335259

In [113]:
# Cut the dataset at blank lines; dialogue entries were written with a blank
# line after each of them
lines = script.split('\n\n')

In [41]:
# Number of entries in `lines`
len(lines)

15166

In [116]:
# Split each chunk into [name, text] at the first ':\n' only, so that dialogue
# text which itself contains a line ending in a colon stays in one piece.
# NOTE: this cell rebinds `lines` (strings -> lists); run it once per fresh `lines`.
lines = [chunk.split(':\n', 1) for chunk in lines]

In [42]:
# Inspect the first entry
lines[0]

['Reimu', "It's been a while since my last job."]

In [132]:
def lines_cleanup(line):
  """
  Normalise one entry produced by splitting a dataset chunk on ':\\n'.

  line is a sequence of strings: the character name followed by one or more
  pieces of dialogue text.

  Returns:
    None               - fewer than two parts (no "name:" header in the chunk)
    line itself        - exactly two parts, already [name, text]
    (name, text) tuple - more than two parts; the text pieces are joined back
                         with the ':\\n' separator that the split removed
  """
  if len(line) < 2:
    return None

  if len(line) == 2:
    return line

  # The dialogue text contained ':\n' itself and was cut into several pieces;
  # restore the separator so that neighbouring words are not fused together
  return line[0], ':\n'.join(line[1:])

In [133]:
# Normalise every entry and drop the ones lines_cleanup rejected (None)
lines = [entry for entry in map(lines_cleanup, lines) if entry is not None]

In [50]:
from functools import reduce

In [156]:
def phrases(name):
  """Return how many entries of the global `lines` have `name` as their first element."""
  return sum(1 for entry in lines if entry[0] == name)

In [136]:
def words(name):
  """
  Return the total number of words in all entries of the global `lines`
  whose first element equals `name`.

  A word is a run of at least two word characters (single-character tokens
  are not counted). Returns 0 when `name` has no entries.
  """
  # Summing the per-entry counts avoids building one big list and, unlike a
  # reduce() without a start value, also works when nothing matches
  return sum(
      len(re.findall(r'(?u)\b\w\w+\b', line[1]))
      for line in lines
      if line[0] == name
  )

In [137]:
def characters(name):
  """
  Return the total length, in characters, of the text of all entries of the
  global `lines` whose first element equals `name`.

  Returns 0 when `name` has no entries.
  """
  # sum() starts from 0, so an empty selection no longer raises as a
  # reduce() without a start value does
  return sum(len(line[1]) for line in lines if line[0] == name)
  

In [59]:
import pandas as pd

In [138]:
# Empty statistics table; the columns are filled in by the following cells
df = pd.DataFrame(columns=['Name', 'Phrases', 'Words', 'Characters'])

In [139]:
# Characters to compare
df['Name'] = ['Reimu', 'Marisa', 'Sakuya', 'Youmu', 'Sanae', 'Aya']

In [158]:
# Number of dialogue entries per character
df['Phrases'] = df['Name'].apply(phrases)

In [157]:
# Number of words (two or more characters long) per character
df['Words'] = df['Name'].apply(words)

In [145]:
# Total text length per character
df['Characters'] = df['Name'].apply(characters)

In [148]:
# Average number of words per phrase, computed column-wise instead of with a
# Python-level function call for every row
df['Verbosity'] = df['Words'] / df['Phrases']

In [159]:
# Show the statistics table (up to six rows)
df.head(6)

Unnamed: 0,Name,Phrases,Words,Characters,Verbosity
0,Reimu,1771,21276,120379,12.013552
1,Marisa,1806,21980,124152,12.170543
2,Sakuya,577,6663,38104,11.54766
3,Youmu,472,6077,34734,12.875
4,Sanae,449,6190,35632,13.786192
5,Aya,210,3591,20805,17.1


In [215]:
# Maps word -> number of occurrences; starts out empty
word_counts = {}

In [None]:
# Prepend an empty-string sentinel to `lines` (a fold over the list can use it
# as its starting value).
# Guarded so that re-running this cell does not insert a second sentinel.
# NOTE: while the sentinel is present, code that indexes every entry of
# `lines` (e.g. entry[0]) must skip it.
if not lines or lines[0] != '':
  lines.insert(0, '')

In [191]:
# Join the text of every dialogue entry into one space-separated string.
# Empty entries (such as an '' placeholder at the head of the list) carry no
# text and are skipped, so this cell works whether or not one is present.
text = ' '.join(entry[1] for entry in lines if entry)

In [192]:
# Display the joined text
text



In [216]:
# Tally every lower-cased word of at least two characters.
# Counts are added to whatever word_counts already holds.
for token in re.findall(r'(?u)\b\w\w+\b', text.lower()):
  word_counts[token] = word_counts.get(token, 0) + 1

In [None]:
# Display the word -> count mapping
word_counts

In [209]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [229]:
# Load the English stop-word list once and keep it in a set: calling
# stopwords.words() inside the filter would rebuild the list for every word
# and scan it linearly.
english_stopwords = set(stopwords.words('english'))

# Word counts without the stop words
word_counts_no_stop = {
    word: count
    for word, count in word_counts.items()
    if word not in english_stopwords
}

In [231]:
# List the remaining words, most frequent first
sorted(word_counts_no_stop.items(), key=lambda pair: pair[1], reverse=True)

[('like', 1370),
 ('well', 1013),
 ('right', 885),
 ('one', 750),
 ('human', 745),
 ('oh', 713),
 ('know', 674),
 ('see', 672),
 ('really', 658),
 ('even', 624),
 ('time', 591),
 ('get', 574),
 ('youkai', 563),
 ('go', 546),
 ('way', 536),
 ('huh', 488),
 ('something', 480),
 ('would', 468),
 ('think', 468),
 ('world', 465),
 ('power', 461),
 ('let', 460),
 ('come', 439),
 ('going', 417),
 ('back', 414),
 ('humans', 410),
 ('got', 382),
 ('want', 380),
 ('moon', 379),
 ('people', 364),
 ('shrine', 359),
 ('good', 356),
 ('could', 353),
 ('place', 334),
 ('say', 324),
 ('take', 316),
 ('hey', 314),
 ('sure', 308),
 ('around', 302),
 ('spirits', 296),
 ('us', 296),
 ('though', 293),
 ('god', 290),
 ('gensokyo', 283),
 ('ah', 281),
 ('much', 271),
 ('still', 269),
 ('strong', 265),
 ('thought', 265),
 ('hell', 264),
 ('never', 263),
 ('mean', 255),
 ('look', 253),
 ('yes', 253),
 ('dream', 253),
 ('long', 246),
 ('make', 246),
 ('already', 245),
 ('spirit', 238),
 ('someone', 237),
 ('fig