# Clean old WKCM data
Removes all HTML tags (not generalized; only the tags present in the sheet at the time are handled).  
Replaces highlighting tags with custom markup tags.  
Replaces the user names `c` and `ript:void(0)`, which were caused by bugs in the old version, with `Anonymous`.  
Replaces the old delimiter (pipe: `|`) with the obscure Group Separator control character.  

In [175]:
import pandas as pd
import requests

In [176]:
# "real" Group Separator in the middle, visual GS  on the outside 
# (GS will just be removed in JS; it's just for visual differentiation in Google Sheets)
# U+241D (the visible "␝" symbol) on the outside, U+001D (the real, invisible
# Group Separator control character) in the middle. Written with escapes so
# the control character cannot get lost when the file is copied or reformatted.
separator = "\u241D\u001D\u241D"

In [168]:
# Read the old WKCM data from a CSV file that was downloaded by hand.
# The path is relative, so it is resolved against the current working directory.
df = pd.read_csv("Documents/JavaScript/WKCM2/data.csv", sep=',')

In [169]:
# Delete empty and unnecessary columns: "Unnamed: 9" .. "Unnamed: 25" and "Info".
# All labels go into a single drop() call, so a missing column raises KeyError
# before anything is removed instead of leaving df partially cleaned.
columns_to_drop = [f"Unnamed: {i}" for i in range(9, 26)] + ["Info"]
df.drop(columns=columns_to_drop, inplace=True)

In [170]:
# cleanup functions
def update_users(x):
    """Normalize the pipe-delimited user list of one sheet cell.

    Args:
        x: Cell value. Either a string of user names separated by "|" or a
            float (presumably the NaN pandas uses for an empty cell).

    Returns:
        The user names joined with the module-level ``separator``, where the
        bogus names "c" and "ript:void(0)" are replaced by "Anonymous".
        An empty string is returned for float input.
    """
    # isinstance instead of an exact type comparison, so float subclasses
    # (e.g. numpy.float64 NaN) are treated as empty cells too.
    if isinstance(x, float):
        return ""
    broken_names = ("c", "ript:void(0)")
    users = [
        "Anonymous" if user in broken_names else user
        for user in x.split("|")
    ]
    return separator.join(users)

def remove_html_tags(text):
    """Return ``text`` with every ``<...>`` tag stripped out.

    A tag is the shortest run from "<" to the next ">" that does not span a
    line break; each such run is replaced by the empty string.
    """
    import re
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, '', text)

def update_mnem(x):
    """Convert the HTML of one mnemonic cell into the custom markup.

    Args:
        x: Cell value. Either a string holding one or more mnemonics
            separated by "|" or a float (presumably the NaN pandas uses for
            an empty cell).

    Returns:
        The converted text, or an empty string for float input. In the text:
        * highlight spans become [kan]/[voc]/[rad]/[read] ... [/span],
        * <b>, <i>, <u>, <s> become their [..] counterparts,
        * <br> variants become newlines,
        * every <img ...> becomes [img ...] with its own attributes,
        * all remaining HTML tags are removed,
        * the "|" delimiter is replaced by the module-level ``separator``.
    """
    # Local import: no module-level import of this file provides ``re``.
    import re

    # isinstance instead of an exact type comparison, so float subclasses
    # (e.g. numpy.float64 NaN) are treated as empty cells too.
    if isinstance(x, float):
        return ""

    replacements = (
        ('</span>', '[/span]'),
        ('<span class="highlight-kanji">', '[kan]'),
        ('<span class="kanji-highlight">', '[kan]'),
        ('<span class="highlight-vocabulary">', '[voc]'),
        ('<span class="vocabulary-highlight">', '[voc]'),
        ('<span class="highlight-radical">', '[rad]'),
        ('<span class="radical-highlight">', '[rad]'),
        ('<span class="highlight-reading">', '[read]'),
        ('<span class="reading-highlight">', '[read]'),
        ("<b>", "[b]"),
        ("</b>", "[/b]"),
        ("<i>", "[i]"),
        ("</i>", "[/i]"),
        ("<u>", "[u]"),
        ("</u>", "[/u]"),
        ("<s>", "[s]"),
        ("</s>", "[/s]"),
        ("<br>", "\n"),
        ("<br/>", "\n"),
        ("</br>", "\n"),
        ("<br />", "\n"),
    )
    for html, markup in replacements:
        x = x.replace(html, markup)

    # Convert each <img ...> tag on its own via a backreference, so several
    # images in one cell keep their individual attributes. The attribute text
    # is copied verbatim (including its leading space) after "[img ".
    x = re.sub(r'<img(.*?)>', r'[img \1]', x)

    # Everything still looking like an HTML tag is dropped.
    x = remove_html_tags(x)

    # Swap the old pipe delimiter for the new separator.
    return x.replace("|", separator)


In [171]:
# Run the cleanup helpers over the user columns and the mnemonic columns
for user_column in ("Meaning_User", "Reading_User"):
    df[user_column] = df[user_column].apply(update_users)

for mnem_column in ("Reading_Mnem", "Meaning_Mnem"):
    df[mnem_column] = df[mnem_column].apply(update_mnem)

## Upload to Google Sheet
Requires Google Sheets API access and service account credentials in a `*.json` file.

In [172]:

import gspread
from gspread_dataframe import set_with_dataframe

In [173]:
# Access the Google Sheet: authenticate with the service account key file,
# open the spreadsheet by its key and pick the target worksheet.
gc = gspread.service_account(filename='Documents/JavaScript/WKCM2/wkcm2-332109-975b83d32a97.json')
sh = gc.open_by_key('13oZkp8eS059nxsYc6fOJNC3PjXVnFvUC8ntRt8fdoCs')
worksheet = sh.get_worksheet(1) # zero-based index: 0 is the first sheet, 1 the second, etc.

In [174]:
# Export the cleaned DataFrame to the selected worksheet.
# NOTE(review): despite the original "append" wording, set_with_dataframe
# presumably writes starting at the sheet's top-left cell — confirm against
# the gspread-dataframe documentation before running on a sheet with data.
set_with_dataframe(worksheet, df) # writes df to the Google Sheet