In [434]:
import pandas as pd
import sqlite3

# Read the whole `idiom` table into memory once; later filtering is done on
# DataFrames rather than with further SQL queries.
conn = sqlite3.connect('chinese-idioms-12976.db')
df = pd.read_sql_query(sql="SELECT * FROM idiom", con=conn)
# `df` stays as loaded; derived columns are written to this working copy.
df_edit = df.copy(deep=True)
# Pinyin initials (syllable-leading consonants) and finals (syllable endings).
# In the finals, 'v' presumably stands for "ü" (v / ve / vn) — confirm against
# how the py1..py4 columns are spelled in the database.
ListInitials = ['b', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h', 'j', 'q', 'x',
                'zh', 'ch', 'sh', 'r', 'z', 'c', 's', 'y', 'w']
ListFinals = ['a', 'o', 'e', 'i', 'u', 'v', 'ai', 'ei', 'ui', 'ao', 'ou', 'iu', 'ie', 've',
              'er', 'an', 'en', 'in', 'un', 'vn', 'ang', 'eng', 'ing', 'ong']

# Tone-marked vowels, one list per tone, in the order a e i o u ü.
List1Tone = ["ā", "ē", "ī", "ō", "ū", "ǖ"]
List2Tone = ["á", "é", "í", "ó", "ú", "ǘ"]
List3Tone = ["ǎ", "ě", "ǐ", "ǒ", "ǔ", "ǚ"]
# "ǜ" was missing here, so fourth-tone ü syllables were never recognised as
# tone 4 and kept their raw text.
List4Tone = ["à", "è", "ì", "ò", "ù", "ǜ"]

# For each of the four syllable columns py1..py4, derive two helper columns:
#   initials_py<N> - leading consonant (zh/ch/sh are tried before single letters)
#   finals_py<N>   - ending, with three-letter endings listed before shorter ones
# Syllables without a match get NaN in the corresponding column.
_INITIAL_PATTERN = r'(^zh|^ch|^sh|^[bpmfdtnlgkhjqxrcsywz])'
_FINAL_PATTERN = r'(ang$|eng$|ing$|ong$|ai$|ei$|ui$|ao$|ou$|iu$|ie$|ve$|er$|an$|en$|in$|un$|vn$|a$|o$|e$|i$|u$|v$)'

for _position in range(1, 5):
    _syllables = df_edit[f'py{_position}']
    df_edit[f'initials_py{_position}'] = _syllables.str.extract(_INITIAL_PATTERN, expand=False)
    df_edit[f'finals_py{_position}'] = _syllables.str.extract(_FINAL_PATTERN, expand=False)

# Replace the text in pytone1..pytone4 with the tone number (1-4) whenever it
# contains a tone-marked vowel. Rows matching none of the lists are left as-is.
_tone_patterns = {
    1: '|'.join(List1Tone),
    2: '|'.join(List2Tone),
    3: '|'.join(List3Tone),
    4: '|'.join(List4Tone),
}

for i in range(1, 5):
    tone_col = f'pytone{i}'
    # Compute every mask from the still-unmodified column first; the writes
    # below replace strings with ints, which would change later matching.
    _tone_masks = {
        tone: df_edit[tone_col].str.contains(pattern, na=False, regex=True)
        for tone, pattern in _tone_patterns.items()
    }
    for tone, mask in _tone_masks.items():
        df_edit.loc[mask, tone_col] = tone
    


def IdentifyInitialsAndFinals(String):
    """Split an underscore-separated string into (initial, final) pairs.

    Each '_'-separated piece is matched against two patterns: one anchored at
    the start for the initial, one anchored at the end for the final.

    Returns a list with one ``(initial, final)`` tuple per piece, in input
    order. A component that does not match is the missing-value marker
    produced by pandas ``str.extract`` (test it with ``pd.isna``).
    """
    initial_pattern = r'(^zh|^ch|^sh|^[bpmfdtnlgkhjqxrcsywz])'
    final_pattern = r'(ang$|eng$|ing$|ong$|ai$|ei$|ui$|ao$|ou$|iu$|ie$|ve$|er$|an$|en$|in$|un$|vn$|a$|o$|e$|i$|u$|v$)'

    pieces = pd.Series(String.split('_'))
    found_initials = pieces.str.extract(initial_pattern, expand=False)
    found_finals = pieces.str.extract(final_pattern, expand=False)
    return list(zip(found_initials, found_finals))

def GuessIdiom(String:str, StringOfNones:str=None, StringOfTones:str =None, StringOfNoneTones:str=None):
    """Print up to 20 idioms from ``df_edit`` that are compatible with the clues.

    All clue strings are '_'-separated; the N-th piece applies to the N-th
    syllable (columns ``initials_py<N>``, ``finals_py<N>``, ``pytone<N>``).
    An empty piece means "no constraint at this position".

    Args:
        String: Per-position initial/final clues, e.g. ``'j_i_f_'``. A piece
            prefixed with ``*`` is negated: the syllable at that position must
            NOT have the given initial and/or final.
        StringOfNones: Initials/finals that must not occur at ANY position.
            Pieces here are not positional.
        StringOfTones: Per-position tone numbers, e.g. ``'__2_1'``. A piece
            prefixed with ``*`` means the position must not have that tone.
        StringOfNoneTones: Per-position tone numbers that the position must
            not have.

    Returns:
        None. The matching idioms (``char1..char4`` joined) are printed.

    Raises:
        ValueError: If a tone piece is not an integer.
        KeyError: If a positional clue string has more pieces than there are
            syllable columns.
    """
    # Every filter below uses boolean indexing, which builds a new frame, so
    # the shared df_edit is never modified and needs no defensive copy.
    matched = df_edit

    # Positional tones that are known to be wrong.
    if StringOfNoneTones:
        for position, tone_input in enumerate(StringOfNoneTones.split('_'), start=1):
            if not tone_input:
                continue
            matched = matched[matched[f'pytone{position}'] != int(tone_input)]

    # Positional tones: plain number = must match, '*number' = must differ.
    if StringOfTones:
        for position, tone_input in enumerate(StringOfTones.split('_'), start=1):
            if not tone_input:
                continue
            tone_col = f'pytone{position}'
            if tone_input.startswith('*'):
                matched = matched[matched[tone_col] != int(tone_input[1:])]
            else:
                matched = matched[matched[tone_col] == int(tone_input)]

    # Initials/finals that must not appear anywhere in the idiom. A missing
    # component (NaN) compares unequal to everything, so it filters nothing.
    if StringOfNones:
        for NonInitial, NonFinal in IdentifyInitialsAndFinals(StringOfNones):
            for position in range(1, 5):
                keep = (
                    (matched[f'initials_py{position}'] != NonInitial)
                    & (matched[f'finals_py{position}'] != NonFinal)
                )
                matched = matched[keep]

    # Positional initial/final clues.
    for position, clue in enumerate(String.split('_'), start=1):
        if not clue:
            continue

        negated = clue.startswith('*')
        if negated:
            # Strip the marker BEFORE parsing: the initial pattern is anchored
            # at the start of the text, so a leading '*' would hide the
            # initial and the negated clue would be silently ignored.
            clue = clue[1:]

        initials_name = f'initials_py{position}'
        finals_name = f'finals_py{position}'
        Initial, Final = IdentifyInitialsAndFinals(clue)[0]

        # Only constrain on the components that were actually recognised.
        if pd.notna(Initial):
            if negated:
                matched = matched[matched[initials_name] != Initial]
            else:
                matched = matched[matched[initials_name] == Initial]
        if pd.notna(Final):
            if negated:
                matched = matched[matched[finals_name] != Final]
            else:
                matched = matched[matched[finals_name] == Final]

    matched = matched.head(20)
    MatchedIdiom = matched[['char1','char2','char3','char4']].agg(''.join, axis=1)
    print(MatchedIdiom.values)
    return


In [480]:
# Example query. Positional clues: syllable 1 starts with "j", syllable 2 ends
# in "i", syllable 3 starts with "f", syllable 4 is unconstrained.
String = 'j_i_f_'
# Initials/finals already ruled out for every position.
Nones = 'y_ch_a_eng_d_ch_e_k_eng_g_ong_uan_zh_h_ao_an'
# Syllable 3 has tone 2, syllable 4 has tone 1; the first two are unknown.
Tones = '__2_1'
# No per-position tone exclusions.
NoTones = ''

GuessIdiom(
    String,
    StringOfNones=Nones,
    StringOfTones=Tones,
    StringOfNoneTones=NoTones,
)

['救死扶伤']


1