In [7]:
import recon
import numpy as np
from ds import DialectSim as ds
from sklearn.manifold import spectral_embedding
from gtda.plotting import plot_heatmap
import networkx as nx
import random
import pandas as pd
import logging
def clean_data(data):
    '''清洗方言字音数据中的录入错误'''

    ipa = '0A-Za-z\u00c0-\u03ff\u1d00-\u1dbf\u1e00-\u1eff\u2205\u2c60-\u2c7f' \
        + '\ua720-\ua7ff\uab30-\uab6f\ufb00-\ufb4f\ufffb' \
        + '\U00010780-\U000107ba\U0001df00-\U0001df1e'

    clean = data.copy()

    initials = [c for c in data.columns if c.endswith('聲母')]
    finals = [c for c in data.columns if c.endswith('韻母')]
    tones = [c for c in data.columns if c.endswith('調')]

    for initial in initials:
        # 有些符号使用了多种写法，统一成较常用的一种
        clean[initial] = clean[initial].fillna('nan').str.lower() \
            .str.replace(f'[^{ipa}]', 'nan') \
            .str.replace('[\u00f8\u01ff]', '\u2205') \
            .str.replace('\ufffb', 'nan') \
            .str.replace('\u02a3', 'dz') \
            .str.replace('\u02a4', 'dʒ') \
            .str.replace('\u02a5', 'dʑ') \
            .str.replace('\u02a6', 'ts') \
            .str.replace('\u02a7', 'tʃ') \
            .str.replace('\u02a8', 'tɕ') \
            .str.replace('[\u02b0\u02b1]', 'h') \
            .str.replace('g', 'ɡ')

        mask = clean[initial] != data[initial]
        if np.count_nonzero(mask):
            for (r, c), cnt in pd.DataFrame({
                'raw': data.loc[mask, initial],
                'clean': clean.loc[mask, initial]
            }).value_counts().iteritems():
                logging.warning(f'replace {r} -> {c} {cnt}')

    for final in finals:
        clean[final] = clean[final].fillna('nan').str.lower() \
            .str.replace(f'[^{ipa}]', 'nan')

        mask = clean[final] != data[final]
        if np.count_nonzero(mask):
            for (r, c), cnt in pd.DataFrame({
                'raw': data.loc[mask, final],
                'clean': clean.loc[mask, final]
            }).value_counts().iteritems():
                logging.warning(f'replace {r} -> {c} {cnt}')
                
    for tone in tones:
        # 部分声调被错误转为日期格式，还原成数字
        clean[tone].fillna('nan')
        mask = clean[tone].str.match(r'^\d+年\d+月\d+日$')
        clean.loc[mask, tone] = pd.to_datetime(
            clean.loc[mask, tone],
            format=r'%Y年%m月%d日'
        ).dt.dayofyear.astype(str)
        clean.loc[~mask, tone] = clean.loc[~mask, tone].str.lower() \
            .str.replace(r'[^1-5]', 'nan')

        mask = clean[tone] != data[tone]
        if np.count_nonzero(mask):
            for (r, c), cnt in pd.DataFrame({
                'raw': data.loc[mask, tone],
                'clean': clean.loc[mask, tone]
            }).value_counts().iteritems():
                logging.warning(f'replace {r} -> {c} {cnt}')

    return clean

input_file = './data/xiangyu.csv'

data = recon.clean(recon.load(input_file))

data = clean_data(data)

for c in data.columns:
    if c.partition('_')[2] in ('聲母', '韻母', '調值'):
        data[c] = data[c].astype('category')

data

  clean[initial] = clean[initial].fillna('nan').str.lower() \
  clean[final] = clean[final].fillna('nan').str.lower() \


Unnamed: 0,id,長沙_聲母,長沙_韻母,長沙_調值,長沙_調類,雙峰_聲母,雙峰_韻母,雙峰_調值,雙峰_調類,全州(縣城)_聲母,全州(縣城)_韻母,全州(縣城)_調值,全州(縣城)_調類,灌陽(文市)_聲母,灌陽(文市)_韻母,灌陽(文市)_調值,灌陽(文市)_調類
0,1,t,ən,33,陰平,t,an,55,陰平,t,oŋ,33,陰平,t,uŋ,22,陰平
1,2,,,,,,,,,t,oŋ,33,陰平,t,uŋ,22,陰平
2,18,t,ən,13,陽平,d,an,13,陽平,d,oŋ,23,陽平,d,uŋ,33,陽平
3,19,,,,,,,,,d,oŋ,23,陽平,d,uŋ,33,陽平
4,20,t,ən,13,陽平,d,an,13,陽平,d,oŋ,23,陽平,d,uŋ,33,陽平
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5160,25496,,,,,,,,,,,,,0,ie,33,陽平
5161,25497,,,,,,,,,,,,,0,ie,33,陽平
5162,25517,,f,,,,,,ua,0,ua,23,陽平,x,ua,33,陽平
5163,25520,f,a,24,入,x,ua,13,陽平,x,ua,33,陰平,x,ua,33,陽平


In [78]:
initials = [c for c in data.columns if c.endswith('聲母')]
finals = [c for c in data.columns if c.endswith('韻母')]
tones = [c for c in data.columns if c.endswith('調值')]

In [79]:
relevant = initials+finals+tones
relevant

['長沙_聲母',
 '雙峰_聲母',
 '全州(縣城)_聲母',
 '灌陽(文市)_聲母',
 '長沙_韻母',
 '雙峰_韻母',
 '全州(縣城)_韻母',
 '灌陽(文市)_韻母',
 '長沙_調值',
 '雙峰_調值',
 '全州(縣城)_調值',
 '灌陽(文市)_調值']

In [80]:
columns = [c for c in data.columns if c.endswith('聲母')]
categories = [t.categories for t in data.dtypes[columns]]
categories

[Index(['0', 'a', 'f', 'k', 'kh', 'l', 'm', 'nan', 'p', 'ph', 's', 't', 'th',
        'ts', 'tsh', 'tɕ', 'tɕh', 'x', 'z', 'ŋ', 'ȵ', 'ɕ'],
       dtype='object'),
 Index(['0', 'b', 'd', 'dz', 'dʐ', 'dʑ', 'k', 'kh', 'l', 'm', 'nan', 'p', 'ph',
        's', 't', 'th', 'ts', 'tsh', 'tɕ', 'tɕh', 'tʂ', 'tʂh', 'x', 'ŋ', 'ȵ',
        'ɕ', 'ɡ', 'ɣ', 'ʂ'],
       dtype='object'),
 Index(['0', 'b', 'd', 'dz', 'dʑ', 'dʑh', 'f', 'h', 'k', 'kh', 'l', 'm', 'nan',
        'p', 'ph', 's', 't', 'th', 'ts', 'tsh', 'tɕ', 'tɕh', 'tʃ', 'tʃh', 'x',
        'z', 'ɕ', 'ɡ', 'ʑ'],
       dtype='object'),
 Index(['0', 'b', 'd', 'dz', 'f', 'h', 'k', 'kh', 'l', 'm', 'n', 'nan', 'p',
        'ph', 's', 't', 'th', 'ts', 'tsh', 'tɕ', 'tʃ', 'x', 'z', 'ɡ', 'ɬ', 'ʃ',
        'ʔ'],
       dtype='object')]

In [82]:
for t in data.dtypes[columns]:
    print(t)

category
category
category
category


In [7]:
#data = data.set_index('id')
data = data.set_index('id')
out = './xiangyu/train.txt'
f = open(out, 'w', encoding='utf8')
g = open('./xiangyu/valid.txt','w',encoding='utf8')
k = open('./xiangyu/test.txt','w',encoding='utf8')
v = open('./xiangyu/valid2.txt','w',encoding='utf8')
data = data.groupby(data.index).first()
data = data.drop_duplicates()
for i in data.index:
    a = data.loc[i]
    trainflags = [True,True,True]
    for j in a.index:           
        if not j.endswith('調類'):
            if not pd.isnull(a[j]) and not a[j]=='nan':
                if j.endswith('聲母') and trainflags[0]:
                    f.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    trainflags[0] = False
                elif j.endswith('韻母') and trainflags[1]:
                    f.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    trainflags[1] = False
                elif j.endswith('調值') and trainflags[2]:
                    f.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    trainflags[1] = False
                else:
                    rd = np.random.randint(100)
                    if rd>40:
                        #f.write(str(i)+'\t'+str(j)+'\t'+str(j)+str(a[j])+'\n')
                        f.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    elif rd>25:
                        #g.write(str(i)+'\t'+str(j)+'\t'+str(j)+str(a[j])+'\n')
                        g.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    else:
                        #k.write(str(i)+'\t'+str(j)+'\t'+str(j)+str(a[j])+'\n')
                        k.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
            else:
                v.write(str(i)+'\t'+str(j)+'\t'+'0'+'\n')
                
f.close()
g.close()
k.close()
v.close()

In [None]:
instance_dict = {}
for i in data.index:
    a=data.loc[i]
    for j in a.index:
        if not j.endswith('調類'):
            if not pd.isnull(a[j]) and not a[j]=='nan':
                rd = np.random.randint(100)
                if rd>30:
                    f.write(str(i)+'\t'+str(j)+'_spec\t'+str(j)+str(a[j])+'\n')
                    #f.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                elif rd>20:
                    g.write(str(i)+'\t'+str(j)+'_spec\t'+str(j)+str(a[j])+'\n')
                    #g.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                else:
                    k.write(str(i)+'\t'+str(j)+'_spec\t'+str(j)+str(a[j])+'\n')
                    #k.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                if not (j,a[j]) in instance_dict:
                    instance_dict[(j,a[j])] = str(j)+str(a[j])
                    rd = np.random.randint(100)
                    if rd>20:
                        f.write(str(j)+str(a[j])+'\t'+'is_instance_of'+'\t'+str(a[j])+'\n')
                    elif rd>10:
                        g.write(str(j)+str(a[j])+'\t'+'is_instance_of'+'\t'+str(a[j])+'\n')
                        #g.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')
                    else:
                        k.write(str(j)+str(a[j])+'\t'+'is_instance_of'+'\t'+str(a[j])+'\n')
                        #k.write(str(i)+'\t'+str(j)+'\t'+str(a[j])+'\n')

In [4]:
data = data.set_index('id')
data = data.drop_duplicates()

In [83]:
g = open('./xiangyu/valid2.txt','w',encoding='utf8')
data = data.set_index('id')
data = data.groupby(data.index).first()
for i in data.index:
    a = data.loc[i]
    for j in a.index:
        if pd.isnull(a[j]) or a[j]=='nan' and not j.endswith('調類'):
            rd = np.random.randint(100)
            g.write(str(i)+'\t'+str(j)+'\t'+'0'+'\n')

In [78]:
np.random.seed(21159)
indices = np.arange(5)
test = np.random.randint(10,size=(5,10))
test = np.argsort(test, axis =1)
sheshan = np.argwhere(test < 3)


In [25]:
data[relevant].to_csv('./relevant.csv')

In [20]:
loaded_data = np.load('./ents.txt.npz')
a = np.concatenate((loaded_data['arr_0'],loaded_data['arr_1'],loaded_data['arr_2'],loaded_data['arr_3']),axis=1)
a.shape

(4824, 1024)

In [21]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 32)
l = pca.fit_transform(a)

In [10]:
fo = open(r'./dicts/dict.txt',encoding='utf-8')
entdict = eval(fo.read())
fo.close()

In [14]:
fout = open(r'./embs/pretrained.txt','w',encoding='utf-8')
for i,j in zip(entdict,a):
    fout.write(i+' ')
    for k in j:
        fout.write(str(k)+' ')
    fout.write('\n')

In [24]:
#pca = PCA(n_components=40)
#down = TSNE(n_components=2,init='pca', perplexity=15)
#l = down.fit_transform(pca.fit_transform(a[-290:,:]))
for i,j in zip(entdict,l[:,:4]):
    print(i,end=' ')
    for k in j:
        print(k,end=' ')
    print()

0 0.23287109 1.456553 -0.53762114 -1.1399001 
1 -2.5018992 8.875839 0.5462185 0.8607762 
1000 -3.9848728 -5.5357065 -1.4395751 -1.916523 
10007 3.2280016 1.2041125 5.8267097 -3.7704227 
10008 4.5739837 3.3017797 9.364451 -0.9135145 
10010 5.354742 -1.7020986 1.346762 -0.84667265 
10012 6.7181854 -2.1279886 -3.663465 -1.2985021 
10016 7.1944084 -1.5562593 -0.90122545 -1.2673932 
10017 5.484687 -1.6301577 -2.2267082 -1.466887 
10022 5.738231 -2.1777241 -2.367097 4.4061074 
10028 3.6228766 -0.2426614 8.953494 -4.479244 
10029 1.954189 1.4038333 4.859054 -4.1069794 
10030 -0.1852476 1.1748837 5.840981 -5.8124266 
10044 6.6736526 -1.4931712 -5.235511 -1.7505567 
10045 6.3673587 0.88796467 -4.2427316 -3.7250543 
10055 -2.2316418 -2.6545107 3.1880693 -1.2315352 
10062 3.062463 0.259022 9.109622 -2.5964878 
10067 1.0446647 0.4607236 -1.9975004 -0.03803185 
10069 3.9906573 1.2922502 9.110704 0.2494464 
10079 1.893873 -3.2590854 8.831836 5.2247005 
1008 -4.4054494 -3.8861437 -1.2838869 -2.787046

11964 2.7482026 -0.36193874 -2.1953402 -0.026008677 
11968 5.5762577 -2.6042473 -4.314313 -0.12644722 
11978 3.0613103 -0.084367536 6.739724 0.14413615 
11979 2.6697955 2.6735547 6.8760653 -0.44085243 
11983 2.9576554 -0.5038069 8.368424 -0.14696781 
11984 2.6519046 -0.57169926 7.2635283 -1.6001844 
11990 3.2095225 -2.404266 7.4354854 -0.80276185 
11991 4.4542966 -1.0269908 9.1102085 -3.951855 
11997 2.4599178 2.2640853 8.940715 1.653257 
12002 3.4970918 -1.5605797 1.2636204 2.6876166 
12005 4.8399677 1.8594214 8.906798 1.9313483 
12019 4.4294314 0.6519773 9.588712 -1.7173907 
12020 3.4094903 0.8123228 7.774772 -0.24535328 
12023 2.9652464 -0.42147997 7.680297 5.2794166 
12024 2.7310567 -0.42311805 10.005141 6.290283 
12030 3.544517 0.08287735 -0.70188713 -0.68989867 
12034 5.4509587 2.5464103 9.656626 0.18662304 
12035 2.687118 2.279797 6.3495865 1.2396101 
12046 2.790741 -1.1440738 8.9837 -2.4182656 
12047 3.617186 1.3894682 10.001006 -0.092342734 
12048 3.711429 1.435677 6.340844 0.

13598 2.7616663 -0.5378721 6.3856215 -1.9804474 
13599 3.5028336 -0.588426 8.293469 -2.8180854 
1360 -3.2844865 9.295768 -0.8737137 -4.754217 
13600 5.011448 -0.10470492 -3.1993794 -3.2129402 
13605 1.9368527 -2.1222613 9.896674 6.04564 
13613 4.628155 -0.48527062 9.343688 -1.4004388 
13622 2.5561364 -1.0264714 9.677347 4.8600984 
13625 3.4171224 0.44655803 6.630228 -3.7723234 
13634 6.358829 0.35670778 -4.9389677 2.5191948 
13637 1.6565832 0.71775544 9.378496 -1.196769 
13643 2.559412 -5.417816 -4.3458366 0.91392326 
13644 1.7146907 -4.624531 -1.7311087 0.26208505 
13645 1.6230108 -4.687565 -0.9764782 -0.5473095 
13658 1.3478798 0.52076 9.445706 -3.4162214 
13659 3.750483 -1.6890923 8.622821 -1.1684391 
13665 4.5080624 1.2397577 8.566655 -1.0589068 
13671 5.3519783 -1.715913 -5.499444 1.7372599 
13672 7.702249 -0.14614426 -3.1824985 0.85037106 
13676 1.7823043 -5.334012 -2.5526695 -0.9033242 
13677 2.5994267 -5.671807 -2.071947 -0.77984107 
13678 1.7674652 -5.2474914 -3.3512187 -1.462

15123 5.8652124 -2.4046562 -2.5959566 1.5614711 
15124 6.278256 -1.2817608 -3.5671394 -0.19863515 
15125 5.1054425 -2.9393759 -3.783722 -0.21725145 
1513 1.6303282 5.4335446 4.2008705 -0.45739916 
15139 4.7539215 1.7472718 -3.8855195 1.0556843 
1514 -2.6379807 7.5850587 -1.0596011 1.2836567 
15140 7.218324 -0.5982991 -3.939029 1.4038374 
15141 6.275564 -0.5626265 -2.9950187 2.4981532 
15142 6.8434396 0.59413606 -3.029771 0.97899914 
15145 4.8838763 -0.4941412 -5.7521315 -0.6927531 
15146 0.6478137 -1.8712994 -0.6524764 0.556556 
15158 5.694066 0.84884536 -3.4274259 -2.5675452 
15161 6.0074186 0.18519521 -4.947531 -3.0267763 
15162 4.00003 -0.8324561 -5.0729294 -2.5843592 
15163 3.7717917 -0.24109899 -5.7083015 -0.45615488 
15164 4.9612255 -2.74781 -3.3837829 -1.2533885 
15165 7.073395 -3.7949445 -2.407464 -0.6569672 
15169 6.3055153 -3.1425467 -2.5977013 -0.36874077 
15172 6.6109314 -2.1427095 -3.4098954 -0.92698187 
15173 5.416109 -3.3199944 -3.1877286 0.47225872 
15176 5.515475 -1.25

16942 5.620418 -2.4022377 -5.297383 1.4635712 
16950 6.7307763 -2.4738443 -3.8206253 1.4349359 
16954 7.1580043 -1.5073605 -4.484094 1.0564233 
16955 6.5246854 -1.7320042 -3.4934862 -2.031971 
16965 5.999811 -1.2749556 -3.925994 1.0421995 
16971 6.9213142 -0.92526186 1.1181778 -0.054289997 
1698 -2.709022 -3.862076 -1.0551715 -3.13125 
16980 6.8193173 -0.76151705 -4.671044 0.060300186 
16981 6.4477086 -0.5530693 -4.0304523 1.082503 
16982 6.0680404 -1.9621707 -3.6362107 0.32146242 
17001 5.5751595 -2.876283 -6.4069495 -1.0885888 
17002 6.2845144 -1.2820212 -3.489201 0.23207077 
17003 6.633215 -0.7099287 -1.6501193 -0.104006246 
17016 6.928241 -1.6275036 -4.145934 1.9517843 
17029 6.945196 -0.20815936 1.2083346 0.19631532 
17030 7.0953536 1.7402238 1.7395068 0.5093592 
17032 4.582678 0.14549655 -5.6847296 -0.6266488 
17037 5.600497 -3.5951207 -4.1434507 1.1722972 
1704 -5.051365 -4.7393994 -2.4699044 -3.3761547 
17040 6.8409715 -2.5912428 -2.5062208 0.38790178 
1705 -4.0307446 -3.622761

18480 6.627524 -2.1816864 -2.9612095 -1.2671483 
18482 6.329628 0.926287 -3.385486 -2.061176 
18486 2.3735268 0.9268118 -1.7311165 -0.51761895 
1849 -2.9544215 7.1191974 -1.8266027 -2.964812 
18492 6.147352 -0.05807582 -4.932067 -3.9044914 
18493 5.533102 -1.9996228 -3.8486915 0.05436203 
18498 6.7518673 -0.7865562 -2.522947 -0.2573236 
1850 -3.5051372 6.409158 -1.2801759 -2.1118858 
18501 5.92611 -3.147254 -4.290086 -1.746333 
18506 6.106042 -0.035701394 -1.6453339 -1.3404335 
18507 5.9845095 -2.2555537 -4.142349 -1.7309685 
18508 6.7645617 -2.1070712 -3.8137546 0.20532332 
18515 7.8739576 -0.08212591 -3.3998897 -0.5452623 
18521 -2.715275 8.21865 -2.011528 1.2383424 
18522 7.963775 -1.3600405 -2.479451 0.76913375 
18528 5.8530326 -0.46138215 -3.5907526 -1.9407122 
18529 5.241148 -0.06191613 -3.4803066 -2.1613364 
18534 5.4245157 -1.4535896 -2.3952942 -1.4199826 
18535 5.639424 -1.3541181 -3.1244264 -1.0257366 
18538 5.3624225 -0.54184604 -3.776932 -1.9397784 
1854 -3.0890152 8.081567

19698 7.3050127 -0.250827 0.7099174 -1.4246051 
19701 5.35988 0.3329966 -2.7968085 -0.48784816 
19703 -2.5200942 6.8973665 -1.2110335 0.24872705 
19715 5.612656 -0.32527125 -5.5206437 -2.9815729 
1972 -2.6344292 8.933392 -0.9055489 -3.2506804 
19722 6.637554 -2.8375816 -2.9972942 -1.9231766 
19723 5.3674293 -2.8958304 -3.2728512 -1.623781 
1974 -4.3798356 3.4719443 -2.6372445 -5.3466363 
19747 2.2544649 -0.6341204 5.0928526 -0.93278974 
19751 5.400046 -1.5607138 -1.9905723 6.24874 
19752 6.08232 -1.5441384 -2.719118 0.52901506 
19760 6.050938 -1.0884501 -4.005945 0.25486296 
19761 4.457277 -1.1764272 -3.1194797 -2.152487 
19762 7.193587 1.0979105 -3.3882513 -0.886825 
19765 5.799493 -1.6925427 -3.2515502 1.5996284 
19767 6.400173 -2.8580384 -1.9666297 2.4707737 
19772 0.9964099 0.51083 5.829979 -1.5043632 
19784 4.7165613 0.5983131 -5.3228273 -1.8511362 
19787 5.1810145 -0.74987906 -1.967219 -0.25719398 
19788 -3.226274 5.0941715 1.5612215 6.851387 
19795 3.0781033 5.3042483 -5.634804 

22192 -4.2256074 -0.09753664 -1.1691566 -1.8760874 
22204 -5.9098573 0.7338876 -2.940323 -2.9474742 
22207 -4.25968 0.1732954 -1.0035193 -4.224329 
22211 -3.6620092 -5.777802 -1.6643746 -1.5324435 
22226 -2.3655298 -4.9113398 -1.6664016 4.9369373 
22227 -5.141029 -4.489787 0.54466254 3.3947518 
22234 -5.2324996 2.7386246 -1.0044109 -0.5231432 
22253 -4.821474 -1.4592727 0.46934417 5.288549 
22261 -6.960146 0.8825807 -1.5929768 -3.3207462 
22262 -4.514321 2.1795425 -1.4010324 -0.3070284 
22287 -3.3178308 3.0078201 -0.7829635 -2.8141625 
22297 -5.14347 0.24415088 -1.4469612 0.9658999 
22298 -3.8379924 -0.966659 -1.0104417 -3.145467 
22327 -5.794268 0.38651636 -2.2907271 0.2839705 
22355 -5.381168 0.030752394 -2.599179 0.5298308 
22360 -5.8769217 0.1334795 -1.6449988 0.90785563 
22377 -6.6280217 -0.87841314 -1.3078039 0.10748397 
22397 -5.5676656 -0.09545874 -4.3941493 -3.247173 
22405 -4.4937143 -1.7886255 -0.43442115 -0.758997 
22417 -5.2461386 -0.45006856 -0.76846504 -1.4158356 
2242 -

24622 -4.2154617 -3.3343024 -1.0406294 -2.9147215 
24630 -4.9079394 -7.352575 -0.8915337 4.066516 
24632 -2.9825091 -4.947044 -0.58963925 1.5167905 
24634 -3.6436074 3.9306517 -0.63117176 -0.6906759 
24635 -5.764519 0.45653957 -3.8079536 -0.9996316 
2464 -2.5051088 8.096651 -0.20287205 0.9515151 
2465 -2.889025 6.5040684 1.7405357 -0.018265126 
24650 -5.474428 1.1146445 -3.3481698 -4.533956 
24651 -5.3453016 0.31809497 -2.3386285 -3.5502114 
24657 -4.953247 0.7372996 -2.9387438 -4.1168594 
24667 -4.565683 -4.9443526 0.118079804 -3.6275032 
24669 -1.2771341 -5.1985226 -1.5591649 -1.712669 
2467 -1.7490314 5.7908096 0.089567505 1.12615 
24670 -2.7952971 -6.8767447 -1.458323 -2.5642447 
24676 -5.958792 -0.9071232 -1.6745942 -5.696471 
24678 -4.569124 1.6037594 1.8897052 -2.5418668 
24680 -4.233883 -0.76451397 -1.818859 -3.743531 
24689 -3.337403 1.4725633 -2.9666514 -3.2253058 
24697 -3.1137993 2.182026 -0.94905525 -0.24051404 
24698 -4.9116817 -1.6531049 -1.4937636 -2.339035 
24699 -3.66

3408 -3.3724694 7.948564 0.42723238 -2.0370936 
341 -5.3493447 -4.9943657 -0.89230967 -1.8803434 
3418 -3.2881663 6.7595396 0.17977765 0.18698087 
343 -4.025945 -5.4916787 0.07013657 -1.8396842 
3432 -3.160925 8.815816 -1.5331118 1.5964501 
3438 -1.9489164 8.434423 -1.686773 3.1429698 
3441 -3.6529517 7.7273874 -1.7549804 1.8848407 
3445 -0.66292584 5.862428 -2.1791484 2.4751043 
3447 -3.4071567 4.2599463 -0.3024113 7.473102 
3457 -5.504903 -5.23649 0.67167914 0.5381195 
3469 -4.6627083 -4.3434505 -0.0028623838 4.559733 
3470 -4.5893316 -5.4215674 0.21841003 -1.4072309 
3471 -0.4915533 -5.454529 -3.2388356 -1.5927352 
3472 -5.457289 -5.129508 -2.6003232 -0.36967066 
3474 -3.8693886 -6.0716653 -0.6749507 -1.0733019 
3495 -5.2007446 -5.135826 -1.6939645 -2.6233053 
3496 -3.2011743 -5.3594956 -0.6678224 -0.9201477 
35 0.0999884 0.17098284 -0.3841864 0.38810843 
3509 -2.332496 -3.3294022 -1.8791412 -2.417273 
351 -2.4115477 8.119635 -0.585612 3.65597 
3521 -2.8431382 8.657555 -1.776126 1.6

5307 -1.8838578 6.9967256 -1.170774 -0.54073304 
5309 -4.3818803 -5.953888 -0.8125564 -1.8805047 
5311 -2.6670518 7.4392486 -0.92327553 1.0154053 
5312 -1.1207559 6.9183574 -0.95804054 -0.30586523 
5313 -0.31472483 7.2829785 0.6362194 0.3692718 
5319 -0.72074187 6.14796 -2.81994 -2.3962076 
5323 -4.2808785 -7.0182085 0.912449 4.491384 
5324 -5.6465807 -5.1889157 1.6297542 5.111861 
5332 -6.620494 -5.783222 1.5199709 5.601856 
5334 -2.3479145 -5.568238 0.9920474 4.6495647 
5350 -3.232821 8.633622 -0.033540294 -1.9280955 
536 -2.1503282 6.18122 -0.7600415 6.439448 
5367 -2.232419 6.678776 -1.5568658 1.4335911 
5370 -0.3542607 7.857958 -1.1119529 1.5452808 
5371 3.2829995 1.8042809 -2.8533285 -1.1152654 
5381 -4.766925 -4.4404764 0.61578435 -0.64978373 
5385 -4.8145313 -3.804568 0.6469842 -0.5587361 
5388 -4.66292 -5.526672 -1.1144215 -4.0431013 
5389 -5.0865474 -3.8337336 1.0071741 -4.192522 
5403 -3.979302 -6.233039 0.029206242 -3.6737502 
5404 -5.7426662 -6.7137847 0.3496309 -0.9390982

7056 -2.8776686 8.09967 -1.438209 3.1069906 
7059 -2.934131 7.716552 -1.0733513 3.5891838 
706 -2.2562177 7.8382998 -2.2869499 0.030480262 
7061 -3.7355528 7.834674 -2.2424023 0.17583217 
7063 -2.0207102 7.8055263 -2.9061284 1.8342571 
707 -0.82252306 5.171367 -0.94687635 -0.24667823 
7080 -4.93236 -4.76685 -0.32711053 -0.19347456 
7081 -5.7604656 -4.827806 -0.8093402 -0.50502276 
7087 -3.9534488 7.88319 -1.9251534 -1.7241992 
7089 -4.18612 8.077382 -0.661581 -1.8402199 
7102 -2.583178 7.4385853 -2.6425653 -0.5283336 
7109 -3.798075 7.759038 0.5116613 -0.14129129 
7110 -3.1108801 7.7682915 -0.10148239 0.60340077 
7111 -2.739454 7.1586804 0.1461396 -1.0345806 
7112 -1.855666 7.6607614 -0.2783062 1.0249381 
7114 -3.194588 7.438717 -0.4637932 -2.1842904 
7115 -2.446976 7.421234 -1.0179986 -1.6164479 
7116 -2.7494297 6.325898 0.6755838 -1.6031438 
7117 -3.7787206 7.918824 0.24612957 -0.21002366 
712 -4.4454627 -4.0390635 0.3467132 0.7214424 
7121 -1.4058193 7.318947 -2.6766214 -0.78141695 

8436 -5.296341 -6.0843954 -1.5919266 -2.5984786 
8437 -5.483755 -5.3641534 -0.52031463 -0.291743 
8443 -5.9818754 -6.6322403 1.0236602 0.44395107 
8444 -2.8462372 -6.1429634 1.2143767 1.1415609 
8455 -5.3987255 -6.153221 -0.10107935 4.292623 
8462 -4.54946 -4.066153 -0.83512414 4.653427 
8464 -3.6391773 6.6652565 0.51038116 7.7123475 
8465 -2.062737 6.785251 -0.18536 3.4430468 
8481 -5.8659134 -6.2220693 0.41244534 -1.0562397 
8482 -4.7422495 -4.29853 0.08246689 -1.506386 
849 -0.9995934 -0.57549214 4.6981864 7.1498213 
8491 -5.5376697 -6.497771 0.48738247 -3.5439112 
8492 -4.037568 -5.1042557 2.0276074 -1.023923 
8496 -4.8972735 -6.756177 -1.0750614 0.19987108 
8499 -3.7186959 -4.8657813 2.2053113 -1.393749 
8515 6.2543287 -2.4655714 1.6905411 -0.99220574 
8520 -4.9469366 -4.8211536 0.9310477 -0.4327632 
8525 -2.77218 8.370662 0.5063081 1.290843 
8526 -1.8281845 6.4644966 -1.07164 -0.068878174 
8530 -1.8427818 4.9780416 -0.61768913 -0.9745212 
8542 -4.2952704 -4.259686 -0.4144911 5.16

ue 0.2009344 0.16051404 1.8160989 -1.107823 
uei 1.1699941 -0.123188645 2.6043496 0.1704668 
ueŋ 0.21104443 1.2523956 1.6134317 -1.4714967 
ui 1.3699839 0.31638283 0.18383652 0.04059938 
uĩ 0.956401 -1.2380632 -0.5571544 -1.0337837 
uo -0.10275503 0.6170332 0.6300084 0.3869938 
uãŋ 1.3625522 0.46646878 -0.98832846 0.32242638 
uŋ 1.2746768 0.73440933 2.8750858 -1.8987285 
uə 0.8224441 0.9743149 0.70627725 -2.759959 
uən -0.07264726 0.48766154 2.2425823 0.78967524 
uəŋ 1.5131371 0.105328314 0.13448425 -1.7450302 
uɛi -0.062155984 1.7498821 1.0572684 0.42734483 
uɛ̃ŋ 1.1141499 0.4155503 1.3761996 -0.43612012 
uɤ 0.37112594 0.29425278 0.5825644 -1.5237023 
x -0.17004143 -1.1605552 1.1297507 -2.5181882 
y 1.1324686 0.820005 0.07715039 -1.4865043 
ya 0.18986483 0.072868176 0.61057615 -0.029815625 
yai 1.3724645 -0.044915855 -1.3849504 0.2905738 
yan -0.6229097 0.11625844 0.71791327 -0.32436237 
ye 0.9054838 -0.32998562 1.1598054 1.1216966 
yei 2.1313417 -0.81081444 -0.5266018 0.884081 
yen 

雙峰_韻母əu 0.16411754 0.5973436 -1.4677229 -1.6158922 
雙峰_韻母ɤ 0.12821944 0.7129232 -0.50582045 -0.57973844 
雙峰_韻母ɿ 0.8696912 0.7323678 2.2866971 -0.55942154 
雙峰_韻母ʅ 1.3654827 2.2680805 0.8685258 -1.0837362 
雙峰_韻母ʊ 1.3167863 0.29878256 1.6178765 -0.94427323 


In [101]:
from sklearn.manifold import TSNE
down = TSNE(n_components=2,init='pca', perplexity=8)
pca = PCA(n_components=26)
apca = pca.fit_transform(a)
atsne = down.fit_transform(apca)
atsne



array([[ 752.25946 ,   41.310814],
       [ 719.38617 ,  361.618   ],
       [-545.35614 ,  104.96197 ],
       [-441.31583 ,  127.16826 ],
       [ 501.49265 ,  430.48242 ],
       [ 394.24652 ,  407.13492 ],
       [-445.0959  , -284.49838 ],
       [-378.4402  , -202.9143  ],
       [-248.88493 ,  336.58502 ],
       [-329.33447 ,  407.07532 ],
       [ 434.486   ,  136.509   ],
       [ 528.25793 ,  188.87994 ],
       [ 438.16608 , -599.9476  ],
       [ 535.99365 , -640.9152  ],
       [ 253.21507 , -140.26506 ],
       [ 168.62106 , -208.28795 ],
       [ -62.21913 , -482.24817 ],
       [-162.26393 , -516.39124 ],
       [  30.49551 ,  627.01843 ],
       [ 126.11717 ,  672.09357 ],
       [ -61.371857,  -76.34367 ],
       [-104.38248 ,   21.756086],
       [ 157.93895 ,  154.34192 ],
       [ 134.34679 ,  257.45728 ],
       [ 573.822   , -261.48883 ],
       [ 646.84015 , -184.40965 ]], dtype=float32)

In [86]:
data['杭州_聲母']

id
1        nan
1        nan
2        nan
2        nan
18       nan
        ... 
25487    nan
25496    nan
25517    nan
25520    nan
25521    nan
Name: 杭州_聲母, Length: 8440, dtype: category
Categories (30, object): ['b', 'd', 'dz', 'dʑ', ..., 'ʔl', 'ʔm', 'ʔn', 'ʔȵ']

In [118]:
from keras.preprocessing.text import Tokenizer
sentences = [ 'The cat sat on the mat.','The dog ate my homework.']
tokenizer = Tokenizer(num_words=1080)
# 仅考虑最常见的1000个单词
tokenizer.fit_on_texts(sentences）
#构建词汇表


['全州(縣城)_聲母0',
 '全州(縣城)_聲母b',
 '全州(縣城)_聲母d',
 '全州(縣城)_聲母dz',
 '全州(縣城)_聲母dʑ',
 '全州(縣城)_聲母dʑh',
 '全州(縣城)_聲母f',
 '全州(縣城)_聲母h',
 '全州(縣城)_聲母k',
 '全州(縣城)_聲母kh',
 '全州(縣城)_聲母l',
 '全州(縣城)_聲母m',
 '全州(縣城)_聲母p',
 '全州(縣城)_聲母ph',
 '全州(縣城)_聲母s',
 '全州(縣城)_聲母t',
 '全州(縣城)_聲母th',
 '全州(縣城)_聲母ts',
 '全州(縣城)_聲母tsh',
 '全州(縣城)_聲母tɕ',
 '全州(縣城)_聲母tɕh',
 '全州(縣城)_聲母tʃ',
 '全州(縣城)_聲母tʃh',
 '全州(縣城)_聲母x',
 '全州(縣城)_聲母z',
 '全州(縣城)_聲母ɕ',
 '全州(縣城)_聲母ɡ',
 '全州(縣城)_聲母ʑ',
 '全州(縣城)_調值22',
 '全州(縣城)_調值23',
 '全州(縣城)_調值33',
 '全州(縣城)_調值35',
 '全州(縣城)_調值55',
 '全州(縣城)_韻母a',
 '全州(縣城)_韻母ai',
 '全州(縣城)_韻母au',
 '全州(縣城)_韻母aŋ',
 '全州(縣城)_韻母ẽ',
 '全州(縣城)_韻母i',
 '全州(縣城)_韻母ia',
 '全州(縣城)_韻母iau',
 '全州(縣城)_韻母in',
 '全州(縣城)_韻母io',
 '全州(縣城)_韻母ioŋ',
 '全州(縣城)_韻母iu',
 '全州(縣城)_韻母iãŋ',
 '全州(縣城)_韻母iŋ',
 '全州(縣城)_韻母iɛ',
 '全州(縣城)_韻母iɛ̃',
 '全州(縣城)_韻母o',
 '全州(縣城)_韻母oŋ',
 '全州(縣城)_韻母u',
 '全州(縣城)_韻母ua',
 '全州(縣城)_韻母uai',
 '全州(縣城)_韻母uaŋ',
 '全州(縣城)_韻母uei',
 '全州(縣城)_韻母ui',
 '全州(縣城)_韻母uãŋ',
 '全州(縣城)_韻母uə',
 '全州(縣城)_韻母uɛi',
 '全州(縣城)_韻母uɛ̃ŋ',
 '全州(縣城)_韻母y',
 '全州(縣城)_韻母yen

In [61]:
sentences = []
for i in data.index:
    a = data.loc[i]
    temp_sentence= {}
    for j in a.index:
        if not j.endswith('調類'):
            if j.endswith('聲母'):
                temp_sentence.setdefault(j.strip('_聲母'),[])
                temp_sentence[j.strip('_聲母')].append(j.strip('_聲母')+str(a[j]))
            elif j.endswith('韻母'):
                temp_sentence.setdefault(j.strip('_韻母'),[])
                temp_sentence[j.strip('_韻母')].append(j.strip('_韻母')+str(a[j]))
            elif j.endswith('調值'):
                temp_sentence.setdefault(j.strip('_調值'),[])
                temp_sentence[j.strip('_調值')].append(j.strip('_調值')+str(a[j]))
    sentences.append(temp_sentence)
sentences

[{'長沙': ['長沙t', '長沙ən', '長沙33'],
  '雙峰': ['雙峰t', '雙峰an', '雙峰55'],
  '全州(縣城)': ['全州(縣城)t', '全州(縣城)oŋ', '全州(縣城)33'],
  '灌陽(文市)': ['灌陽(文市)t', '灌陽(文市)uŋ', '灌陽(文市)22']},
 {'長沙': ['長沙nan', '長沙nan', '長沙nan'],
  '雙峰': ['雙峰nan', '雙峰nan', '雙峰nan'],
  '全州(縣城)': ['全州(縣城)t', '全州(縣城)oŋ', '全州(縣城)33'],
  '灌陽(文市)': ['灌陽(文市)t', '灌陽(文市)uŋ', '灌陽(文市)22']},
 {'長沙': ['長沙t', '長沙ən', '長沙13'],
  '雙峰': ['雙峰d', '雙峰an', '雙峰13'],
  '全州(縣城)': ['全州(縣城)d', '全州(縣城)oŋ', '全州(縣城)23'],
  '灌陽(文市)': ['灌陽(文市)d', '灌陽(文市)uŋ', '灌陽(文市)33']},
 {'長沙': ['長沙nan', '長沙nan', '長沙nan'],
  '雙峰': ['雙峰nan', '雙峰nan', '雙峰nan'],
  '全州(縣城)': ['全州(縣城)d', '全州(縣城)oŋ', '全州(縣城)23'],
  '灌陽(文市)': ['灌陽(文市)d', '灌陽(文市)uŋ', '灌陽(文市)33']},
 {'長沙': ['長沙ts', '長沙ən', '長沙33'],
  '雙峰': ['雙峰t', '雙峰an', '雙峰55'],
  '全州(縣城)': ['全州(縣城)ts', '全州(縣城)oŋ', '全州(縣城)33'],
  '灌陽(文市)': ['灌陽(文市)ts', '灌陽(文市)uŋ', '灌陽(文市)22']},
 {'長沙': ['長沙ts', '長沙ən', '長沙33'],
  '雙峰': ['雙峰t', '雙峰an', '雙峰55'],
  '全州(縣城)': ['全州(縣城)nan', '全州(縣城)nan', '全州(縣城)nan'],
  '灌陽(文市)': ['灌陽(文市)nan', '灌陽(文市)nan

In [5]:
"雙峰_調類".strip('_調類')

'雙峰'

In [16]:
joint_prob = data.groupby(['全州(縣城)_韻母', '雙峰_韻母']).size() / 3333
joint_prob

全州(縣城)_韻母  雙峰_韻母
a          a        0.007201
           an       0.000000
           e        0.000000
           i        0.000000
           ia       0.000000
                      ...   
ɿ          əu       0.000000
           ɤ        0.000000
           ɿ        0.010801
           ʅ        0.011701
           ʊ        0.000000
Length: 1548, dtype: float64

In [27]:
def comb_entr(joint_prob):
    entr = 0
    for i in joint_prob:
        entr+=i*-np.log(i)
    return entr

In [17]:
comb_entr(joint_prob)

  entr+=i*-np.log(i)
  entr+=i*-np.log(i)


nan

In [28]:
def mutual_info(data, columns):
    cross_dict = {}
    entr_dict = {}
    mutual_dict = {}
    sum_mutual = {}
    for i in columns:
        marg_prob = data[[i]].dropna().groupby(i).size() / 3333
        cross_dict[i] = marg_prob
        entr_dict[i] = comb_entr(marg_prob)
    for i in columns:
        tsum_mutual = 0
        for j in columns:
            if j is not i:
                joint_prob = data[[i, j]].dropna().groupby([i, j]).size() / 3333
                cross_dict[(i,j)] = joint_prob
                entr_dict[(i,j)] = comb_entr(joint_prob)
                mutual_dict[(i,j)] = - entr_dict[(i,j)] + entr_dict[i] + entr_dict[j]
                tsum_mutual += - entr_dict[(i,j)] + entr_dict[i] + entr_dict[j]
            sum_mutual[i] = tsum_mutual/(len(initials)-1)
        
    return cross_dict, entr_dict, mutual_dict, sum_mutual

In [29]:
mutual_info(data, initials)[3]

  entr+=i*-np.log(i)
  entr+=i*-np.log(i)


{'丹陽後巷童家橋_聲母': nan,
 '靖江_聲母': nan,
 '丹陽_聲母': nan,
 '江陰_聲母': nan,
 '常州_聲母': nan,
 '金壇西崗_聲母': nan,
 '溧陽_聲母': nan,
 '宜興_聲母': nan,
 '常熟_聲母': nan,
 '無錫_聲母': nan,
 '寶山霜草墩_聲母': nan,
 '崑山_聲母': nan,
 '蘇州_聲母': nan,
 '蘇州(當代吳語)_聲母': nan,
 '上海(三林塘)_聲母': nan,
 '溫州_聲母': nan,
 '雲和_聲母': nan,
 '開化_聲母': nan,
 '龍游_聲母': nan,
 '常山_聲母': nan,
 '江山(江山市)_聲母': nan,
 '玉山_聲母': nan,
 '遂昌_聲母': nan,
 '廣豐(永豐鎮)_聲母': nan,
 '浦城(南浦鎮城關話)_聲母': nan,
 '慶元_聲母': nan,
 '宣州市裘公鄉_聲母': nan,
 '貴池市茅坦鄉_聲母': nan,
 '寧國市莊村鄉_聲母': nan,
 '貴池市灌口鄉_聲母': nan,
 '寧國市南極鄉_聲母': nan,
 '黃山區甘棠鎮_聲母': nan,
 '蕪湖縣_聲母': nan,
 '繁昌_聲母': nan,
 '銅陵縣_聲母': nan,
 '南陵(仙坊話)_聲母': nan,
 '涇縣_聲母': nan,
 '石台_聲母': nan,
 '寶山羅店_聲母': nan,
 '上海_聲母': nan,
 '南匯周浦_聲母': nan,
 '松江_聲母': nan,
 '吳江黎里_聲母': nan,
 '吳江盛澤_聲母': nan,
 '嘉興_聲母': nan,
 '湖州雙林_聲母': nan,
 '杭州_聲母': nan,
 '餘姚_聲母': nan,
 '紹興_聲母': nan,
 '諸暨王家井_聲母': nan,
 '嵊縣崇仁_聲母': nan,
 '嵊縣太平_聲母': nan,
 '寧波_聲母': nan,
 '黃岩_聲母': nan,
 '金華_聲母': nan,
 '永康_聲母': nan,
 '衢州_聲母': nan}

In [63]:
data[['長沙_聲母','長沙_聲母']].dropna().groupby(['長沙_聲母']).size()

ValueError: Grouper for '長沙_聲母' not 1-dimensional

In [49]:
data

Unnamed: 0_level_0,長沙_聲母,長沙_韻母,長沙_調值,長沙_調類,雙峰_聲母,雙峰_韻母,雙峰_調值,雙峰_調類,全州(縣城)_聲母,全州(縣城)_韻母,全州(縣城)_調值,全州(縣城)_調類,灌陽(文市)_聲母,灌陽(文市)_韻母,灌陽(文市)_調值,灌陽(文市)_調類
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,t,ən,33,陰平,t,an,55,陰平,t,oŋ,33,陰平,t,uŋ,22,陰平
2,,,,,,,,,t,oŋ,33,陰平,t,uŋ,22,陰平
18,t,ən,13,陽平,d,an,13,陽平,d,oŋ,23,陽平,d,uŋ,33,陽平
19,,,,,,,,,d,oŋ,23,陽平,d,uŋ,33,陽平
63,ts,ən,33,陰平,t,an,55,陰平,ts,oŋ,33,陰平,ts,uŋ,22,陰平
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25487,tɕh,ie,24,入,k,e,13,陽平,tɕ,iɛ,33,陰平,tsh,ie,33,陽平
25496,,,,,,,,,,,,,0,ie,33,陽平
25517,,f,,,,,,ua,0,ua,23,陽平,x,ua,33,陽平
25520,f,a,24,入,x,ua,13,陽平,x,ua,33,陰平,x,ua,33,陽平


In [9]:
np.save('./new/xiangyu_pre',data)

In [21]:
testc = np.array(data['全州(縣城)_韻母'])

ValueError: Cannot setitem on a Categorical with a new category, set the categories first

In [193]:
lbs = []
#rep = np.array([[] for _ in range(3333)])
rep = []
meaning = data.columns[[0,4,8,12,1,5,9,13,2,6,10,14]]
#meaning = data.columns[[2,6,11,14]]
for i in range(12):
    lb = LabelBinarizer()
    print(meaning[i])
    #items = list(set(data[meaning[i]]))
    #tempd = {}
    #for q in items:
     #   if q=='nan' or q==np.nan:
      #      tempd[q]=0
    testc = np.array(data[meaning[i]],dtype='object')
    for i in range(testc.shape[0]):
        testc[i] = str(testc[i])+'e'
    testct = lb.fit_transform(testc)
    bad_ind = np.argmax(lb.transform(['nan']))
    lbs.append(lb)
    processed = np.where(np.tile((testc!='nan')[:,None],reps=(1,testct.shape[1])),testct, np.zeros_like(testct))
    processed = np.concatenate([processed[:,:bad_ind],processed[:,bad_ind+1:]],axis=1)
    rep.append(processed)
    #rep = np.where(np.tile((testc=='nan')[:,None],reps=(1,testct.shape[1])),testct,np.zeros_like(testct))
    #rep = np.concatenate([rep, np.where(np.tile((testc=='nas')[:,None],reps=(1,testct.shape[1])),testct,np.zeros_like(testct))],axis=1)
    


長沙_聲母
雙峰_聲母
全州(縣城)_聲母
灌陽(文市)_聲母
長沙_韻母
雙峰_韻母
全州(縣城)_韻母
灌陽(文市)_韻母
長沙_調值
雙峰_調值
全州(縣城)_調值
灌陽(文市)_調值


In [115]:
data['長沙_調值']

id
1         33
2        NaN
18        13
19       NaN
63        33
        ... 
25487     24
25496    NaN
25517    NaN
25520     24
25521    NaN
Name: 長沙_調值, Length: 3333, dtype: category
Categories (11, object): ['0', '13', '21', '23', ..., '35', '4', '41', '45']

In [48]:
data = data.set_index('id')
data = data.drop_duplicates()

In [124]:
phlist = []
phlist_pure = []
for i in relevant:
    local_phlist = list(set(data[i]))
    print(local_phlist)
    if 'nan' in local_phlist:
        local_phlist.remove('nan')
    if np.nan in local_phlist:
        local_phlist.remove(np.nan)
    for j in local_phlist:
        phlist.append(i+'_'+j)
        phlist_pure.append(j)

['ɕ', '0', 'ts', 'tsh', 's', 'ȵ', 'ph', 'x', 'l', 'z', 'tɕh', 't', 'a', 'p', 'ŋ', 'nan', 'm', 'th', 'tɕ', 'k', 'f', 'kh']
['ɕ', '0', 'tʂh', 'b', 'ts', 'd', 'tsh', 'dʐ', 's', 'ȵ', 'ɣ', 'ph', 'ɡ', 'x', 'l', 'ʂ', 'tɕh', 't', 'p', 'ŋ', 'tʂ', 'dʑ', 'dz', 'nan', 'm', 'th', 'tɕ', 'k', 'kh']
['ɕ', 'dʑh', '0', 'ts', 'b', 'd', 'tsh', 's', 'ph', 'ɡ', 'x', 'l', 'z', 'tɕh', 'tʃ', 't', 'ʑ', 'p', 'tʃh', 'h', 'dʑ', 'dz', 'nan', 'm', 'th', 'tɕ', 'k', 'f', 'kh']
['0', 'ts', 'b', 'd', 'tsh', 's', 'n', 'ph', 'ɡ', 'x', 'l', 'z', 'tʃ', 't', 'p', 'h', 'dz', 'ɬ', 'nan', 'm', 'th', 'tɕ', 'k', 'ʃ', 'ʔ', 'f', 'kh']
['ai', 'ɤ', 'y', 'əu', 'e', 'uai', 'iau', 'yẽ', 'io', 'iẽ', 'ɿ', 'yei', 'yai', 'ian', 'ya', 'yan', 'an', 'iəu', 'u', 'au', 'a', 'ən', 'õ', 'ia', 'i', 'uən', 'o', 'in', 'ua', 'nan', 'ɤ̃', 'uei', 'uɤ', 'yn', 'ye', 'ie', 'f', 'uan', 'ei']
['y', 'iɤ', 'ɤ', 'əu', 'e', 'uĩ', 'iɪ', 'ue', 'ʅ', 'ĩ', 'io', 'æ̃', 'n', 'ɿ', 'ui', 'yɛn', 'ya', 'an', 'iɛn', 'u', 'iĩ', 'iʊ', 'a', 'ia', 'i', 'o', 'iɒŋ', 'ua', '

In [101]:
test = list(set(data[relevant[0]]))

In [105]:
test.remove('nan')

In [126]:
set(phlist_pure)

{'0',
 '13',
 '21',
 '213',
 '22',
 '23',
 '24',
 '3',
 '31',
 '33',
 '35',
 '4',
 '41',
 '42',
 '45',
 '5',
 '55',
 'a',
 'ai',
 'an',
 'au',
 'aŋ',
 'b',
 'd',
 'dz',
 'dʐ',
 'dʑ',
 'dʑh',
 'e',
 'ei',
 'eŋ',
 'ẽ',
 'f',
 'h',
 'i',
 'ia',
 'ian',
 'iau',
 'iaŋ',
 'ie',
 'ieŋ',
 'ieʔ',
 'iẽ',
 'iĩ',
 'in',
 'io',
 'ioŋ',
 'iu',
 'iuŋ',
 'iãŋ',
 'iŋ',
 'iɒŋ',
 'iəu',
 'iəŋ',
 'iɛ',
 'iɛn',
 'iɛ̃',
 'iɤ',
 'iɪ',
 'iɪ̃',
 'iʊ',
 'ĩ',
 'k',
 'kh',
 'l',
 'm',
 'n',
 'o',
 'oŋ',
 'õ',
 'p',
 'ph',
 's',
 't',
 'th',
 'ts',
 'tsh',
 'tɕ',
 'tɕh',
 'tʂ',
 'tʂh',
 'tʃ',
 'tʃh',
 'u',
 'ua',
 'uai',
 'uan',
 'uaŋ',
 'ue',
 'uei',
 'ueŋ',
 'ui',
 'uĩ',
 'uo',
 'uãŋ',
 'uŋ',
 'uə',
 'uən',
 'uəŋ',
 'uɛi',
 'uɛ̃ŋ',
 'uɤ',
 'x',
 'y',
 'ya',
 'yai',
 'yan',
 'ye',
 'yei',
 'yen',
 'yeŋ',
 'yẽ',
 'yiŋ',
 'yn',
 'yo',
 'yɛ',
 'yɛn',
 'yɛŋ',
 'yɛ̃',
 'z',
 'ãi',
 'ãn',
 'ãŋ',
 'æ̃',
 'ŋ',
 'ȵ',
 'ɒŋ',
 'ɕ',
 'ə',
 'ən',
 'əu',
 'əŋ',
 'ɛi',
 'ɛ̃ŋ',
 'ɡ',
 'ɣ',
 'ɤ',
 'ɤ̃',
 'ɬ',
 'ɿ',
 'ʂ',
 

In [117]:
zg = recon.load('./data/middle_chinese.csv')

In [120]:
set(zg['上字'])

{'一',
 '丁',
 '七',
 '七 （士 ）',
 '丈',
 '下',
 '丑',
 '且',
 '丕',
 '丘',
 '中',
 '主',
 '乃',
 '之',
 '乎',
 '乖',
 '乘',
 '乙',
 '九',
 '乞 （苦 ）',
 '予',
 '于',
 '云',
 '五',
 '亡',
 '人',
 '仄',
 '仍',
 '仕',
 '他',
 '以',
 '伊',
 '休',
 '伯',
 '似',
 '佇',
 '何',
 '余',
 '作',
 '佳',
 '來',
 '依',
 '侯',
 '便',
 '俄',
 '俟',
 '俱',
 '倉',
 '借',
 '側',
 '側 （職 ）',
 '傍',
 '傷',
 '傾',
 '儒',
 '兄',
 '充',
 '先',
 '兒',
 '內',
 '公',
 '兵',
 '其',
 '具',
 '兼',
 '冬',
 '几',
 '分',
 '初',
 '則',
 '前',
 '創',
 '力',
 '助',
 '勒',
 '北',
 '匠',
 '匹',
 '匹 （丘 ）',
 '區',
 '千',
 '卑',
 '卓',
 '博',
 '占',
 '危',
 '即',
 '卿',
 '去',
 '叉',
 '叉 （尺 ）',
 '取',
 '口',
 '古',
 '可 （丘 ）',
 '台',
 '叱',
 '史',
 '司',
 '各',
 '吉',
 '同',
 '吐',
 '吾',
 '呂',
 '呵',
 '呼',
 '呼 （乎 ）',
 '咎',
 '哀',
 '唐',
 '商',
 '喜',
 '嘗',
 '土',
 '在',
 '堂',
 '場',
 '墜',
 '墟',
 '士',
 '士 （七 ）',
 '壯',
 '夕',
 '多',
 '大',
 '天',
 '央',
 '失',
 '夷',
 '奇',
 '女',
 '奴',
 '如',
 '妃',
 '妳',
 '姊',
 '始',
 '姑',
 '委',
 '婢',
 '子',
 '子 （千 ）',
 '孚',
 '宅',
 '安',
 '宜',
 '客',
 '寔',
 '實',
 '寫',
 '寺',
 '封',
 '將',
 '尺',
 '尼',
 '居',
 '山',
 '峯',
 

In [121]:
zg

Unnamed: 0_level_0,字形,攝,聲調,韻目,字母,開合,等第,清濁,上字,下字
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,東,通,平,東,端,合,一,全清,德,紅
2,菄,通,平,東,端,合,一,全清,德,紅
3,鶇,通,平,東,端,合,一,全清,德,紅
4,,通,平,東,端,合,一,全清,德,紅
5,,通,平,東,端,合,一,全清,德,紅
...,...,...,...,...,...,...,...,...,...,...
25524,,咸,入,乏,溪,合,三,次清,起,法
25525,,咸,入,乏,娘,合,三,次濁,女,法
25526,,咸,入,乏,娘,合,三,次濁,女,法
25527,,咸,入,乏,娘,合,三,次濁,女,法


In [188]:
np.array(rep,dtype='object')

ValueError: could not broadcast input array from shape (3333,21) into shape (3333,)

In [174]:
rep[-1][4]

array([0, 0, 0, 0])

In [182]:
np.concatenate(rep,axis=1).shape

(3333, 292)

In [192]:
np.tile(np.tile(np.arange(3),reps=(2,1)),reps=(2,1))

array([[0, 1, 2],
       [0, 1, 2],
       [0, 1, 2],
       [0, 1, 2]])

In [199]:
import joblib as jl

In [200]:
jl.dump(rep,'xy_encode')

['xy_encode']