In [69]:
import re
import random
import sys
import copy
import itertools
from queue import PriorityQueue

In [14]:
def preprocess(path, postfix, seed, eps):
    """Filter a raw password list and split it into training and test files.

    Lines are stripped; any line containing a non-ASCII character or a space
    is dropped. The remaining passwords are shuffled with ``seed`` and split:
    the first ``int(len * (1 - eps))`` go to ``trainword_<postfix>.txt`` and
    the rest to ``testword_<postfix>.txt`` (both in the working directory).

    Args:
        path: Path of the raw password file (read as UTF-8, one per line).
        postfix: Suffix used in the two output file names.
        seed: Seed passed to ``random.seed`` so the split is reproducible.
        eps: Fraction of the passwords held out as the test set.

    Returns:
        None. If ``path`` does not exist, an error is printed to stderr and
        the function returns without writing any output file.
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    passwd = []
    try:
        with open(path, encoding='utf-8') as fp:
            for line in fp:
                line = line.strip()
                if non_ascii.search(line) or (' ' in line):
                    continue
                passwd.append(line)
    except FileNotFoundError:
        print("File does not exist", file=sys.stderr)
        # Stop here: continuing would report success and leave two empty
        # split files that downstream cells would silently consume.
        return

    print("数据加载成功")
    # Split the data set (training set and test set)
    random.seed(seed)
    random.shuffle(passwd)
    split_point = int(len(passwd) * (1-eps))
    trainword = passwd[:split_point]
    testword = passwd[split_point:]

    with open("trainword_{}.txt".format(postfix), "w", encoding='utf-8') as f:
        f.writelines(pw + '\n' for pw in trainword)
    print("训练集生成成功")
    with open("testword_{}.txt".format(postfix), "w", encoding='utf-8') as f:
        f.writelines(pw + '\n' for pw in testword)
    print("测试集生成成功")

In [15]:
preprocess("./csdn.txt", "csdn", 0, 0.3)    # preprocess the raw data in ./csdn.txt: seed 0, 30% held out as the test set

数据加载成功
训练集生成成功
测试集生成成功


In [16]:
preprocess("./yahoo.txt", "yahoo", 0, 0.3)    # preprocess the raw data in ./yahoo.txt: seed 0, 30% held out as the test set

数据加载成功
训练集生成成功
测试集生成成功


In [11]:
# Read a data set
def loadpass(path):
    """Return every line of ``path`` as a list of stripped strings.

    The file is decoded as UTF-8 and undecodable bytes are ignored. Blank
    lines are kept (as empty strings) and duplicates are preserved.
    """
    with open(path, encoding='utf-8', errors='ignore') as wordList:
        return [row.strip() for row in wordList]

In [20]:
# Tally how many times each segment occurs under each structure tag
def count(part, m, a):
    """Increment the occurrence counter of ``part`` under tag ``m`` in ``a``.

    ``a`` is a two-level dict ``{tag: {segment: occurrences}}`` and is
    updated in place; missing levels are created on demand.
    """
    bucket = a.setdefault(m, {})
    bucket[part] = bucket.get(part, 0) + 1

In [18]:
def statistic(trainword):
    """Count password structures and the segments that fill them.

    Each password is cut into maximal runs of ASCII letters (``L``), ASCII
    digits (``D``) and everything else (``S``). A run of length ``n`` gets
    the tag ``<type><n>`` (e.g. ``L6``); the concatenated tags form the
    password's structure (e.g. ``L6D1``).

    Args:
        trainword: Iterable of password strings.

    Returns:
        ``(mode, alpha, digit, special)`` where ``mode`` maps each structure
        to its number of occurrences and the other three map
        ``tag -> {segment: occurrences}`` for letters, digits and symbols.
    """
    mode, alpha, digit, special = {}, {}, {}, {}
    tables = {'L': alpha, 'D': digit, 'S': special}
    # '+' (not '*') so no empty matches are produced: segmentation no longer
    # depends on how the regex engine resumes after an empty match.
    pattern = re.compile(
        r'(?P<L>[A-Za-z]+)|(?P<D>[0-9]+)|(?P<S>[^a-zA-Z0-9]+)', re.ASCII)
    for password in trainword:
        s = ''          # structure of the current password
        for match in pattern.finditer(password):
            part = match.group()
            # Classify by the alternative that actually matched. str.isalpha /
            # str.isdigit are Unicode-aware and would file a non-ASCII letter
            # or digit under L/D although the regex treated it as a symbol.
            kind = match.lastgroup
            m = kind + str(len(part))
            count(part, m, tables[kind])
            s += m
        mode[s] = mode.get(s, 0) + 1
    return mode, alpha, digit, special

In [27]:
def alphatodict(alpha, path='wordlist.txt'):
    """Write every letter segment found in ``alpha`` to a word-list file.

    Args:
        alpha: Dict ``{tag: {segment: value}}``; only the inner keys
            (the segments) are written, one per line, in iteration order.
        path: Output file; defaults to ``wordlist.txt`` in the working
            directory. The file is overwritten.

    Returns:
        None.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(path, 'w', encoding='utf-8') as f:
        for segments in alpha.values():
            f.writelines(word + '\n' for word in segments)

In [28]:
def base_probability(d):
    """Normalise the counts in ``d`` in place so that they sum to 1.

    ``d`` maps a key to a number; afterwards each value is its share of the
    original total. An empty dict is left untouched.
    """
    total = sum(d.values())
    for name in d:
        d[name] = d[name] * 1.0 / total

In [63]:
# Convert occurrence counts into relative frequencies
def probability(d):
    """Normalise every inner dict of ``d`` in place.

    ``d`` maps a tag to ``{segment: count}``; afterwards the values inside
    each tag's dict are that segment's share of the tag's total.
    """
    for table in d.values():
        total = sum(table.values())
        for segment in table:
            table[segment] = table[segment]*1.0 / total

In [34]:
# Load the test set
def load_testword(path):
    """Return ``{password: occurrences}`` for the stripped lines of ``path``.

    The file is decoded as UTF-8 with undecodable bytes ignored.
    """
    testword = {}
    with open(path, encoding='utf-8', errors='ignore') as wordList:
        for raw in wordList:
            word = raw.strip()
            testword[word] = testword.get(word, 0) + 1
    return testword

In [61]:
def save_anlysis_result(fpath, freqs, threshold = 0):
    """Write the entries of ``freqs`` whose value exceeds ``threshold``.

    ``freqs`` is an iterable of indexable pairs ``(label, value)``. Each kept
    entry becomes one line ``"<label> <value with 5 decimals>"`` in ``fpath``,
    which is overwritten.
    """
    with open(fpath, "w") as f:
        f.writelines(
            "{} {:.5f}\n".format(entry[0], entry[1])
            for entry in freqs
            if entry[1] > threshold
        )

In [70]:
def parsebase(base):
    """Split a structure string into its tags, e.g. 'L6D1' --> ['L6', 'D1'].

    A tag is one of the letters L, D or S followed by one or more ASCII
    digits; anything else in ``base`` is skipped.
    """
    return [tag.group() for tag in re.finditer(r'[LDS]\d+', base, re.ASCII)]

# 分析 CSDN 数据集

In [17]:
trainword_csdn = loadpass('trainword_csdn.txt')    # load the training set

In [43]:
# Count structures and letter / digit / symbol segments over the training set.
oribase_csdn, orialpha_csdn, oridigit_csdn, orispecial_csdn = statistic(trainword_csdn)

In [65]:
alphatodict(orialpha_csdn)   # build the dictionary from the letter segments extracted from the training set; writes wordlist.txt

In [None]:
# Display the whole letter-segment table (this cell has no execution count or output).
orialpha_csdn

In [44]:
base_probability(oribase_csdn)   # convert each structure's count into its relative frequency (in place)

In [45]:
# Rank structures from most to least frequent; the result is a list of
# (structure, frequency) pairs.
# NOTE(review): this rebinds oribase_csdn from a dict to a list, so the cell
# cannot be re-run without re-running statistic() first (a list has no .items()).
oribase_csdn = sorted(oribase_csdn.items(), key=lambda t: t[1], reverse=True)

In [66]:
# Display the whole letter-segment table; the captured output shows raw counts per tag.
orialpha_csdn

{'L2': {'xy': 2064,
  'qw': 3098,
  'yk': 417,
  'wq': 1663,
  'th': 415,
  'DO': 20,
  'do': 97,
  'en': 190,
  'jy': 1201,
  'hu': 1856,
  'bd': 211,
  'zz': 3712,
  'kb': 166,
  'zy': 3246,
  'hr': 257,
  'ch': 1391,
  'yu': 3329,
  'li': 6788,
  'ok': 2405,
  'xw': 611,
  'lg': 851,
  'Cc': 42,
  'zt': 797,
  'cs': 1996,
  'mt': 221,
  'qq': 25653,
  'bu': 124,
  'hy': 1616,
  'xf': 943,
  'px': 201,
  'zj': 2427,
  'wu': 2274,
  'cw': 531,
  'zc': 1497,
  'he': 1331,
  'ZK': 45,
  'zx': 3853,
  'cp': 374,
  'aa': 16246,
  'xu': 2173,
  'sx': 645,
  'ly': 3497,
  'kl': 522,
  'Aa': 616,
  'QQ': 1992,
  'll': 2766,
  'ab': 3977,
  'az': 859,
  'yy': 3914,
  'Yy': 50,
  'lj': 2301,
  'yq': 511,
  'qe': 72,
  'ck': 580,
  'ws': 1635,
  'FL': 31,
  'jw': 377,
  'jh': 673,
  'sm': 387,
  'jp': 302,
  'lz': 1149,
  'hm': 383,
  'kk': 1501,
  'ZH': 188,
  'dv': 61,
  'zb': 1020,
  'cb': 599,
  'yp': 344,
  'sz': 949,
  'cn': 1694,
  'wx': 1319,
  'om': 34,
  'sy': 1269,
  'le': 304,
  'lq

In [67]:
# Turn the letter, digit and symbol counts into frequencies, then rank the
# segments of every tag from most to least frequent. Each tag's dict is
# replaced by a list of (segment, frequency) pairs.
for table in (orialpha_csdn, oridigit_csdn, orispecial_csdn):
    probability(table)
    for key in table:
        table[key] = sorted(table[key].items(), key=lambda t: t[1], reverse=True)

In [119]:
# Number of distinct segments recorded under each tag
alphastats = {key: len(ranked) for key, ranked in orialpha_csdn.items()}     # letter segments
digitstats = {key: len(ranked) for key, ranked in oridigit_csdn.items()}     # digit segments
symbolstats = {key: len(ranked) for key, ranked in orispecial_csdn.items()}  # symbol segments

In [62]:
# Save every structure whose frequency exceeds 0.0001 to ./pattern_csdn.txt.
save_anlysis_result("./pattern_csdn.txt", oribase_csdn, 0.0001)

In [68]:
# Display the whole letter-segment table; the captured output shows, per tag,
# (segment, frequency) pairs in descending frequency.
orialpha_csdn

{'L2': [('qq', 0.06542346859403173),
  ('aa', 0.041432568151040405),
  ('li', 0.017311601170088776),
  ('as', 0.012937795040639417),
  ('ab', 0.010142639636629798),
  ('yy', 0.009981969207384719),
  ('zx', 0.009826399426687104),
  ('zz', 0.009466803704090976),
  ('ly', 0.008918483985238724),
  ('yu', 0.008490029507251848),
  ('zy', 0.00827835259253214),
  ('qw', 0.007900904600019892),
  ('mm', 0.007582114065803467),
  ('ww', 0.0073857390967261485),
  ('cc', 0.007120505372258083),
  ('ll', 0.007054196941141066),
  ('xx', 0.006508427546562545),
  ('wo', 0.006301851280390302),
  ('zj', 0.00618963701234612),
  ('ok', 0.0061335298783240285),
  ('zl', 0.005911651666509397),
  ('lj', 0.005868296153855963),
  ('wu', 0.005799437398465215),
  ('zh', 0.005674471509052376),
  ('wy', 0.005557156592460731),
  ('xu', 0.005541854646818343),
  ('ss', 0.005429640378774161),
  ('wj', 0.005335278380646099),
  ('xy', 0.00526386930098162),
  ('cs', 0.005090447250367884),
  ('QQ', 0.0050802459532729586),
  (

In [74]:
# First (highest-frequency) entry of the ranked 5-digit segments: the segment itself.
oridigit_csdn['D5'][0][0]

'12345'

In [75]:
# Frequency of that first 5-digit segment.
oridigit_csdn['D5'][0][1]

0.11413416086313283

In [78]:
# Matchers for the type letter at the start of a tag: L = letters, D = digits, S = symbols
Lp, Dp, Sp = (re.compile(letter) for letter in ('L', 'D', 'S'))

In [125]:
class QItem:
    """A candidate guess together with the state needed to derive its successors.

    Attributes are taken from ``data``: ``possible_pwd``, ``subpatterns``,
    ``subindexs`` and ``subvalues``. ``prob`` is stored as given.
    """

    def __init__(self, prob, data):
        self.prob = prob
        for field in ('possible_pwd', 'subpatterns', 'subindexs', 'subvalues'):
            setattr(self, field, data[field])

    def __lt__(self, other):
        # Deliberately inverted: the item with the larger probability sorts
        # first, so a min-first priority queue yields the most probable item.
        return self.prob > other.prob

In [145]:
# Seed the priority queue with the most probable guess of every structure
queue = PriorityQueue()
gen_count = 0
gen_prob = 1 # maximum value (value<1)
gen_filter_prob = 1e-8
gen_count_max = 1000000
vis = {}
for base in oribase_csdn:         # every (structure, frequency) pair
    if base[1] < 0.0001: continue # skip structures whose frequency is too low

    subpatterns = parsebase(base[0])
    subvalues = []
    prob = base[1]

    for s in subpatterns: # pick the top-ranked segment for each tag
        if Lp.match(s): table = orialpha_csdn       # letters
        elif Dp.match(s): table = oridigit_csdn     # digits
        elif Sp.match(s): table = orispecial_csdn   # symbols
        else:
            print("Error")
            sys.exit(1)
        best = table[s][0]
        subvalues.append(best[0])
        prob *= best[1]
    possible_pwd = "".join(subvalues)

    if prob < gen_filter_prob: continue # too unlikely: never enters the queue
    qitem = QItem(prob=prob, data={
        "possible_pwd": possible_pwd,
        "subpatterns": subpatterns,
        "subindexs": [0] * len(subpatterns),   # rank currently chosen for each tag
        "subvalues": subvalues,
    })
    if vis.get(qitem.possible_pwd, False): continue
    vis[qitem.possible_pwd] = True
    queue.put(qitem)

In [146]:
# Pop guesses from the queue (QItem ordering puts the highest probability
# first), write each to dict.txt, and enqueue its successors: for every
# position, the guess that uses the next-ranked segment at that position.
# `vis` keeps a guess from being enqueued twice; successors whose probability
# falls below gen_filter_prob are dropped.
gen_count = 0
# Candidate table for each tag type letter (L = letters, D = digits, S = symbols).
segment_tables = {'L': orialpha_csdn, 'D': oridigit_csdn, 'S': orispecial_csdn}
limit_reached = False
with open('dict.txt', 'w') as fp:
    while not queue.empty():
        # Check the limit before popping so that exactly gen_count_max
        # guesses are written (checking after the write emitted one extra).
        if gen_count >= gen_count_max:
            limit_reached = True
            break

        qitem = queue.get()
        gen_count += 1
        fp.write("{} {:.9f}\n".format(qitem.possible_pwd, qitem.prob))
        # Report once per written guess; inside the successor loop the same
        # count was printed several times or not at all.
        if gen_count % 100 == 0: print("process:", gen_count)

        for i, s in enumerate(qitem.subpatterns):
            table = segment_tables.get(s[:1])
            if table is None:
                print("Error")
                sys.exit(1)
            candidates = table[s]

            current_index = qitem.subindexs[i]
            next_index = current_index + 1
            # This position has no further candidate.
            if next_index >= len(candidates): continue

            original = candidates[current_index]
            new = candidates[next_index]
            # Swap the probability factor of the replaced segment.
            new_prob = qitem.prob / original[1] * new[1]
            if new_prob < gen_filter_prob: continue

            new_subvalues = list(qitem.subvalues)
            new_subvalues[i] = new[0]
            new_pwd = ''.join(new_subvalues)
            if vis.get(new_pwd, False): continue
            vis[new_pwd] = True

            # Build the successor only once it is known to be needed; the
            # tag list is never modified, so it can be shared.
            new_subindexs = list(qitem.subindexs)
            new_subindexs[i] = next_index
            queue.put(QItem(prob=new_prob, data={
                "possible_pwd": new_pwd,
                "subpatterns": qitem.subpatterns,
                "subindexs": new_subindexs,
                "subvalues": new_subvalues,
            }))

    if limit_reached:
        while not queue.empty(): queue.get()   # leave the queue empty
        print("已达到生成数量")
print("队列为空或者结束")

process: 100
process: 200
process: 300
process: 400
process: 500
process: 600
process: 700
process: 800
process: 900
process: 1000
process: 1100
process: 1200
process: 1300
process: 1400
process: 1500
process: 1600
process: 1700
process: 1700
process: 1800
process: 1900
process: 2000
process: 2100
process: 2200
process: 2300
process: 2400
process: 2500
process: 2600
process: 2700
process: 2800
process: 2900
process: 3000
process: 3100
process: 3200
process: 3300
process: 3400
process: 3500
process: 3600
process: 3700
process: 3800
process: 3800
process: 3900
process: 4000
process: 4100
process: 4200
process: 4300
process: 4400
process: 4500
process: 4600
process: 4700
process: 4800
process: 4900
process: 5000
process: 5100
process: 5200
process: 5300
process: 5400
process: 5500
process: 5600
process: 5700
process: 5800
process: 5900
process: 6000
process: 6100
process: 6200
process: 6300
process: 6400
process: 6500
process: 6600
process: 6700
process: 6800
process: 6900
process: 7000
p