# パスワードの強さを確認しよう

まずは、データセットを読み込んで、確認しましょう。

In [1]:
# Load the raw dataset.
def load_raw_dataset():
    """Read (password, strength) pairs from ``data/password_strength.csv``.

    The first column of each row is taken as the password and the second
    column as its strength label. Rows are skipped when they have fewer than
    two columns or when the strength column is not a plain decimal integer.

    Returns:
        list[tuple[str, int]]: One ``(password, strength)`` tuple per
        accepted row, in file order.
    """
    import csv

    raw_dataset = []

    # The context manager closes the file even if parsing raises part-way.
    with open("data/password_strength.csv", newline='', encoding="utf-8_sig") as csv_file:

        # Inspect every row of the csv file.
        for line in csv.reader(csv_file):

            # Blank or truncated rows have no strength column to read.
            if len(line) < 2:
                continue

            password = line[0]
            strength = line[1]

            # Skip rows whose strength is not a number. isdecimal() is used
            # rather than isnumeric() because the latter also accepts
            # characters such as "½" that int() cannot parse.
            if not strength.isdecimal():
                continue

            raw_dataset.append((password, int(strength)))

    return raw_dataset

In [2]:
# Load the dataset and display its first 20 entries.
raw_dataset = load_raw_dataset()
print(raw_dataset[0:20])

[('nahpets', 0), ('sasa2727', 1), ('lilboost', 2), ('4408tiiyt', 3), ('schuessler', 3), ('1033704045', 3), ('smurfy23', 2), ('milagrsy gernys', 4), ('7h0mp50n', 0), ('wutsuphomeboy', 3), ('TERESA', 0), ('hoddboty1', 3), ('patty', 0), ('lucky038', 1), ('Tummykiwi151', 4), ('pipsonly1', 3), ('akamaru16', 3), ('45192480093', 3), ('smurf26', 2), ('25761838', 2)]


## 前処理

パスワードの文字列はそのままではネットワークの入力にできないため、前処理をしなければならない。

今回は、任意の文字列を６つのパラメータ（数値）に変換し、入力データにする。記号の有り無し以外の値は、20を上限として20で割り、0〜1の範囲に正規化する。６つのパラメータは次の通り。
* パスワードの長さ（最大20文字）
* 小文字の数
* 大文字の数
* 数字の数
* 記号の有り無し（１か０）
* 最も多く出現する文字の出現回数

In [3]:
# Count each kind of character.
def count_letters(password):
    """Tally the characters of ``password`` by category.

    Each character is placed in exactly one category, checked in this order:
    digit, lowercase, uppercase, and finally "symbol" for anything else.

    Returns:
        tuple: ``(length, lowercase, uppercase, digits, symbols)``.
    """
    total = 0
    lowers = uppers = digits = others = 0

    for total, char in enumerate(password, start=1):
        if char.isdigit():
            digits += 1
            continue
        if char.islower():
            lowers += 1
            continue
        if char.isupper():
            uppers += 1
            continue
        # Neither a digit nor a cased letter: counted as a symbol.
        others += 1

    return total, lowers, uppers, digits, others

In [4]:
# Sample with four digits, two lowercase letters, two uppercase letters and one symbol.
count = count_letters("1234abCD!")
print(count)

(9, 2, 2, 4, 1)


In [5]:
# Count repeated characters.
def count_repeated(password):
    """Return how many times the most frequent character occurs.

    Occurrences are counted over the whole string, not only consecutive
    runs. An empty password yields 0.
    """
    occurrences = {}

    for char in password:
        occurrences[char] = occurrences.get(char, 0) + 1

    # default=0 covers the empty password, where there is nothing to tally.
    return max(occurrences.values(), default=0)

In [6]:
# Every character is distinct, so the highest per-character count is 1.
count = count_repeated("1234abCD!")
print(count)

# "c" occurs five times, more than any other character.
count = count_repeated("aaabbbccccc")
print(count)

1
5


In [7]:
# Turn a string into numbers.
def password_to_input(password):
    """Encode ``password`` as the 6-element feature list fed to the network.

    Layout of the returned list:
        0: length, capped at 20 and divided by 20
        1: lowercase count, capped at 20 and divided by 20
        2: uppercase count, capped at 20 and divided by 20
        3: digit count, capped at 20 and divided by 20
        4: 1 if the password contains any symbol, otherwise 0
        5: occurrences of the most frequent character, capped at 20 and
           divided by 20
    """
    cap = 20

    length, lowers, uppers, digits, symbols = count_letters(password)
    top_repeat = count_repeated(password)

    def scaled(value):
        # Clamp to the cap, then map into the 0..1 range.
        return min(value, cap) / cap

    return [
        scaled(length),
        scaled(lowers),
        scaled(uppers),
        scaled(digits),
        min(symbols, 1),  # presence flag, not a ratio
        scaled(top_repeat),
    ]

In [8]:
# Show the raw character counts next to the scaled feature vector for the same password.
test = "1234abCD!"
print (count_letters(test))
print (password_to_input(test))

(9, 2, 2, 4, 1)
[0.45, 0.1, 0.1, 0.2, 1, 0.05]


In [9]:
# Same comparison for a longer password that contains repeated characters.
test = "1234abCDDDD!!!!"
print (count_letters(test))
print (password_to_input(test))

(15, 2, 5, 4, 4)
[0.75, 0.1, 0.25, 0.2, 1, 0.2]


In [10]:
# Convert a label to "one-hot".
def strength_to_onehot(strength):
    """Convert a strength label into a 5-element one-hot list.

    Args:
        strength: Integer class index in the range 0..4.

    Returns:
        list[float]: 1.0 at position ``strength`` and 0.0 everywhere else.

    Raises:
        IndexError: If ``strength`` is outside 0..4. Negative values are
            rejected explicitly, because plain list indexing would count
            from the end and silently produce a wrong label.
    """
    num_classes = 5

    if not 0 <= strength < num_classes:
        raise IndexError(
            "strength must be between 0 and %d, got %r" % (num_classes - 1, strength)
        )

    y = [0.0] * num_classes
    y[strength] = 1.0

    return y

In [11]:
import numpy as np

def prepare_dataset(raw_dataset):
    """Convert raw ``(password, strength)`` records into NumPy arrays.

    Args:
        raw_dataset: Iterable of records whose item 0 is the password and
            item 1 is the strength label.

    Returns:
        tuple: ``(data_x, data_y)`` where ``data_x`` holds the feature
        vectors from ``password_to_input`` and ``data_y`` the one-hot labels
        from ``strength_to_onehot``, both in input order.
    """
    # Encode each record in a single pass so every feature vector stays
    # paired with its own label.
    encoded = [
        (password_to_input(record[0]), strength_to_onehot(record[1]))
        for record in raw_dataset
    ]

    data_x = np.array([features for features, _ in encoded])
    data_y = np.array([label for _, label in encoded])

    return data_x, data_y

# Convert the whole raw dataset into the feature and label arrays used for training.
data_x, data_y = prepare_dataset(raw_dataset)

In [12]:
# Inspect the first 20 feature rows and their matching one-hot labels.
print(data_x[0:20])
print(data_y[0:20])

[[0.35 0.35 0.   0.   0.   0.05]
 [0.4  0.2  0.   0.2  0.   0.1 ]
 [0.4  0.4  0.   0.   0.   0.1 ]
 [0.45 0.25 0.   0.2  0.   0.1 ]
 [0.5  0.5  0.   0.   0.   0.15]
 [0.5  0.   0.   0.5  0.   0.15]
 [0.4  0.3  0.   0.1  0.   0.05]
 [0.75 0.7  0.   0.   1.   0.1 ]
 [0.4  0.2  0.   0.2  0.   0.1 ]
 [0.65 0.65 0.   0.   0.   0.1 ]
 [0.3  0.   0.3  0.   0.   0.1 ]
 [0.45 0.4  0.   0.05 0.   0.1 ]
 [0.25 0.25 0.   0.   0.   0.1 ]
 [0.4  0.25 0.   0.15 0.   0.05]
 [0.6  0.4  0.05 0.15 0.   0.1 ]
 [0.45 0.4  0.   0.05 0.   0.1 ]
 [0.45 0.35 0.   0.1  0.   0.15]
 [0.55 0.   0.   0.55 0.   0.1 ]
 [0.35 0.25 0.   0.1  0.   0.05]
 [0.4  0.   0.   0.4  0.   0.1 ]]
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]

In [13]:
from keras.layers import Input, Dense
from keras.models import Model
from keras.optimizers import Adam

def create_model():
    """Build, summarise and compile the password-strength classifier.

    The network takes the 6 input features, passes them through ReLU dense
    layers of 64, 32, 16, 8 and 8 units, and ends in a 5-unit softmax layer.
    The layer summary is printed as a side effect, and the model is compiled
    with the "adam" optimizer and categorical cross-entropy loss.

    Returns:
        The compiled Keras ``Model``.
    """
    hidden_widths = (64, 32, 16, 8, 8)

    inputs = Input(shape=(6,))

    # Stack the hidden layers in order, each feeding the next.
    hidden = inputs
    for width in hidden_widths:
        hidden = Dense(units=width, activation="relu")(hidden)

    outputs = Dense(units=5, activation="softmax")(hidden)

    network = Model(inputs=inputs, outputs=outputs)
    network.summary()

    network.compile(optimizer="adam", loss="categorical_crossentropy")

    return network
    
# Build the model once at module level; the training call and test_password use this global.
model = create_model()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 6)]               0         
                                                                 
 dense (Dense)               (None, 64)                448       
                                                                 
 dense_1 (Dense)             (None, 32)                2080      
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 8)                 136       
                                                                 
 dense_4 (Dense)             (None, 8)                 72        
                                                                 
 dense_5 (Dense)             (None, 5)                 45    

In [14]:
# Start training.
# Hold out 20% of the data for validation.
model.fit(data_x, data_y, epochs=20, batch_size=100, validation_split=0.2)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13ab2d07280>

In [15]:
# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress = True)

def test_password(password):
    """Predict and print the strength of a single password.

    Uses the module-level ``model``. Prints the raw class probabilities, the
    most likely strength shifted to a 1-5 scale, and that class's
    probability as a percentage. Returns nothing.
    """
    # Encode the password and wrap it as a batch containing one sample.
    features = np.array([password_to_input(password)])

    # Run inference and keep the probabilities of the only sample.
    probabilities = model.predict(features)[0]
    print("Prediction:", probabilities)

    # Pick the class the model is most confident about.
    top_class = np.argmax(probabilities)

    # Class indices start at 0; show the strength as 1-5.
    print("Password Strength", top_class + 1)

    # Show the confidence as a percentage as well.
    print("Confidence:", probabilities[top_class] * 100)

In [16]:
# Lowercase-only word.
test_password("password")

Prediction: [0.18460153 0.28672826 0.52471685 0.00395205 0.00000136]
Password Strength 3
Confidence: 52.4716854095459


In [17]:
# Lowercase letters with two digits and one trailing symbol.
test_password("1sdssig3!")

Prediction: [0.01036266 0.11609212 0.23350212 0.64004177 0.00000134]
Password Strength 4
Confidence: 64.00417685508728


In [18]:
# Lowercase letters followed by digits, no symbols or uppercase.
test_password("abcd1234")

Prediction: [0.02556391 0.26048806 0.712221   0.00172674 0.00000025]
Password Strength 3
Confidence: 71.22210264205933


In [19]:
# Short password that mixes uppercase, lowercase, digits and symbols.
test_password("Ab#d!23X")

Prediction: [0.07296753 0.2380637  0.6867323  0.00223631 0.00000014]
Password Strength 3
Confidence: 68.6732292175293


In [20]:
# Longer password that mixes uppercase, lowercase, digits and several symbols.
test_password("1e@332!!#asAAS")

Prediction: [0.0003262  0.00232525 0.00707003 0.06900246 0.92127603]
Password Strength 5
Confidence: 92.12760329246521


In [21]:
# A single lowercase letter repeated for the whole password.
test_password("aaaaaaaaaaaaaa")

Prediction: [0.9941924  0.00031807 0.00009416 0.0009071  0.00448819]
Password Strength 1
Confidence: 99.41924214363098


In [22]:
# A single digit repeated for the whole password; print its features before predicting.
test = "11111111111"
print(password_to_input(test))
test_password(test)

[0.55, 0.0, 0.0, 0.55, 0, 0.55]
Prediction: [0.9956672  0.00096312 0.00040482 0.00296348 0.0000013 ]
Password Strength 1
Confidence: 99.56672191619873
