# 相関を確認

同じデータセットでも、余計なデータが含まれている可能性があるので、入力変数どうしに相関がないかを確認したほうが良い。

In [1]:
import pandas as pd

In [2]:
# 生のデータセットを読み込む
def load_raw_dataset(path="data/password_strength.csv"):
    """Load (password, strength) pairs from a CSV file.

    Each row is expected to hold the password in the first column and its
    strength in the second. Rows that have fewer than two columns, or whose
    strength field is not a plain non-negative decimal integer (for example
    a header row), are skipped.

    Args:
        path: Location of the CSV file. Defaults to the dataset path this
            notebook has always used.

    Returns:
        A list of ``(password, strength)`` tuples, where ``password`` is a
        ``str`` and ``strength`` is an ``int``, in file order.
    """
    import csv

    raw_dataset = []

    # The context manager guarantees the file is closed, even if parsing
    # raises part-way through.
    with open(path, newline='', encoding="utf-8_sig") as csv_file:
        for line in csv.reader(csv_file):

            # Blank lines come back as [] and malformed rows may have a
            # single column; neither carries a strength value.
            if len(line) < 2:
                continue

            password, strength = line[0], line[1]

            # isdecimal() only accepts characters that int() can parse;
            # isnumeric() also lets through things like '½' or '²', which
            # would make int() raise ValueError.
            if not strength.isdecimal():
                continue

            raw_dataset.append((password, int(strength)))

    return raw_dataset

In [3]:
# Load the dataset and show its first 20 entries.
raw_dataset = load_raw_dataset()
print(raw_dataset[:20])

[('nahpets', 0), ('sasa2727', 1), ('lilboost', 2), ('4408tiiyt', 3), ('schuessler', 3), ('1033704045', 3), ('smurfy23', 2), ('milagrsy gernys', 4), ('7h0mp50n', 0), ('wutsuphomeboy', 3), ('TERESA', 0), ('hoddboty1', 3), ('patty', 0), ('lucky038', 1), ('Tummykiwi151', 4), ('pipsonly1', 3), ('akamaru16', 3), ('45192480093', 3), ('smurf26', 2), ('25761838', 2)]


In [4]:
def count_letters(password):
    """Count the characters of ``password`` by category.

    Every character is assigned to exactly one category, checked in this
    order: digit (``str.isdigit``), lowercase (``str.islower``), uppercase
    (``str.isupper``), and otherwise symbol.

    Returns:
        A tuple ``(length, small, large, number, symbol)`` where ``length``
        is the total number of characters and the remaining entries are the
        per-category counts.
    """
    tally = {'number': 0, 'small': 0, 'large': 0, 'symbol': 0}

    for char in password:
        # The order of these checks defines the category priority.
        if char.isdigit():
            kind = 'number'
        elif char.islower():
            kind = 'small'
        elif char.isupper():
            kind = 'large'
        else:
            kind = 'symbol'
        tally[kind] += 1

    return (
        len(password),
        tally['small'],
        tally['large'],
        tally['number'],
        tally['symbol'],
    )

In [5]:
def check_correlation(raw_dataset):
    """Print the correlation matrix of password features and strength.

    For every entry of ``raw_dataset`` (password at index 0, strength at
    index 1) the character counts from ``count_letters`` are combined with
    the strength into one row. The rows are loaded into a DataFrame and the
    result of ``DataFrame.corr()`` is printed.

    Args:
        raw_dataset: Iterable of entries indexable as
            ``entry[0]`` (password) and ``entry[1]`` (strength).
    """
    columns = ['length', 'small', 'large', 'number', 'symbol', 'strength']

    # One row per password: the five counts followed by the strength label.
    rows = []
    for entry in raw_dataset:
        counts = count_letters(entry[0])
        rows.append(
            (counts[0], counts[1], counts[2], counts[3], counts[4], entry[1])
        )

    # Pivot the rows into per-column lists keyed by column name.
    data = {
        name: [row[index] for row in rows]
        for index, name in enumerate(columns)
    }

    frame = pd.DataFrame(data, columns=columns)
    print(frame.corr())


In [6]:
check_correlation(raw_dataset)

            length     small     large    number    symbol  strength
length    1.000000  0.823502  0.004417  0.257966  0.793979  0.338257
small     0.823502  1.000000 -0.212393 -0.233534  0.604880  0.283336
large     0.004417 -0.212393  1.000000 -0.124606 -0.000108 -0.072549
number    0.257966 -0.233534 -0.124606  1.000000  0.097025  0.215947
symbol    0.793979  0.604880 -0.000108  0.097025  1.000000  0.084607
strength  0.338257  0.283336 -0.072549  0.215947  0.084607  1.000000


上記の出力をみると、length と small（0.82）を除き、各入力変数の間に強い相関（0.8以上 または -0.8以下）はないと考えられる。ただし、length と symbol（0.79）も閾値に近い点には注意が必要である。