# 相関を確認

同じデータセットでも、余計なデータが含まれている可能性があるので、入力変数同士に相関がないかを確認したほうが良い。

In [1]:
import pandas as pd

In [2]:
# Load the raw dataset
def load_raw_dataset():
    """Read (password, strength) pairs from ``data/password_strength.csv``.

    Rows are skipped when they have fewer than two columns (for example
    blank lines) or when the strength column is not a plain decimal
    integer (for example a header row).

    Returns:
        A list of ``(password, strength)`` tuples, where ``password`` is a
        ``str`` and ``strength`` is an ``int``, in file order.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    import csv

    raw_dataset = []

    # The context manager guarantees the file is closed even if parsing
    # fails part-way through. "utf-8_sig" drops a leading BOM if present.
    with open("data/password_strength.csv", newline='', encoding="utf-8_sig") as csv_file:

        # Inspect every row of the csv file
        for line in csv.reader(csv_file):

            # Blank or truncated rows carry no usable (password, strength) pair
            if len(line) < 2:
                continue

            password = line[0]
            strength = line[1]

            # Skip rows whose strength is not a number. isdecimal() is used
            # because isnumeric() also accepts characters such as "½" or "²"
            # that int() cannot parse.
            if not strength.isdecimal():
                continue

            raw_dataset.append((password, int(strength)))

    return raw_dataset

In [3]:
# Load the dataset and show its first 20 entries
raw_dataset = load_raw_dataset()
print(raw_dataset[:20])

[('jk4350599', 3), ('851108wa', 2), ('mccabe2k7', 2), ('pet2309989450156', 4), ('suppose', 0), ('luis375', 1), ('psh', 0), ('lmcc30', 1), ('7js2mz844ppc', 4), ('rockinrebeljigsaw', 4), ('ammaas6.', 2), ('777jehovah', 1), ('JackSon', 0), ('thecoombe290994', 4), ('0872219239', 3), ('jokerjhoyd', 3), ('dhang21', 2), ('bakerloo', 1), ('././horner././', 3), ('yotala', 1)]


In [4]:
def count_letters(password):
    """Count the characters of ``password`` by character class.

    Each character is assigned to exactly one class, tested in this order:
    digit (``str.isdigit``), lowercase (``str.islower``), uppercase
    (``str.isupper``); anything else is counted as a symbol.

    Returns:
        A tuple ``(length, small, large, number, symbol)`` of ints.
    """
    tally = {'small': 0, 'large': 0, 'number': 0, 'symbol': 0}

    for char in password:
        if char.isdigit():
            kind = 'number'
        elif char.islower():
            kind = 'small'
        elif char.isupper():
            kind = 'large'
        else:
            kind = 'symbol'
        tally[kind] += 1

    # Every character lands in exactly one class, so the total length is
    # the sum of the per-class counts.
    total = sum(tally.values())

    return total, tally['small'], tally['large'], tally['number'], tally['symbol']

In [5]:
def check_correlation(raw_dataset):
    """Print the correlation matrix of password features and strength.

    For every entry of ``raw_dataset`` (indexed as ``entry[0]`` = password,
    ``entry[1]`` = strength) the character-class counts from
    ``count_letters`` are collected alongside the strength, and the result
    of ``DataFrame.corr()`` over those six columns is printed.

    Returns:
        None. The matrix is only printed.
    """
    column_names = ['length', 'small', 'large', 'number', 'symbol', 'strength']
    table = {name: [] for name in column_names}

    for entry in raw_dataset:
        counts = count_letters(entry[0])

        # The five counts followed by the strength line up with column_names
        row = (*counts, entry[1])
        for name, cell in zip(column_names, row):
            table[name].append(cell)

    frame = pd.DataFrame(table, columns=column_names)

    print(frame.corr())


In [6]:
check_correlation(raw_dataset)

            length     small     large    number    symbol  strength
length    1.000000  0.563225  0.067234  0.228025  0.293437  0.624258
small     0.563225  1.000000 -0.291455 -0.522731  0.038195  0.353665
large     0.067234 -0.291455  1.000000 -0.119877  0.011491 -0.026896
number    0.228025 -0.522731 -0.119877  1.000000 -0.049923  0.207133
symbol    0.293437  0.038195  0.011491 -0.049923  1.000000  0.091079
strength  0.624258  0.353665 -0.026896  0.207133  0.091079  1.000000


上記の出力を見ると、各変数の間に強い相関（0.8 以上または -0.8 以下）はないと考えられる。