In [2]:
# This project is about building Password Strength Checker Using Machine Learning

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Lets begin by importing Python Libraries

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import cohen_kappa_score
import getpass


In [5]:
# Reading the Dataset into a Dataframe

data = pd.read_csv(r'C:\Users\Accounts01\Desktop\Folders 1\Data set for projects\data.csv', error_bad_lines = False)

b'Skipping line 2810: expected 2 fields, saw 5\nSkipping line 4641: expected 2 fields, saw 5\nSkipping line 7171: expected 2 fields, saw 5\nSkipping line 11220: expected 2 fields, saw 5\nSkipping line 13809: expected 2 fields, saw 5\nSkipping line 14132: expected 2 fields, saw 5\nSkipping line 14293: expected 2 fields, saw 5\nSkipping line 14865: expected 2 fields, saw 5\nSkipping line 17419: expected 2 fields, saw 5\nSkipping line 22801: expected 2 fields, saw 5\nSkipping line 25001: expected 2 fields, saw 5\nSkipping line 26603: expected 2 fields, saw 5\nSkipping line 26742: expected 2 fields, saw 5\nSkipping line 29702: expected 2 fields, saw 5\nSkipping line 32767: expected 2 fields, saw 5\nSkipping line 32878: expected 2 fields, saw 5\nSkipping line 35643: expected 2 fields, saw 5\nSkipping line 36550: expected 2 fields, saw 5\nSkipping line 38732: expected 2 fields, saw 5\nSkipping line 40567: expected 2 fields, saw 5\nSkipping line 40576: expected 2 fields, saw 5\nSkipping line 

In [6]:
# Data Pre-Processing
# Checking the first 5 rows

data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [7]:
# Lets check the shape of our data

data.shape

(669640, 2)

In [8]:
# Our Data set has 669 640 rows and 2 Columns

In [9]:
# Lets check if there are null values

data.isnull().sum()

password    1
strength    0
dtype: int64

In [10]:
# as it can be seen there is a null value on Password, the best way to deal with this is to drop the data point with a null value

data = data.dropna()

# Lets check again

data.isnull().sum()

password    0
strength    0
dtype: int64

In [11]:
# Its all cleared

In [12]:
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [13]:
# Our data set has two columns, password and strength columns. lets find the unique values in our strength column

data['strength'].unique()

array([1, 2, 0], dtype=int64)

In [14]:
# strength column has 3 unique numbers, 0, 1, and 2
# 0 means the password strength is weak
# 1 means the password strength is medium
# 2 means the password strength is strong
# Lets convert 0, 1, and 2 to weak, medium, and Strong.

In [15]:
# conversation

data['strength'] = data['strength'].map({0: 'Weak', 1: 'Medium', 2: 'Strong'})
data.sample(10)

Unnamed: 0,password,strength
111953,ferhat0993,Medium
240036,dpz0n2ie,Medium
529397,0147258369f,Medium
302203,bandit1963,Medium
6849,u6mOn4TI0OAeYPTx,Strong
367441,otoniel1994,Medium
164567,CHENLIN880909,Medium
430265,gitalani16,Medium
132718,keyla0609,Medium
370936,pompinti123,Medium


In [16]:
# Now lets Tokenize the passwords for our model to learn from all the combinations of digits, letters and symbols .before we train it.

def word(password):
    character = []
    for i in password:
        character.append(i)
    return character

In [17]:
# Lets change our frame to numpy array for quick computations

X = np.array(data['password'])
Y = np.array(data['strength'])

print('An array for X:', X)
print('An array for Y:', Y)

An array for X: ['kzde5577' 'kino3434' 'visi7k1yr' ... '184520socram' 'marken22a'
 'fxx4pw4g']
An array for Y: ['Medium' 'Medium' 'Medium' ... 'Medium' 'Medium' 'Medium']


In [18]:
tfid = TfidfVectorizer(tokenizer = word)

In [19]:
X = tfid.fit_transform(X)

In [20]:
# Lets train test split

Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size = 0.20, random_state = 40)

In [21]:
# Model Building

model = DecisionTreeClassifier()

In [22]:
model = DecisionTreeClassifier()
model.fit(Xtrain, Ytrain)

In [23]:
#Lets check the score on training data
model.score(Xtrain, Ytrain)

0.999740531741928

In [24]:
# Lets Test Our model

YPred = model.predict(Xtest)
YPred

array(['Medium', 'Medium', 'Weak', ..., 'Medium', 'Medium', 'Strong'],
      dtype=object)

In [25]:
# Performance Test Using Accuracy Score

accuracy_score(Ytest, YPred) #Compare with actual Y values in Ytest(hold-out) with predicted Y

0.925870617047966

In [26]:
# As it can be seen, comparing the Training score and Test score there is little difference hence there is no overfitting

In [27]:
# Performance test using Confusion Matrix

cm = confusion_matrix(Ytest, YPred) #Compare with actual Y values in Ytest(hold-out) with predicted Y
cm

array([[94685,  2064,  2521],
       [ 2191, 14215,   300],
       [ 2661,   191, 15100]], dtype=int64)

In [28]:
cohen_kappa_score(Ytest, YPred) # scores above 0.8 generally considered good agreement

0.821735910745716

In [29]:
# As seen above our model is performing greatly. let me print the general report

report = classification_report(Ytest, YPred)
print(report)

              precision    recall  f1-score   support

      Medium       0.95      0.95      0.95     99270
      Strong       0.86      0.85      0.86     16706
        Weak       0.84      0.84      0.84     17952

    accuracy                           0.93    133928
   macro avg       0.89      0.88      0.88    133928
weighted avg       0.93      0.93      0.93    133928



In [30]:
# Now lets check the strength of a Password

user = getpass.getpass('Enter Your Password: ')
data = tfid.transform([user]).toarray()
output = model.predict(data)
print(output)


['Strong']
