In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns   
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier


### Formulate research Questions
**1.** Load the dataset and check the dataset structure

In [None]:
# Load the raw password dataset from the notebook's working directory.
data = pd.read_csv('dataset.csv')

# Print the (rows, columns) tuple before previewing the data.
print(data.shape)

# Last expression of the cell: rich display of the first rows.
data.head()

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

# Count missing values per column; shown as the cell's output.
data.isnull().sum()

In [None]:
# Count how many passwords fall into each strength class.
value_counts = data['strength'].value_counts()

plt.bar(value_counts.index, value_counts.values)
# Strength is a discrete label, so place one tick per class rather than letting
# the numeric axis choose its own (possibly fractional) tick positions.
plt.xticks(value_counts.index)
plt.xlabel('Strength')
plt.ylabel('Count')
plt.title('Number of passwords per strength level')
plt.show()

We can see that the data structure is very simple: the dataset has only two columns (`password` and `strength`), and there are no null values.

In this dataset, the highest password strength rating is 2 and the lowest is 0. Outliers can therefore be detected by checking whether any row has a 'strength' value greater than 2 or less than 0.

In [None]:
# True when at least one strength label lies outside the 0-2 scale.
outliers = bool(((data['strength'] < 0) | (data['strength'] > 2)).any())
print(outliers)

Since there are no nulls or outliers in the dataset, the data cleaning process can be skipped.

In this worksheet, the research objective is to determine the strength of a password. To do that, we need to find the features of a password that can affect its strength. In this step, these features are obtained by splitting the information in the password column of the dataset.

### Analyse password features



In [None]:
def digits_number(password: str):
    """Count the characters of ``password`` for which ``str.isdigit`` is true."""
    return len([ch for ch in password if ch.isdigit()])

def lower_letters(password: str):
    """Count the characters of ``password`` for which ``str.islower`` is true."""
    return sum(1 for ch in password if ch.islower())

def upper_letters(password: str):
    """Count the characters of ``password`` for which ``str.isupper`` is true."""
    return sum(map(str.isupper, password))

def special_chars(password: str):
    """Count the characters of ``password`` that are not alphanumeric.

    Anything for which ``str.isalnum`` is false is counted, so whitespace
    counts as a special character as well as punctuation.
    """
    alnum_count = sum(1 for ch in password if ch.isalnum())
    return len(password) - alnum_count

def password_length(password: str) -> int:
    """Return the number of characters in ``password``."""
    return len(password)

# Build one numeric column per extractor, named after the extractor function.
# Each helper is mapped over the passwords explicitly: Series.agg only produced
# per-password values by falling back to element-wise application, which newer
# pandas releases deprecate in favour of passing the whole Series to the
# callable (where e.g. password_length would return the row count instead).
data_features = pd.DataFrame({
    extractor.__name__: data['password'].map(extractor)
    for extractor in (password_length, digits_number, lower_letters, upper_letters, special_chars)
})
# Carry the label over under the name the modelling cells use.
data_features['level'] = data['strength']

print(data_features.shape)
data_features.head()

In [None]:
# Check dtypes and non-null counts of the engineered feature table.
data_features.info()

In [76]:
#data_features.to_csv('features.csv', index=False)

In [90]:
# Model inputs: every engineered column except the label.
features = data_features.drop('level', axis=1)
# Prediction target: the 'level' column (the strength label copied from the raw data).
target = data_features['level']

def score_model(features, target, model, scoring='f1_macro'):
    """Return the mean 5-fold cross-validation score of ``model``.

    Parameters
    ----------
    features : feature table passed to ``cross_val_score`` as ``X``.
    target : labels passed to ``cross_val_score`` as ``y``.
    model : unfitted estimator to evaluate.
    scoring : scorer name forwarded to ``cross_val_score``; macro-averaged F1
        by default.

    Returns
    -------
    The arithmetic mean of the per-fold scores.
    """
    fold_scores = cross_val_score(
        estimator=model,
        X=features,
        y=target,
        scoring=scoring,
        cv=5,
    )
    return fold_scores.mean()

In [100]:
def check_confusion_matrix(features, target, model):
    """Fit ``model`` on a hold-out split and plot its confusion matrix.

    The data is split 60/40 into train and test parts with a fixed
    ``random_state=0``. ``model`` is fitted in place on the training part, and
    the confusion matrix of its test-set predictions is drawn as an annotated
    heatmap titled with the model's class name. Nothing is returned; the
    figure is shown via ``plt.show()``.
    """
    split = train_test_split(features, target, test_size=0.4, random_state=0)
    train_X, test_X, train_y, test_y = split

    model.fit(train_X, train_y)
    predictions = model.predict(test_X)
    matrix = confusion_matrix(test_y, predictions)

    model_name = type(model).__name__
    ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    ax.set(
        title=f'Confusion matrix for {model_name}',
        xlabel='Predicted label',
        ylabel='True label',
    )
    plt.show()

In [None]:
# Cross-validated score of a baseline logistic regression.
score = score_model(features, target, LogisticRegression(max_iter=500))
print(f'Logistic Regression score: {score}')

# Refit on the full data so the learned coefficients can be inspected.
logreg_model = LogisticRegression(max_iter=500)
logreg_model.fit(features, target)

# For a multiclass target, coef_ holds one row of coefficients per class, so
# coef_[0] alone would describe only the first class. Average the magnitudes
# over all rows to get a single importance value per feature.
# NOTE(review): the features are not standardised, so magnitudes are only
# roughly comparable across features.
coefficients = np.abs(logreg_model.coef_).mean(axis=0)

feature_importance = zip(features.columns, coefficients)
feature_importance = sorted(feature_importance, key=lambda x: x[1], reverse=True)

for feature, coef in feature_importance:
    print(f'Feature: {feature} - Mean |coef|: {coef}')


In [None]:
# For a multiclass target, coef_ holds one row of coefficients per class;
# average their magnitudes so the chart reflects every class, not just the
# first row.
mean_abs_coef = np.abs(logreg_model.coef_).mean(axis=0)

# Label the bars with the feature names instead of bare positions.
plt.bar(list(features.columns), mean_abs_coef)
plt.xticks(rotation=45, ha='right')
plt.xlabel('Feature')
plt.ylabel('Mean |coef| across classes')
plt.title('Feature importance')
plt.show()

In [None]:
# Hold-out confusion matrix for a fresh logistic regression with the same settings as the scored one.
check_confusion_matrix(features, target, LogisticRegression(max_iter=500))

In [None]:
# Seed the network so the cross-validated score is the same on every run.
score = score_model(features, target, MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=0))
print(f'MLP Classifier score: {score}')

In [None]:
# Seed the network so the confusion matrix is the same on every run.
check_confusion_matrix(features, target, MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=0))

In [None]:
# Seed the classifier so the cross-validated score is the same on every run.
score = score_model(features, target, SGDClassifier(random_state=0))
print(f'SGDClassifier score: {score}')

In [None]:
# Seed the classifier so the confusion matrix is the same on every run.
check_confusion_matrix(features, target, SGDClassifier(random_state=0))

In [None]:
# Seed the forest so the cross-validated score is the same on every run.
score = score_model(features, target, RandomForestClassifier(random_state=0))
# Same label format as the other model score lines (no stray '=').
print(f'RandomForestClassifier score: {score}')

In [None]:
# Seed the forest so the confusion matrix is the same on every run.
check_confusion_matrix(features, target, RandomForestClassifier(random_state=0))

In [None]:
# Pairwise scatter/distribution plots of the engineered features, coloured by strength level.
sns.pairplot(data_features, hue='level', palette='bright')
plt.show()

In [None]:
def find_duplicate_strings(password: str):
    """Return how often the most frequent character occurs in ``password``.

    Parameters
    ----------
    password : the password to inspect.

    Returns
    -------
    The highest occurrence count of any single character, or 0 when
    ``password`` is empty.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # does not bring in collections.
    from collections import Counter

    # max() of an empty sequence raises ValueError, so handle "" explicitly.
    if not password:
        return 0
    # One pass over the characters instead of a str.count call per character.
    return max(Counter(password).values())

# Append each password's highest single-character repeat count to the
# engineered features as a trailing 'duplicate' column.
data_after = data_features.assign(duplicate=data['password'].apply(find_duplicate_strings))

data_after.head()