<a href="https://colab.research.google.com/github/AhWhale/CapstonePasswordEvaluator/blob/main/CapstonePasswordEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Path of the CSV file holding the passwords to analyse; it is read below
# with pd.read_csv as a single, header-less 'password' column.
file_name = 'selected_passwords.csv'


# Characters that count as "special" when scoring a password.
_SPECIAL_CHARACTERS = frozenset('!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/~`')

# Every run of three consecutive ascending digits or lowercase letters
# ("012" ... "789", "abc" ... "xyz").
_SEQUENTIAL_RUNS = frozenset(
    alphabet[start:start + 3]
    for alphabet in ('0123456789', 'abcdefghijklmnopqrstuvwxyz')
    for start in range(len(alphabet) - 2)
)


def _has_sequential_run(password):
    """Return True if *password* contains a 3-character ascending run (case-insensitive)."""
    lowered = password.lower()
    return any(
        lowered[start:start + 3] in _SEQUENTIAL_RUNS
        for start in range(len(lowered) - 2)
    )


def calculate_strength(password):
    """Score a password heuristically and return its strength class.

    The score is a weighted mix of length, digit/upper/lower/special
    character counts and character-type diversity, plus a bonus when no
    character repeats and a penalty when the password contains an ascending
    run such as "123" or "abc".

    Args:
        password: The password to score, as a string.

    Returns:
        An int from 0 to 4: 0 = Very Weak, 1 = Weak, 2 = Medium,
        3 = Strong, 4 = Very Strong. Passwords shorter than 6 characters
        are always 0.
    """
    if len(password) < 6:
        return 0  # Automatically "Very Weak" for short passwords

    special_count = sum(c in _SPECIAL_CHARACTERS for c in password)

    length_score = min((len(password) / 8) ** 2.5, 1)  # Saturates at 8 characters
    digit_score = min(sum(c.isdigit() for c in password) / 2, 1)
    upper_score = min(sum(c.isupper() for c in password) / 2, 1)
    lower_score = min(sum(c.islower() for c in password) / 2, 1)
    special_score = min(special_count * 2, 1)  # A single special character maxes this out

    # Special characters are detected by set membership rather than a regex
    # character class: inside [...] the sequence "+-=" is a range that also
    # covers the digits, and an unescaped "]" ends the class early.
    char_types = sum([
        bool(re.search(r'[a-z]', password)),
        bool(re.search(r'[A-Z]', password)),
        bool(re.search(r'[0-9]', password)),
        special_count > 0,
    ])
    diversity_score = char_types / 4

    no_repeat_bonus = 0.2 if len(password) == len(set(password)) else 0

    # Only genuine ascending runs are penalised; a catch-all pattern here
    # would penalise every password.
    sequential_penalty = 0.2 if _has_sequential_run(password) else 0

    total_score = (
        length_score * 4 +
        digit_score +
        upper_score +
        lower_score +
        special_score * 3 +
        diversity_score
    ) / 10

    # Apply bonuses/penalties
    total_score += no_repeat_bonus - sequential_penalty

    # Thresholds for classification
    if total_score < 0.3:
        return 0  # Very Weak
    elif total_score < 0.5:
        return 1  # Weak
    elif total_score < 0.7:
        return 2  # Medium
    elif total_score < 0.9:
        return 3  # Strong
    else:
        return 4  # Very Strong

def extract_features(passwords):
    """Build the numeric feature matrix for a collection of passwords.

    Each row describes one password; the columns are, in order: length,
    digit count, uppercase count, lowercase count, special-character count,
    and the ratio of distinct characters to length (0 for an empty string).

    Args:
        passwords: A re-iterable collection of password strings.

    Returns:
        A NumPy array with one row per password and six columns.
    """
    specials = '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/~`'

    def tally(predicate):
        # One count per password of the characters satisfying `predicate`.
        return [sum(predicate(ch) for ch in pw) for pw in passwords]

    lengths = [len(pw) for pw in passwords]
    digit_counts = tally(str.isdigit)
    upper_counts = tally(str.isupper)
    lower_counts = tally(str.islower)
    special_counts = tally(lambda ch: ch in specials)
    distinct_ratios = [len(set(pw)) / len(pw) if pw else 0 for pw in passwords]

    columns = [
        lengths,
        digit_counts,
        upper_counts,
        lower_counts,
        special_counts,
        distinct_ratios,
    ]
    # Columns are assembled first, then transposed so each row is a password.
    return np.array(columns).T

# Read and process the file
print("Processing passwords...")
# dtype=str keeps all-digit passwords exactly as written (no numeric
# coercion), and keep_default_na=False stops pandas from turning passwords
# such as "NA", "null" or "None" into NaN (which astype(str) would then
# render as the literal string "nan").
df = pd.read_csv(
    file_name,
    header=None,
    names=['password'],
    nrows=1000000,  # Cap the amount of data read from the file
    dtype=str,
    keep_default_na=False,
)
df['password'] = df['password'].astype(str)

# X: one feature row per password; y: heuristic strength class (0-4) used as
# the training label.
X = extract_features(df['password'])
y = df['password'].apply(calculate_strength).values

print("Finished processing all passwords.")

# Split data into training (80%) and test (20%) sets; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Wrap the full feature matrix in a DataFrame with named columns for easier
# visualization.
X_df = pd.DataFrame(X, columns=['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special', 'char_diversity'])

# Sample a smaller subset (at most 10,000 rows) for visualization
sample_size = min(10000, len(X_df))
sample_df = X_df.sample(n=sample_size, random_state=42)
# X_df was built from a NumPy array, so its index labels are row positions
# and can be used to pick the matching labels out of y.
sample_y = y[sample_df.index]  # Align sample_y with sample_df
# Mean of every feature within each strength class present in the sample.
avg_features = sample_df.groupby(sample_y).mean()
# Display names for the strength classes 0-4, in class order.
strength_labels = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']

# 1. Password Strength Distribution
plt.figure(figsize=(8, 8))
# Label each wedge with the class it actually represents: if a class is
# missing from the sample, a fixed five-label list would no longer line up
# with the counts (and plt.pie would reject the length mismatch).
present_classes, strength_counts = np.unique(sample_y, return_counts=True)
plt.pie(strength_counts, labels=[strength_labels[c] for c in present_classes], autopct='%1.1f%%', startangle=90)
plt.title('Password Strength Distribution')
plt.axis('equal')  # Equal aspect ratio ensures that pie chart is circular.
plt.savefig('password_strength_distribution_pie.png', dpi=100)
plt.close()

# 2. Box Plots for Feature Analysis
plt.figure(figsize=(12, 8))
# Pin the category order to classes 0-4 so the five tick labels below always
# sit under the right box, even when a class has no samples.
sns.boxplot(x=sample_y, y=sample_df['length'], order=list(range(5)))
plt.title('Password Length by Strength Category')
plt.xlabel('Strength Category')
plt.ylabel('Password Length')
plt.xticks(range(5), strength_labels)
plt.savefig('length_by_strength_boxplot.png', dpi=100)
plt.close()

# 3. Strength Category Comparison Bar Chart
# Reindex to all five classes so bar positions match the tick labels; a class
# absent from the sample shows up as an empty slot instead of shifting labels.
avg_features.reindex(range(5)).plot(kind='bar', figsize=(12, 6))
plt.title('Average Feature Values by Password Strength Category')
plt.xlabel('Strength Category')
plt.ylabel('Average Feature Value')
plt.xticks(range(5), strength_labels, rotation=45)
plt.tight_layout()
plt.savefig('average_features_by_strength.png', dpi=100)
plt.close()

# Display names for the strength classes 0-4, in class order.
strength_labels = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
strength_classes = list(range(len(strength_labels)))

# Initialize the Random Forest classifier with updated parameters to prevent overfitting
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,  # Limiting tree depth
    min_samples_split=10,  # Minimum samples to split a node
    min_samples_leaf=5,  # Minimum samples per leaf
    max_features='sqrt',  # Limit features considered for each split
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    oob_score=True  # Out-of-Bag score as a built-in validation mechanism
)

# Perform cross-validation on the training set
print("Performing cross-validation...")
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", np.mean(cv_scores))

# Train the classifier on the training data
print("Training the Random Forest classifier...")
rf_classifier.fit(X_train, y_train)
# oob_score=True is requested above, so report the estimate it produces.
print("Out-of-bag score:", rf_classifier.oob_score_)

# Evaluate on the test set
y_pred = rf_classifier.predict(X_test)
print("\nClassification Report on Test Set:")
# Passing labels explicitly keeps the five target names valid even when a
# class happens to be absent from the test split or the predictions.
print(classification_report(y_test, y_pred, labels=strength_classes, target_names=strength_labels))

print("\nConfusion Matrix on Test Set:")
# Fixed labels give a 5x5 matrix regardless of which classes occur.
print(confusion_matrix(y_test, y_pred, labels=strength_classes))
importances = rf_classifier.feature_importances_
# Same order as the columns produced by extract_features.
feature_names = ['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special', 'char_diversity']
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.title('Feature Importances in Random Forest Classifier')
plt.xlabel('Importance')
plt.savefig('feature_importances.png', dpi=100)
plt.close()

def evaluate_password_strength(password):
    """Classify a single password with the trained random forest.

    Args:
        password: The password string to evaluate.

    Returns:
        The predicted strength category as a display name, one of
        'Very Weak', 'Weak', 'Medium', 'Strong' or 'Very Strong'.
    """
    labels = ('Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong')
    # extract_features works on collections, so wrap the password and take
    # the single resulting row.
    feature_row = extract_features([password])[0]
    predicted = rf_classifier.predict([feature_row])
    return labels[predicted[0]]

# 4. Confusion Matrix
# Display names for the strength classes 0-4, in class order.
strength_labels = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
plt.figure(figsize=(10, 7))
# Fix the label set so the matrix is always 5x5 and lines up with the five
# tick labels, even if a class is absent from the test split/predictions.
sns.heatmap(confusion_matrix(y_test, y_pred, labels=list(range(5))), annot=True, fmt='d',
            cmap='Blues', xticklabels=strength_labels,
            yticklabels=strength_labels)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png', dpi=100)
plt.close()

# 5. Feature Correlation Matrix
plt.figure(figsize=(10, 8))
# Pairwise correlation between the feature columns of the visualization sample.
correlation_matrix = sample_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Feature Correlation Matrix')
plt.savefig('feature_correlation_matrix.png', dpi=100)
plt.close()

# Apply KMeans clustering to the full feature matrix X (features are used as
# extracted, with no scaling step).
# NOTE(review): n_clusters=5 presumably mirrors the five strength categories - confirm.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

# Reduce dimensions for visualization: project the six features onto the
# first two principal components.
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Plot clusters: one point per password in PCA space, coloured by its KMeans
# cluster id.
plt.figure(figsize=(10, 8))
plt.scatter(X_reduced[:, 0], X_reduced[:, 1], c=clusters, cmap='viridis', alpha=0.5)
plt.title('Password Clusters Based on Features')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.savefig('password_clusters.png', dpi=100)
plt.close()
if __name__ == "__main__":
    # Interactive loop: classify passwords typed by the user until they quit.
    print("Visualizations have been saved as PNG files in the current directory.")
    while True:
        try:
            password = input("Enter a password to evaluate (or 'q' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed or the user interrupted the prompt: end the
            # session cleanly instead of crashing with a traceback.
            print()
            break
        if password.lower() == 'q':
            break
        strength = evaluate_password_strength(password)
        print(f"Password strength: {strength}")



Processing passwords...
Finished processing all passwords.
Performing cross-validation...
Cross-validation scores: [0.99938125 0.9994375  0.99948125 0.99949375 0.999625  ]
Average cross-validation score: 0.9994837495312471
Training the Random Forest classifier...

Classification Report on Test Set:
              precision    recall  f1-score   support

   Very Weak       1.00      1.00      1.00     32236
        Weak       1.00      1.00      1.00    127996
      Medium       1.00      1.00      1.00     30040
      Strong       1.00      1.00      1.00      8296
 Very Strong       1.00      1.00      1.00      1432

    accuracy                           1.00    200000
   macro avg       1.00      1.00      1.00    200000
weighted avg       1.00      1.00      1.00    200000


Confusion Matrix on Test Set:
[[ 32236      0      0      0      0]
 [    25 127893     78      0      0]
 [     0      0  30031      9      0]
 [     0      0      2   8289      5]
 [     0      0      0      

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# NOTE(review): if the password CSV is meant to be read from Drive, this cell
# has to run before the analysis cell - confirm the intended cell order.
from google.colab import drive
drive.mount('/content/drive')