<a href="https://colab.research.google.com/github/AhWhale/CapstonePasswordEvaluator/blob/main/CapstonePasswordEval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import matplotlib
matplotlib.use('Agg')
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Raw CSV of passwords hosted in the project's GitHub repository
url = "https://raw.githubusercontent.com/AhWhale/CapstonePasswordEvaluator/refs/heads/main/selected_passwords.csv"

# Precompiled regex patterns for performance. Each pattern that matches the
# lower-cased password adds one penalty step inside calculate_strength.
sequential_patterns = [
    re.compile(r'(012|123|234|345|456|567|678|789)'),  # Sequential digits
    # Every run of three consecutive letters from "abc" to "xyz"
    # (the "tuv" run was previously missing from this alternation).
    re.compile(r'(abc|bcd|cde|def|efg|fgh|ghi|hij|ijk|jkl|klm|lmn|mno|nop|opq|pqr|qrs|rst|stu|tuv|uvw|vwx|wxy|xyz)'),  # Sequential letters
    re.compile(r'(qwerty|asdfgh|zxcvbn)'),  # Keyboard patterns
    re.compile(r'([0-9])\1{2,}'),  # Three or more repeating digits
    re.compile(r'([a-zA-Z])\1{2,}')  # Three or more repeating letters
]

# Set of common weak passwords; calculate_strength compares the lower-cased
# password against it and rates any match as Very Weak.
common_passwords = {'123456', 'password', 'qwerty', 'abc123', 'password1'}

def calculate_strength(password):
    """Rate a password heuristically and return its strength class.

    The score combines length, per-character-class counts and class
    diversity, adds a bonus when no character repeats, and subtracts
    penalties for pattern matches, triple repeats and lengths below 8.

    Args:
        password: The password string to rate.

    Returns:
        int: 0 (Very Weak), 1 (Weak), 2 (Fair), 3 (Strong) or 4 (Very Strong).
    """
    size = len(password)

    # Anything shorter than 6 characters is Very Weak outright.
    if size < 6:
        return 0

    lowered = password.lower()

    # Known common passwords are Very Weak regardless of their composition.
    if lowered in common_passwords:
        return 0

    special_chars = '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/~`'

    # Tally each character class in a single pass over the password.
    digits = uppers = lowers = specials = 0
    for ch in password:
        if ch.isdigit():
            digits += 1
        if ch.isupper():
            uppers += 1
        if ch.islower():
            lowers += 1
        if ch in special_chars:
            specials += 1

    # Each component saturates at 1: length at 8 characters, classes at 2 hits.
    length_score = min((size / 8) ** 2.5, 1)
    digit_score = min(digits / 2, 1)
    upper_score = min(uppers / 2, 1)
    lower_score = min(lowers / 2, 1)
    special_score = min(specials / 2, 1)

    # Diversity is the squared fraction of the four character classes present.
    kinds = sum((digits > 0, uppers > 0, lowers > 0, specials > 0))
    fraction = kinds / 4
    diversity_score = fraction * fraction

    # Weighted base score: length x4, special characters x2, diversity x2.
    base = (
        length_score * 4 +
        digit_score +
        upper_score +
        lower_score +
        special_score * 2 +
        diversity_score * 2
    ) / 10

    # One 0.25 step for every precompiled pattern found in the lower-cased text.
    pattern_penalty = sum(0.25 for pat in sequential_patterns if pat.search(lowered))

    # Any character repeated three or more times in a row.
    if re.search(r'(.)\1{2,}', password):
        repeat_penalty = 0.25
    else:
        repeat_penalty = 0

    # Extra penalty for 6-7 character passwords.
    short_penalty = 0.15 if size < 8 else 0

    # Bonus when every character in the password is distinct.
    unique_bonus = 0.2 if len(set(password)) == size else 0

    total = base + (unique_bonus - (pattern_penalty + repeat_penalty + short_penalty))

    # Clamp to the [0, 1] range.
    if total > 1:
        total = 1
    if total < 0:
        total = 0

    if total < 0.3:
        return 0  # Very Weak
    if total < 0.4 and size < 8 and kinds < 2:
        return 0  # Very Weak: short, single-class and only marginally above 0.3

    # Upper bounds for Weak (1), Fair (2) and Strong (3); the rest is Very Strong.
    for label, upper_bound in enumerate((0.5, 0.7, 0.85), start=1):
        if total < upper_bound:
            return label
    return 4


def extract_features(passwords):
    """Build the numeric feature matrix for a collection of passwords.

    Args:
        passwords: Iterable of password strings (list, pandas Series,
            generator, ...). It is materialised once up front, so one-shot
            iterators are handled correctly even though the features are
            computed column by column.

    Returns:
        numpy.ndarray of shape (n_passwords, 6). Columns, in order: length
        (capped at 50), digit count, uppercase count, lowercase count,
        special-character count, and the ratio of distinct characters to
        length (0 for an empty password).
    """
    # The columns below each need a full pass over the data; a generator
    # would be exhausted after the first column without this copy.
    passwords = list(passwords)
    special_chars = '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/~`'

    return np.array([
        [min(len(p), 50) for p in passwords],
        [sum(c.isdigit() for c in p) for p in passwords],
        [sum(c.isupper() for c in p) for p in passwords],
        [sum(c.islower() for c in p) for p in passwords],
        [sum(c in special_chars for c in p) for p in passwords],
        [len(set(p)) / len(p) if p else 0 for p in passwords]
    ]).T

# Read and process the file from URL
print("Processing passwords...")
# dtype=str and keep_default_na=False keep every entry as literal text.
# Without them pandas would turn entries such as "null", "NA" or "nan" into
# NaN and parse numeric-looking passwords as numbers (e.g. "0123" -> 123).
# NOTE(review): a password containing a comma would still be split into
# several fields -- confirm the CSV really holds one value per line.
df = pd.read_csv(url, header=None, names=['password'], nrows=1000000,
                 dtype=str, keep_default_na=False)
df['password'] = df['password'].astype(str)

# Feature matrix (one row per password) and heuristic strength labels (0-4)
X = extract_features(df['password'])
y = df['password'].apply(calculate_strength).values

print("Finished processing all passwords.")

# Split data into training and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a DataFrame for easier visualization
X_df = pd.DataFrame(X, columns=['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special', 'char_diversity'])

# Sample a smaller subset for visualization
sample_size = min(10000, len(X_df))
sample_df = X_df.sample(n=sample_size, random_state=42)
# X_df has the default 0..n-1 index, so its labels double as positions into y
sample_y = y[sample_df.index]
avg_features = sample_df.groupby(sample_y).mean()

# Display names indexed by the class id produced by calculate_strength (0-4)
strength_names = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']

# 1. Password Strength Distribution
plt.figure(figsize=(14, 14))
# Label only the classes that actually occur in the sample; a fixed list of
# five labels would not match the counts if a class were absent.
present_classes, strength_counts = np.unique(sample_y, return_counts=True)
plt.pie(strength_counts,
        labels=[strength_names[c] for c in present_classes],
        autopct='%1.1f%%',
        startangle=90)
plt.title('Password Strength Distribution', pad=20)
plt.axis('equal')
plt.savefig('password_strength_distribution_pie.png', dpi=100)
plt.close()

# 3. Strength Category Comparison Bar Chart
avg_features.plot(kind='bar', figsize=(12, 6))
plt.title('Average Feature Values by Password Strength Category')
plt.xlabel('Strength Category')
plt.ylabel('Average Feature Value')
# Bars sit at positions 0..k-1 in index order, so name them from the index
plt.xticks(range(len(avg_features)),
           [strength_names[c] for c in avg_features.index],
           rotation=45)
plt.tight_layout()
plt.savefig('average_features_by_strength.png', dpi=100)
plt.close()

# Initialize Random Forest classifier
# (oob_score=True makes the fitted model expose its out-of-bag estimate
# as rf_classifier.oob_score_.)
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    max_features='sqrt',
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
    oob_score=True
)

# Perform cross-validation on the training set
print("Performing cross-validation...")
cv_scores = cross_val_score(rf_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Average cross-validation score:", np.mean(cv_scores))

# Train the classifier on the training data
print("Training the Random Forest classifier...")
rf_classifier.fit(X_train, y_train)

# Display names indexed by class id (0-4) and the matching id list.
# Passing the ids explicitly keeps the report and matrix at five classes
# even if one never appears in the test split; without it the five
# target_names would not match and classification_report would raise.
strength_names = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
class_ids = list(range(len(strength_names)))

# Evaluate on the test set
y_pred = rf_classifier.predict(X_test)
print("\nClassification Report on Test Set:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=strength_names))

print("\nConfusion Matrix on Test Set:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))

# Feature Importances
importances = rf_classifier.feature_importances_
# Same order as the columns produced by extract_features
feature_names = ['length', 'num_digits', 'num_uppercase', 'num_lowercase', 'num_special', 'char_diversity']
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances)
plt.title('Feature Importances in Random Forest Classifier')
plt.xlabel('Importance')
plt.savefig('feature_importances.png', dpi=100)
plt.close()

# Classify a single password with the fitted Random Forest
def evaluate_password_strength(password):
    """Return the strength label that rf_classifier predicts for ``password``.

    Args:
        password: The password string to classify.

    Returns:
        str: One of 'Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong'.
    """
    labels = ('Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong')
    # extract_features works on collections, so wrap the password and take row 0
    feature_row = extract_features([password])[0]
    predicted = rf_classifier.predict([feature_row])
    return labels[predicted[0]]

# 4. Confusion Matrix
# Display names indexed by class id (0-4). Passing the ids explicitly keeps
# the matrix 5x5 so it always lines up with the five tick labels, even when
# a class is missing from the test split.
strength_names = ['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong']
class_ids = list(range(len(strength_names)))
plt.figure(figsize=(10, 7))
sns.heatmap(confusion_matrix(y_test, y_pred, labels=class_ids), annot=True, fmt='d',
            cmap='Blues', xticklabels=strength_names,
            yticklabels=strength_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png', dpi=100)
plt.close()

# 5. Feature Correlation Matrix (computed on the visualization sample)
plt.figure(figsize=(12, 10))
correlation_matrix = sample_df.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', square=True)
plt.title('Feature Correlation Matrix')
plt.xticks(rotation=45)  # Rotate labels 45 degrees
plt.yticks(rotation=0)
plt.savefig('feature_correlation_matrix.png', dpi=100)
plt.close()

# Apply KMeans clustering
# NOTE(review): `clusters` and `X_reduced` below are not used by any code
# visible in this file -- confirm whether a later cell still needs them.
kmeans = KMeans(n_clusters=5, random_state=42)
clusters = kmeans.fit_predict(X)

# Reduce dimensions for visualization
pca = PCA(n_components=2)
X_reduced = pca.fit_transform(X)

# Calculate the average feature values per strength category
strength_avg_features = pd.DataFrame(X, columns=feature_names).groupby(y).mean()

# Plot the average features by strength category.
# DataFrame.plot opens its own figure when no axes are supplied, so no
# separate plt.figure() call is made here: it would leave an extra empty
# figure that the single plt.close() below never closes.
strength_avg_features.plot(kind='line', marker='o', figsize=(12, 6))
plt.title('Average Features by Password Strength')
plt.xlabel('Strength Category')
plt.ylabel('Average Feature Value')
plt.xticks(ticks=range(5), labels=['Very Weak', 'Weak', 'Medium', 'Strong', 'Very Strong'])
plt.legend(title='Features', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.savefig('features_by_strength.png', dpi=100)
plt.close()

# Calculate total character types for the entire DataFrame: for each password,
# how many of the four classes (lower, upper, digit, special) it contains.
# any() stops at the first matching character instead of counting them all.
total_character_types = (
    df['password'].apply(lambda p: any(c.islower() for c in p)).astype(int) +
    df['password'].apply(lambda p: any(c.isupper() for c in p)).astype(int) +
    df['password'].apply(lambda p: any(c.isdigit() for c in p)).astype(int) +
    df['password'].apply(lambda p: any(c in '!@#$%^&*()_+-={}[]|\\:;"\'<>,.?/~`' for c in p)).astype(int)
)

# Final output message, followed by an interactive evaluation loop
if __name__ == "__main__":
    print("Visualizations have been saved as PNG files in the current directory.")
    while True:
        try:
            password = input("Enter a password to evaluate (or 'q' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # stdin was closed (e.g. piped input ran out) or the user
            # interrupted the prompt: leave the loop instead of crashing.
            print()
            break
        # A lone 'q' (any case) ends the session
        if password.lower() == 'q':
            break
        strength = evaluate_password_strength(password)
        print(f"Password strength: {strength}")
