In [8]:
pip install langchain 

Collecting langchain
  Using cached langchain-0.3.15-py3-none-any.whl.metadata (7.1 kB)
Collecting PyYAML>=5.3 (from langchain)
  Downloading PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.1 kB)
Collecting SQLAlchemy<3,>=1.4 (from langchain)
  Downloading SQLAlchemy-2.0.37-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)
  Downloading aiohttp-3.11.11-cp311-cp311-macosx_11_0_arm64.whl.metadata (7.7 kB)
Collecting langchain-core<0.4.0,>=0.3.31 (from langchain)
  Using cached langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.3 (from langchain)
  Using cached langchain_text_splitters-0.3.5-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Using cached langsmith-0.3.1-py3-none-any.whl.metadata (14 kB)
Collecting numpy<2,>=1.22.4 (from langchain)
  Downloading numpy-1.26.4-cp311-cp311-macosx_11_0_arm64.whl.metadata (114 kB)
Collecting p

In [1]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_google_genai import GoogleGenerativeAI
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re


ModuleNotFoundError: Module langchain_community.document_loaders not found. Please install langchain-community to access this module. You can install it using `pip install -U langchain-community`

In [236]:
import os
from getpass import getpass

# SECURITY: an API key must never be stored in the notebook source. The key that
# was previously hard-coded in this cell has been exposed and should be revoked
# and replaced in the Google console.
# Read the key from the environment; if it is not set, prompt for it without
# echoing so it does not end up in the saved notebook or its outputs.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

# Kept as a module-level name in case other cells refer to `api_key`.
api_key = os.environ["GOOGLE_API_KEY"]


In [237]:
from langchain.schema import Document
from tqdm import tqdm

def load_password_vectorstore(file_path, embedding_model="models/embedding-001"):
    """Build a FAISS vector store from a newline-delimited password file.

    Args:
        file_path: Path to a UTF-8 text file containing one password per line.
        embedding_model: Name of the Google Generative AI embedding model to
            use. Defaults to the model this notebook originally hard-coded.

    Returns:
        A tuple ``(embeddings, vectorstore)``: the embedding client and the
        FAISS index built from one ``Document`` per password.

    Raises:
        ValueError: If the file contains no non-empty lines.
    """
    loader = TextLoader(file_path=file_path, encoding="utf-8")
    data = loader.load()

    # Split on "\n" and strip a trailing "\r" so CRLF files do not leave a
    # carriage return glued to every password. Exactly-empty lines (e.g. the one
    # produced by a trailing newline) are dropped: they are not passwords and
    # would otherwise be sent to the embedding backend as empty text.
    passwords = [
        line.rstrip("\r") for line in data[0].page_content.split("\n")
    ]
    passwords = [password for password in passwords if password]
    if not passwords:
        raise ValueError(f"No passwords found in {file_path!r}")

    embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model)

    documents = [
        Document(page_content=password)
        for password in tqdm(passwords, desc="Creating Documents", unit="password")
    ]

    vectorstore = FAISS.from_documents(documents, embeddings)

    return embeddings, vectorstore

In [238]:
def calculate_similarity(input_passwords, vectorstore, embeddings):
    """Score each input password against its nearest neighbours in the store.

    For every password, the 10 closest documents are retrieved from
    ``vectorstore``; each retrieved text is embedded again with
    ``embeddings.embed_query`` and compared to the password's own embedding
    using cosine similarity.

    Args:
        input_passwords: Iterable of password strings to score.
        vectorstore: Object exposing ``similarity_search_with_score(text, k=...)``.
        embeddings: Object exposing ``embed_query(text)``.

    Returns:
        A list of ``(password, max_similarity)`` tuples in input order, where
        ``max_similarity`` is the highest cosine similarity found, or ``0``
        when the store returned no neighbours.
    """

    def _best_match(candidate):
        # Embed the candidate first, then look up its neighbours.
        candidate_vector = embeddings.embed_query(candidate)
        neighbours = vectorstore.similarity_search_with_score(candidate, k=10)

        # The score returned by the store is ignored; similarity is recomputed
        # from freshly embedded neighbour texts.
        scores = [
            cosine_similarity(
                [candidate_vector], [embeddings.embed_query(hit.page_content)]
            )[0][0]
            for hit, _store_score in neighbours
        ]
        return np.max(scores) if scores else 0

    return [(candidate, _best_match(candidate)) for candidate in input_passwords]

In [265]:
import time

def evaluate_passwords(input_passwords, embeddings, vectorstore, conversation_chain, delay_seconds=10):
    """Ask the LLM chain whether each password is secure and print the verdict.

    For every password, the maximum cosine similarity to the stored passwords
    is computed with ``calculate_similarity`` and embedded, together with a
    list of good password practices, into a prompt that is sent through
    ``conversation_chain``.

    Args:
        input_passwords: Iterable of password strings to evaluate.
        embeddings: Embedding client passed through to ``calculate_similarity``.
        vectorstore: Vector store passed through to ``calculate_similarity``.
        conversation_chain: Chain object exposing ``run(query)``.
        delay_seconds: Pause before each chain call, in seconds. Defaults to
            10, the value that was previously hard-coded; pass ``0`` to
            disable the pause.

    Returns:
        A list of ``(password, llm_response)`` tuples in input order. The same
        information is also printed, as before.
    """
    good_practices_info = (
        "Good password practices include the following: "
        "1. Length: Passwords should be at least 12 characters long. "
        "2. Complexity: Passwords should include a mix of uppercase letters, lowercase letters, numbers, and special characters. "
        "3. Unpredictability: Avoid common words, phrases, or predictable patterns like '1234' or 'qwerty'. "
        "4. Uniqueness: Passwords should be unique for every account to prevent leaks. "
        "5. Avoid personal information: Avoid using easily guessable information, such as names or birthdays. "
        "6. Use of password managers: Consider using password managers for securely storing passwords."
    )

    results = []
    for password in input_passwords:

        # calculate_similarity returns [(password, max_similarity)] for the
        # single-element list passed here.
        max_cosine_similarity = calculate_similarity([password], vectorstore, embeddings)

        # NOTE(review): the prompt asks for "max 5 words" yet also lists several
        # items the explanation must include — consider relaxing the limit.
        query = (
            f"{good_practices_info}\n\n"
            f"Here's a password: '{password}'. "
            f"Based on best practices for creating secure passwords, and considering the cosine similarity "
            f"of '{max_cosine_similarity[0][1]}' with similar insecure passwords, is this password secure? "
            f"\n\nIf the cosine similarity is high and the password follows a strong pattern, it should be considered secure. "
            f"If both the cosine similarity is high and the password has a weak pattern, it should be considered not secure. "
            f"If there is no cosine similarity, but the password has a strong pattern, it should be considered secure. "
            f"\n\nIf the password is weak according to best practices, mention that first and explain why. "
            f"Please answer with 'secure' or 'not secure', followed by a brief explanation (max 5 words) that includes: "
            f"whether best practices were followed, which practice was not followed (if applicable) and what is weak about the pattern (if the pattern is weak), and whether cosine similarity was high."
        )

        # Pause between chain calls (presumably to stay under the API rate
        # limit); skipped when the caller passes a non-positive delay.
        if delay_seconds > 0:
            time.sleep(delay_seconds)

        llm_response = conversation_chain.run(query)

        print(f"Password: {password}")
        print(f"LLM Response: {llm_response}")

        results.append((password, llm_response))

    return results

In [240]:
# Build the embedding client and the FAISS index from the password list in
# "crackedpass.txt" (path is relative to the notebook's working directory).
embeddings, vectorstore = load_password_vectorstore("crackedpass.txt")

Creating Documents: 100%|██████████| 10000/10000 [00:00<00:00, 353070.75password/s]


In [264]:
# Chat memory stored under the "chat_history" key; it is cleared before each
# password in the evaluation loop further down this cell.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# Conversational retrieval chain: a Gemini LLM combined with the password
# vector store as retriever.
# NOTE(review): temperature=0.7 presumably lets verdicts vary between runs;
# a lower value may suit a secure / not-secure classification — confirm.
# NOTE(review): chain_type="refine" presumably issues one LLM call per retrieved
# document, which would add to API usage per password — confirm.
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=GoogleGenerativeAI(model="gemini-pro", temperature=0.7),
    retriever=vectorstore.as_retriever(),
    memory=memory,
    chain_type="refine"
)

# Hand-labelled evaluation set: the inline comment on each entry records the
# expected verdict ("good" = should be judged secure, "bad" = should not).
input_passwords = [
    "J7f!rS9z2@Gp#xMw",  # good
    "3z!T@7vQrP$w0#9g",  # good
    "V8d#9XhR!7PqLpX4",  # good
    "8b&Qz0rWp*Lm9ZpA",  # good
    "4tJ!pLsR5$WfZxD0",  # good
    "Rz2X9vY@oB7xJqH1",  # good
    "1Qz!W4tMvX$h7LmY",  # good
    "aaaaaa",             # bad (weak pattern)
    "123456",             # bad (common password)
    "password",           # bad (common word)
    "qwerty",             # bad (keyboard pattern)
    "letmein",            # bad (common phrase)
    "123abc",             # bad (simple combination)
    "welcome1",           # bad (common phrase + number)
    "iloveyou",           # bad (common phrase)
]


# Evaluate the passwords one at a time. The chat memory is cleared before each
# call, presumably so the previous password's exchange is not carried into the
# next prompt's chat history.
for item in input_passwords: 
    memory.clear()
    evaluate_passwords([item], embeddings, vectorstore, conversation_chain)


Password: J7f!rS9z2@Gp#xMw
LLM Response: Not secure. Best practices were not followed. The pattern is weak and cosine similarity is high.
Password: 3z!T@7vQrP$w0#9g
LLM Response: Secure, Password follows best practices, no weak pattern, cosine similarity is irrelevant.
Password: V8d#9XhR!7PqLpX4
LLM Response: Secure. Best practices were followed, pattern is strong, cosine similarity is high.
Password: 8b&Qz0rWp*Lm9ZpA
LLM Response: Secure. Followed best practices, cosine similarity not provided.
Password: 4tJ!pLsR5$WfZxD0
LLM Response: Secure. Follows best practices, complex pattern, high cosine similarity.
Password: Rz2X9vY@oB7xJqH1
LLM Response: Secure - Best practices were followed; no weak patterns; high cosine similarity.
Password: 1Qz!W4tMvX$h7LmY
LLM Response: Not secure. Best practices were not followed. The pattern is weak (sequential numbers and letters). Cosine similarity was high.
Password: aaaaaa
LLM Response: Not secure; best practices not followed; pattern is weak; cosin