In [None]:
import pandas as pd
import numpy as np
import sqlite3

In [None]:
# Load the whole `users` table into a DataFrame. The connection is only needed
# while the query runs, so it is closed as soon as the rows are in memory
# (also when the query raises) instead of staying open for the kernel's lifetime.
conn = sqlite3.connect('data/passwords_large.db')
try:
    df = pd.read_sql_query('SELECT * from users', conn)
finally:
    conn.close()
df.head()

In [None]:
# Build the fine-tuning `prompt` and `completion` columns.
# Every prompt starts with the username and email (used unconditionally here);
# each other attribute adds one line only for the rows where it has a value.
df['prompt'] = 'Username is ' + df['name'] + '\nEmail is ' + df['email'] + '\n'

# (column, label) pairs, appended in this order when the column is not null.
# Each column is listed exactly once so its line appears a single time in the prompt.
# NOTE(review): prompts generated by a version that repeated the Tumblr password
# line differ from these - confirm any fine-tuned model is evaluated against
# prompts in the format it was trained on.
fields_before_gender = [
    ('realname', 'Real name is '),
    ('dob', 'Date of Birth is '),
]
fields_after_gender = [
    ('country', 'Country is '),
    ('twitterid', 'Twitter ID is '),
    ('tumblrpassword', 'Tumblr Password is '),
    ('about', 'User information: '),
    ('status', 'User status: '),
]
# Gender is compared against the codes 'F' / 'M'; any other value adds no line.
gender_lines = {'F': 'Gender is Female\n', 'M': 'Gender is Male\n'}

for column, label in fields_before_gender:
    df.loc[df[column].notna(), 'prompt'] = df['prompt'] + label + df[column] + '\n'
for code, line in gender_lines.items():
    df.loc[df['gender'] == code, 'prompt'] = df['prompt'] + line
for column, label in fields_after_gender:
    df.loc[df[column].notna(), 'prompt'] = df['prompt'] + label + df[column] + '\n'

# Every prompt ends with the same separator that marks where the completion starts.
df['prompt'] += 'Password: \n###\n'

# The completion is the password with a leading space and a trailing newline.
df['completion'] = ' ' + df['password'] + '\n'

In [None]:
# Keep only the two columns used for fine-tuning and draw 1000 random rows from them.
finetune_input = df.loc[:, ['prompt', 'completion']].sample(n=1000)
finetune_input

In [None]:
# Write the sampled prompt/completion pairs as JSON Lines (one record per line)
finetune_input.to_json(
    'data/finetune_input_1000.jsonl',
    orient='records',
    lines=True,
)

### Testing & Analysis

In [None]:
import os
import openai
from dotenv import load_dotenv

# Let python-dotenv populate the environment, then pass the API key to the
# OpenAI client. The key is None when OPENAI_API_KEY is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Fine-tuned models keyed by a short label. Each entry holds the model
# identifier and the stop sequence passed with requests to that model.
models = {
    label: {"model": model_id, "stop": stop_sequence}
    for label, model_id, stop_sequence in (
        ("100", "ada:ft-acm-research-password-team:initial-model-test-2022-04-15-22-39-03", "\n\n###\n\n"),
        ("1k", "ada:ft-acm-research-password-team:1k-examples-2022-04-16-17-55-28", "\n"),
        ("10k", "ada:ft-acm-research-password-team:10k-examples-2022-04-16-18-15-21", "\n"),
    )
}

def get_GPT3_completion(prompt, model_key='10k', max_tokens=32, temperature=0.8):
    """Request one completion for `prompt` from a fine-tuned model.

    Args:
        prompt: Prompt text sent to the model.
        model_key: Key into the module-level `models` dict selecting which
            fine-tuned model (and its stop sequence) to use. Defaults to
            '10k', the model this function previously always used.
        max_tokens: Upper bound on generated tokens (default 32).
        temperature: Sampling temperature (default 0.8).

    Returns:
        The `text` field of the first choice in the API response. It is
        returned as received; callers strip whitespace themselves if needed.

    Raises:
        KeyError: If `model_key` is not a key of `models`.
    """
    config = models[model_key]
    response = openai.Completion.create(
        model=config['model'],
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=config['stop'],
    )
    return response['choices'][0]['text']

In [None]:
# Read the password list as an array of strings.
# comments=None turns off np.loadtxt's default handling of '#' as a comment
# marker, which would otherwise cut off or drop any password containing '#'.
top_100_passwords = np.loadtxt('data/top100_passwords.txt', dtype='str', comments=None)
top_100_passwords

In [None]:
import time
from timeit import default_timer as timer
from datetime import timedelta
import Levenshtein
import difflib

In [None]:
# Evaluate the fine-tuned model on a random sample of users: for each row, ask
# the model for up to `max_num_guesses` passwords and record
#   - guesses_model10k:       1-based guess number that matched, 0 if none did
#   - guessed_passwords:      comma-separated guesses made for the row
#   - levenshtein_similarity: best Levenshtein.ratio against the real password
#   - difflib_similarity:     best difflib SequenceMatcher ratio against it
# NOTE(review): rows are drawn from the same `df` the fine-tuning examples were
# sampled from, so some evaluated rows may have been seen in training - confirm.
sample = df.sample(n = 1000)
sample['guesses_model10k'] = 0
sample['guessed_passwords'] = ''
sample['levenshtein_similarity'] = 0.0
sample['difflib_similarity'] = 0.0
count = 0
progress = 10          # print a progress report every `progress` rows
max_num_guesses = 20   # API calls allowed per row
total_rows = len(sample)

# Create the output directory before the long-running loop so a missing folder
# fails immediately rather than when the results are pickled at the very end.
os.makedirs('output', exist_ok=True)

print("Beginning analysis...")
start = timer()

for count, (i, row) in enumerate(sample.iterrows(), start=1):
  # call fine tuned model max_num_guesses times and check if it matches the password
  guesses = []
  num_guesses = 0
  leven_max_similarity = 0.0
  difflib_max_similarity = 0.0
  for j in range(1, max_num_guesses + 1):
    guessed_password = get_GPT3_completion(row['prompt']).lstrip()
    guesses.append(guessed_password)

    # track the best similarity between any guess and the actual password
    leven_similarity = Levenshtein.ratio(row['password'], guessed_password)
    leven_max_similarity = max(leven_max_similarity, leven_similarity)

    difflib_similarity = difflib.SequenceMatcher(None, row['password'], guessed_password).ratio()
    difflib_max_similarity = max(difflib_max_similarity, difflib_similarity)

    # store number of guesses it took to guess the correct password
    if guessed_password == row['password']:
      num_guesses = j
      print(f"Successfully guessed password of row {count} / {total_rows} [index {i}] in {num_guesses} tries\n")
      break

  # comma-separated list of every guess made for this row (no trailing comma)
  guessed_passwords = ','.join(guesses)

  # update row in dataframe
  sample.loc[i, ['guesses_model10k', 'guessed_passwords', 'levenshtein_similarity', 'difflib_similarity']] = num_guesses, guessed_passwords, leven_max_similarity, difflib_max_similarity
  if count % progress == 0:
    now = timer()
    print(f"[{timedelta(seconds=now-start)}] Finished processing row {count} / {total_rows}\nActual password: {row['password']} [Levenshtein Similarity: {leven_max_similarity:.2f}] [DiffLib Similarity: {difflib_max_similarity:.2f}]\nGuessed passwords: {guessed_passwords}\n")

end = timer()
print(f"Analysis Completed. Total time elapsed: {timedelta(seconds=end-start)}")
# export to pickle file, named with the current UTC timestamp
sample.to_pickle(f"output/analysis_{time.strftime('%b_%d_%Y_%H-%M-%S', time.gmtime(time.time()))}.pkl")

### Timing how long it takes to send 100 API Requests
Average API request time = 16.208506 seconds / 100 requests ≈ 0.162 seconds per request

In [None]:
from timeit import default_timer as timer
from datetime import timedelta
# Measure the wall-clock time of n back-to-back completion requests, all sent
# with the prompt of the first row of `sample`.
start = timer()
n = 100
row = sample.iloc[0]

for _ in range(n):
    guessed_password = get_GPT3_completion(row['prompt'])

end = timer()
elapsed = timedelta(seconds=end - start)
print(elapsed)


In [None]:
# Time one linear scan of the password list, comparing each entry with the
# password of a single sampled row.
# comments=None turns off np.loadtxt's default handling of '#' as a comment
# marker, which would otherwise cut off or drop any password containing '#'.
password_list = np.loadtxt('data/top10k_passwords.txt', dtype='str', comments=None)
# NOTE(review): this rebinds `sample`, replacing the frame that the analysis
# cell filled with guess results - use a different name if that is unintended.
sample = df.sample(1000)
row = sample.iloc[0]
start = timer() 
for password in password_list:
  if row['password'] == password:
    pass

end = timer()
print(timedelta(seconds=end-start))
