In [1303]:
# Print your Emory ID here
print("Emory ID: 2643551")

Emory ID: 2643551


In [1304]:
# Import the necessary libraries and load the data
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re as r
data = pd.read_csv("passwords.csv")

1. An online password attack is when someone tries to hack your account by simply trying a very large number of username/password combinations to access your account. For each `password` in our dataset, the `value` column shows the amount of time it is estimated to take an “online password attack” to hack your account. The column `time_unit` shows the units of that time value (e.g., hours, days, years, etc.). It would be much nicer if our values were of the same units so we can more easily compare the “online password guessing time” for each password. So your first task is to convert all of the values to units of hours (assume the conversion units provided below, e.g., 1 day is 24 hours, 1 week is 168 hours, etc).

In [1305]:
# Your answer here 

units = {
    "seconds": 1 / 3600,
    "minutes": 1 / 60,
    "days": 24,
    "weeks": 168,
    "months": 720,
    "years": 8760,
}
data['value'] = data.apply(lambda row: row['value'] * units.get(row['time_unit'], 1), axis=1)

2. Are there any missing values in the dataset? If so, how many? After you have counted the, please drop them from the dataset.

In [1306]:
# Your answer here
missing_per_column = data.isnull().sum()
print("Missing values per column:\n", missing_per_column)
data = data.dropna()

Missing values per column:
 rank                 7
password             7
category             7
value                7
time_unit            7
offline_crack_sec    7
rank_alt             7
strength             7
font_size            7
dtype: int64


3. Which password category has the highest average strength?


In [1307]:
# Your answer here
average_strength = data.groupby('category').agg(av = ('strength','mean'))
highest_value_row = average_strength.loc[average_strength['av'].idxmax()]
highest_value_row

av    14.0
Name: nerdy-pop, dtype: float64

4. What is the rank of the password 'qwerty'?

In [1308]:
# Your answer here
qwerty_password = "qwerty"
qwerty = data.query('password == @qwerty_password')
qwerty['rank_alt']

4    5.0
Name: rank_alt, dtype: float64

5. How many passwords have more than 8 characters?

In [1309]:
# Your answer here
long_passwords_count = data[data['password'].str.len() > 8].shape[0]
long_passwords_count

1

6. How many passwords do not contain a number?

In [1310]:
# Your answer here
no_number_passwords_count = data[~data['password'].str.contains(r'\d', regex = True)].shape[0]
no_number_passwords_count

446

7. What is the average time in hours needed to crack these passwords that begin with `123`? How does this compare to the average of all passwords in the dataset?

In [1311]:
# Your answer here
average_hours_to_crack_123 = data[data['password'].str.startswith("123")]['value'].mean()
average_hours_to_crack = data['value'].mean()
print(average_hours_to_crack)
print(average_hours_to_crack_123)
#The one that starts with 123 takes less time to crack on average

13917.565247227776
107.30332438271606


8. What percentage of passwords are of the `simple-alphanumeric` category?

In [1312]:
# Your answer here
a = 'simple-alphanumeric'
number_simple_alphanumeric = data.query('category == @a')['category'].count()
percentage = (number_simple_alphanumeric/data['password'].count())*100
print(percentage)

12.2


9. How many passwords have a strength rating below 5? What is their percentage?

In [1313]:
# Your answer here
strength_rating_below_5 = data.query('strength < 5')['password'].count()
print(strength_rating_below_5)
percentage_strength_rating_below_5 = (strength_rating_below_5/data['password'].count())*100
print(percentage_strength_rating_below_5)

73
14.6


10. What is the most common password in the dataset?

In [1314]:
# Your answer here
most_common_password = data['password'].mode()[0]
most_common_password

'1111'

11. How many unique passwords are there in each category, and what is their average length?

In [1315]:
# Your answer here
unique_passwords_per_category = data.groupby('category')['password'].nunique()
print(unique_passwords_per_category)
unique_passwords = data.groupby('category')['password'].unique()
password_lengths = unique_passwords.apply(lambda passwords: [len(p) for p in passwords])
average_length_per_category = password_lengths.apply(lambda lengths: sum(lengths) / len(lengths))
print(average_length_per_category)

category
animal                  29
cool-macho              79
fluffy                  44
food                    11
name                   183
nerdy-pop               30
password-related        15
rebellious-rude         11
simple-alphanumeric     61
sport                   37
Name: password, dtype: int64
category
animal                 6.206897
cool-macho             6.253165
fluffy                 5.795455
food                   6.090909
name                   6.218579
nerdy-pop              6.633333
password-related       6.333333
rebellious-rude        6.363636
simple-alphanumeric    5.934426
sport                  6.513514
Name: password, dtype: float64


12. What is the median rank and strength for passwords of each length?

In [1316]:
data['password_length'] = data['password'].str.len()
median_rank_strength = data.groupby('password_length').agg({'rank': 'median', 'strength': 'median'})
median_rank_strength

Unnamed: 0_level_0,rank,strength
password_length,Unnamed: 1_level_1,Unnamed: 2_level_1
4,326.0,6.0
5,343.0,7.0
6,231.0,7.0
7,222.0,8.0
8,286.0,8.0
9,49.0,4.0


13. Add a column `strength_label` with values `weak` if strength is below 5, else `strong`. Use a lambda function.

In [1317]:
# Your answer here
data['strength_label'] = data['strength'].apply(lambda x:'weak' if x<5 else 'strong')

14. Create a column `high_value` with a value of `True` if value is in the top 10% of values in the dataset.

In [1318]:
# Your answer here
threshold = data['strength'].quantile(0.90)
data['high_value'] = data['value'].apply(lambda x: True if x>threshold else False)

15. Join all passwords within each `category` into a single string.

In [1319]:
# Your answer here
joined_passwords_per_category = data.groupby('category')['password'].agg(lambda x: ''.join(x)).reset_index()
joined_passwords_per_category

Unnamed: 0,category,password
0,animal,dragonmonkeybigdogfalconphoenixtigerschickenbu...
1,cool-macho,mustangshadowmasterharleyhunterrangerbusterkil...
2,fluffy,lovesunshinesilverorangegingersummerprincessdi...
3,food,peppercheesecoffeebananabuttermuffinhotdogappl...
4,name,michaeljenniferjordansupermanthomastiggerrober...
5,nerdy-pop,starwarscomputermerlinmatrixsnoopyboomercompaq...
6,password-related,passwordletmeintestpassaccessgatewaypleasewelc...
7,rebellious-rude,bitemefreedomsecretwhateverbadboysexsexbutthea...
8,simple-alphanumeric,123456123456781234qwerty12345696969abc12311111...
9,sport,baseballfootballsoccerhockeydallasyankeesgolfe...


**Bonus question 01**: What is the correlation between password length and strength? Create a scatter plot with regression line.

In [1320]:
# Your answer here

**Bonus question 02**: Find all passwords with palindromes.

In [1321]:
# Your answer here