In [1]:
# import required packages
import pandas as pd

import statsmodels.api as sm

In [2]:
# read the data
df = pd.read_csv('D:\\Password-Analysis\\Datasets\\top_200_password_2020_by_country_extended.csv')
df.head()

Unnamed: 0,country_code,country,Rank,Password,User_count,Time_to_crack,Global_rank,Time_to_crack_in_seconds,length,unique_chars,numbers,symbols,symbols_and_numbers,sequentials
0,cn,China,1,123456,8159358,< 1 second,1.0,0,6,6,6,0,6,0
1,cn,China,2,123456789,1817250,< 1 second,2.0,0,9,9,9,0,9,0
2,cn,China,3,12345678,700019,< 1 second,6.0,0,8,8,8,0,8,0
3,cn,China,4,654321,245827,< 1 second,23.0,0,6,6,6,0,6,0
4,cn,China,5,1234567890,210168,< 1 second,9.0,0,10,10,10,0,10,0


In [3]:
# creating dummy variables for country column since it is categorical data
df = pd.get_dummies(df, columns=['country'])

In [4]:
df.columns

Index(['country_code', 'Rank', 'Password', 'User_count', 'Time_to_crack',
       'Global_rank', 'Time_to_crack_in_seconds', 'length', 'unique_chars',
       'numbers', 'symbols', 'symbols_and_numbers', 'sequentials',
       'country_China', 'country_Russia', 'country_Spain',
       'country_United States', 'country_Vietnam'],
      dtype='object')

## Linear Model for Length Attribute

In [5]:
# defining the X and Y 
x_length = df[['country_China', 'country_Russia', 'country_Spain',
       'country_United States', 'country_Vietnam']]
y_length = df['length']

### Linear Model with Standard Errors

In [6]:
# adding the constant term
x_length = sm.add_constant(x_length)
  
# performing the regression and fitting the model
length_model_fit_1 = sm.WLS(y_length, x_length, df['User_count']).fit()
  
# printing the summary table
print(length_model_fit_1.summary())

                            WLS Regression Results                            
Dep. Variable:                 length   R-squared:                       0.016
Model:                            WLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     3.932
Date:                Sun, 13 Mar 2022   Prob (F-statistic):            0.00357
Time:                        12:32:40   Log-Likelihood:                -2827.7
No. Observations:                1000   AIC:                             5665.
Df Residuals:                     995   BIC:                             5690.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     5.76

  x = pd.concat(x[::order], 1)


### Linear Model with Robust Standard Errors

In [7]:
# adding the constant term
x_length = sm.add_constant(x_length)
  
# performing the regression and fitting the model
length_model_fit_2 = sm.WLS(y_length, x_length, df['User_count']).fit(cov_type='HC3')
  
# printing the summary table
print(length_model_fit_2.summary())

                            WLS Regression Results                            
Dep. Variable:                 length   R-squared:                       0.016
Model:                            WLS   Adj. R-squared:                  0.012
Method:                 Least Squares   F-statistic:                     425.6
Date:                Sun, 13 Mar 2022   Prob (F-statistic):          3.62e-244
Time:                        12:32:40   Log-Likelihood:                -2827.7
No. Observations:                1000   AIC:                             5665.
Df Residuals:                     995   BIC:                             5690.
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     5.76

  x = pd.concat(x[::order], 1)


## Linear model for Unique Characters Attribute

In [8]:
# defining the X and Y 
x_unique_chars = df[['country_China', 'country_Russia', 'country_Spain',
       'country_United States', 'country_Vietnam']]
y_unique_chars = df['unique_chars']

### Linear Model with Standard Errors

In [9]:
# adding the constant term
x_unique_chars = sm.add_constant(x_unique_chars)
  
# performing the regression and fitting the model
unique_chars_model_fit_1 = sm.WLS(y_unique_chars, x_unique_chars, df['User_count']).fit()
  
# printing the summary table
print(unique_chars_model_fit_1.summary())

                            WLS Regression Results                            
Dep. Variable:           unique_chars   R-squared:                       0.037
Model:                            WLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     9.576
Date:                Sun, 13 Mar 2022   Prob (F-statistic):           1.34e-07
Time:                        12:32:40   Log-Likelihood:                -2971.7
No. Observations:                1000   AIC:                             5953.
Df Residuals:                     995   BIC:                             5978.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     5.31

  x = pd.concat(x[::order], 1)


### Linear Model with Robust Standard Errors

In [10]:
# adding the constant term
x_unique_chars = sm.add_constant(x_unique_chars)
  
# performing the regression and fitting the model
unique_chars_model_fit_2 = sm.WLS(y_unique_chars, x_unique_chars, df['User_count']).fit(cov_type='HC3')
  
# printing the summary table
print(unique_chars_model_fit_2.summary())

                            WLS Regression Results                            
Dep. Variable:           unique_chars   R-squared:                       0.037
Model:                            WLS   Adj. R-squared:                  0.033
Method:                 Least Squares   F-statistic:                     440.2
Date:                Sun, 13 Mar 2022   Prob (F-statistic):          3.52e-249
Time:                        12:32:40   Log-Likelihood:                -2971.7
No. Observations:                1000   AIC:                             5953.
Df Residuals:                     995   BIC:                             5978.
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     5.31

  x = pd.concat(x[::order], 1)


## Linear Model for Number of #s Attribute

In [11]:
# defining the X and Y 
x_numbers = df[['country_China', 'country_Russia', 'country_Spain',
       'country_United States', 'country_Vietnam']]
y_numbers = df['numbers']

### Linear Model with Standard Errors

In [12]:
# adding the constant term
x_numbers = sm.add_constant(x_numbers)
  
# performing the regression and fitting the model
numbers_model_fit_1 = sm.WLS(y_numbers, x_numbers, df['User_count']).fit()
  
# printing the summary table
print(numbers_model_fit_1.summary())

  x = pd.concat(x[::order], 1)


                            WLS Regression Results                            
Dep. Variable:                numbers   R-squared:                       0.073
Model:                            WLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     19.62
Date:                Sun, 13 Mar 2022   Prob (F-statistic):           1.47e-15
Time:                        12:32:40   Log-Likelihood:                -3549.6
No. Observations:                1000   AIC:                             7109.
Df Residuals:                     995   BIC:                             7134.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     3.45

### Linear Model with Robust Standard Errors

In [13]:
# adding the constant term
x_numbers = sm.add_constant(x_numbers)
  
# performing the regression and fitting the model
numbers_model_fit_2 = sm.WLS(y_numbers, x_numbers, df['User_count']).fit(cov_type='HC3')
  
# printing the summary table
print(numbers_model_fit_2.summary())

                            WLS Regression Results                            
Dep. Variable:                numbers   R-squared:                       0.073
Model:                            WLS   Adj. R-squared:                  0.069
Method:                 Least Squares   F-statistic:                     45.84
Date:                Sun, 13 Mar 2022   Prob (F-statistic):           1.11e-42
Time:                        12:32:40   Log-Likelihood:                -3549.6
No. Observations:                1000   AIC:                             7109.
Df Residuals:                     995   BIC:                             7134.
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     3.45

  x = pd.concat(x[::order], 1)


## Linear Model for Sequentials Attribute

In [14]:
# defining the X and Y 
x_sequentials = df[['country_China', 'country_Russia', 'country_Spain',
       'country_United States', 'country_Vietnam']]
y_sequentials = df['sequentials']

### Linear Model with Standard Errors

In [15]:
# adding the constant term
x_sequentials = sm.add_constant(x_sequentials)
  
# performing the regression and fitting the model
sequentials_model_fit_1 = sm.WLS(y_sequentials, x_sequentials, df['User_count']).fit()
  
# printing the summary table
print(sequentials_model_fit_1.summary())

  x = pd.concat(x[::order], 1)


                            WLS Regression Results                            
Dep. Variable:            sequentials   R-squared:                       0.027
Model:                            WLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     6.862
Date:                Sun, 13 Mar 2022   Prob (F-statistic):           1.89e-05
Time:                        12:32:40   Log-Likelihood:                -1942.0
No. Observations:                1000   AIC:                             3894.
Df Residuals:                     995   BIC:                             3919.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     0.17

### Linear Model with Robust Standard Errors

In [16]:
# adding the constant term
x_sequentials = sm.add_constant(x_sequentials)
  
# performing the regression and fitting the model
sequentials_model_fit_2 = sm.WLS(y_sequentials, x_sequentials, df['User_count']).fit(cov_type='HC3')
  
# printing the summary table
print(sequentials_model_fit_2.summary())

                            WLS Regression Results                            
Dep. Variable:            sequentials   R-squared:                       0.027
Model:                            WLS   Adj. R-squared:                  0.023
Method:                 Least Squares   F-statistic:                     5.103
Date:                Sun, 13 Mar 2022   Prob (F-statistic):           0.000126
Time:                        12:32:41   Log-Likelihood:                -1942.0
No. Observations:                1000   AIC:                             3894.
Df Residuals:                     995   BIC:                             3919.
Df Model:                           4                                         
Covariance Type:                  HC3                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     0.17

  x = pd.concat(x[::order], 1)
