In [1]:
import duckdb
con = duckdb.connect("../dataset/cert42.duckdb")

#### Aggregate device table features grouped by user

In [28]:
res = con.execute(
    """
CREATE TABLE device_user AS
SELECT
    user,
    COUNT(*) AS total_device_events,

    -- Basic counts
    SUM(CASE WHEN activity = 'connect' THEN 1 ELSE 0 END) AS connect_count,
    SUM(CASE WHEN activity = 'disconnect' THEN 1 ELSE 0 END) AS disconnect_count,
    COUNT(DISTINCT pc) AS unique_pcs_used,

    -- Time-based features
    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0
        END
    ) AS after_hours_connects,

    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0
        END
    ) AS weekend_connects,

    -- Ratios (normalize user behavior)
    CAST(SUM(CASE WHEN activity = 'connect' THEN 1 ELSE 0 END) AS FLOAT) 
      / NULLIF(COUNT(*),0) AS connect_ratio,
    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0 END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS after_hours_ratio,
    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0 END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS weekend_ratio

FROM device
GROUP BY user;
    """ 
).df()

print(res)

   Count
0    265


In [29]:
table_preview = con.execute("""
SELECT * FROM device_user LIMIT 10 
""").df()

print(table_preview)

      user  total_device_events  connect_count  disconnect_count  \
0  ZKS0899                 5968            0.0               0.0   
1  SBN0971                  350            0.0               0.0   
2  BMG0917                 5955            0.0               0.0   
3  ESH0283                  277            0.0               0.0   
4  RSC0089                 5776            0.0               0.0   
5  IIW0249                 6735            0.0               0.0   
6  HCS0003                 2006            0.0               0.0   
7  NWT0098                 1299            0.0               0.0   
8  ATP0662                  784            0.0               0.0   
9  JFC0557                 2785            0.0               0.0   

   unique_pcs_used  after_hours_connects  weekend_connects  connect_ratio  \
0                1                   0.0               0.0            0.0   
1                1                  17.0              66.0            0.0   
2                1  

#### Aggregate logon table features grouped by user

In [31]:
res = con.execute(
    """CREATE TABLE logon_user AS
SELECT
    user,
    COUNT(*) AS total_logon_events,

    -- Basic counts
    SUM(CASE WHEN activity = 'Logon' THEN 1 ELSE 0 END) AS logon_count,
    SUM(CASE WHEN activity = 'Logoff' THEN 1 ELSE 0 END) AS logoff_count,
    COUNT(DISTINCT pc) AS unique_pcs_used,

    -- Time-based features
    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0
        END
    ) AS after_hours_logons,

    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0
        END
    ) AS weekend_logons,

    -- Ratios (normalize user behavior)
    CAST(SUM(CASE WHEN activity = 'Logon' THEN 1 ELSE 0 END) AS FLOAT)
      / NULLIF(COUNT(*),0) AS logon_ratio,
    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0 END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS after_hours_ratio,
    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0 END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS weekend_ratio

FROM logon
GROUP BY user;


    """ 
).df()

print(res)



   Count
0   1000


In [32]:
table_preview = con.execute("""
SELECT * FROM logon_user LIMIT 10 
""").df()

print(table_preview)

      user  total_logon_events  logon_count  logoff_count  unique_pcs_used  \
0  MHH0180                 759        485.0         274.0                1   
1  YIC0195                1174        726.0         448.0                3   
2  RAW0915                1203        762.0         441.0               28   
3  CJM0521                 928        464.0         464.0                1   
4  ALB0892                 470        235.0         235.0                1   
5  EHB0420                1651        991.0         660.0                3   
6  JTM0223                1378        748.0         630.0              361   
7  UIR0043                2860       1430.0        1430.0              631   
8  JAV0361                2768       1384.0        1384.0              625   
9  CCA0046                1608        804.0         804.0              447   

   after_hours_logons  weekend_logons  logon_ratio  after_hours_ratio  \
0               274.0           102.0     0.638999           0.36100

#### Aggregate http table grouped by user

In [34]:
res = con.execute(
    """
   CREATE TABLE http_user AS
    SELECT
        user,
        COUNT(*) AS total_http_events,
        COUNT(DISTINCT pc) AS unique_pcs_used,
        COUNT(DISTINCT url) AS unique_urls_visited,
    
        -- Time-based features
        SUM(
            CASE
                WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                     AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
                THEN 1 ELSE 0
            END
        ) AS after_hours_http,
    
        SUM(
            CASE
                WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                     AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
                THEN 1 ELSE 0
            END
        ) AS weekend_http,
    
        -- Ratios (normalize)
        CAST(SUM(
            CASE
                WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                     AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
                THEN 1 ELSE 0 END
        ) AS FLOAT) / NULLIF(COUNT(*),0) AS after_hours_ratio,
    
        CAST(SUM(
            CASE
                WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                     AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
                THEN 1 ELSE 0 END
        ) AS FLOAT) / NULLIF(COUNT(*),0) AS weekend_ratio
    
    FROM http
    GROUP BY user;

    """ 
).df()

print(res)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

   Count
0   1000


In [35]:
table_preview = con.execute("""
SELECT * FROM http_user LIMIT 10 
""").df()

print(table_preview)

      user  total_http_events  unique_pcs_used  unique_urls_visited  \
0  GMF0738              56052                1                  182   
1  XHW0498              31690                1                  264   
2  MAD0753              56052                1                  327   
3  EPW0434              32870                1                  388   
4  JAD0609              56052                1                  190   
5  DAG0558              32870                1                  304   
6  AKR0057              26463                1                  217   
7  HWY0038              12312                1                  218   
8  KSP0966               3460                1                  166   
9  JAV0361              39377                1                  245   

   after_hours_http  weekend_http  after_hours_ratio  weekend_ratio  
0             453.0           0.0           0.008082            0.0  
1               0.0           0.0           0.000000            0.0  
2       

#### Aggregate email table features grouped by user

In [37]:
res = con.execute(
    """
  CREATE TABLE email_user AS
    SELECT
        user,
        COUNT(*) AS total_emails,                                   -- total email events
        COUNT(DISTINCT pc) AS unique_pcs_used,                      -- distinct machines used for email
        COUNT(DISTINCT "to") AS unique_recipients,                  -- number of unique recipients
        COUNT(DISTINCT cc) AS unique_cc,                            -- unique cc addresses
        COUNT(DISTINCT bcc) AS unique_bcc,                          -- unique bcc addresses
        COUNT(DISTINCT "from") AS unique_senders,                   -- in case multiple accounts used
        SUM(size) AS total_email_size,                              -- total size of emails sent
        AVG(size) AS avg_email_size,                                -- average size
        SUM(attachments) AS total_attachments,                      -- total attachments
        AVG(attachments) AS avg_attachments,                        -- average attachments per email
        SUM(CASE WHEN attachments > 0 THEN 1 ELSE 0 END) AS emails_with_attachments -- how many had attachments
    FROM email
    GROUP BY user;


    """ 
).df()

print(res)

   Count
0   1000


In [38]:
table_preview = con.execute("""
SELECT * FROM email_user LIMIT 10 
""").df()

print(table_preview)

      user  total_emails  unique_pcs_used  unique_recipients  unique_cc  \
0  AJR0319          6434                1               2778        746   
1  AMW0392          3956                1               1552        422   
2  AHC0142          3382                1                857        301   
3  SMH0291           467                1                290        111   
4  DRR0242          3836                1               1479        422   
5  LBH0942          3918                1               1551        374   
6  BSS0369          2300                1               1048        294   
7  BVC0790          3768                1               1740        499   
8  IIL0513          1329                1                694        186   
9  MCF0600           239                1                163         62   

   unique_bcc  unique_senders  total_email_size  avg_email_size  \
0           2               2       194943753.0    30298.997979   
1           2               2       1182

#### Aggregate file table features group by user

In [43]:
res = con.execute(
    """
CREATE TABLE file_user AS
SELECT
    user,
    COUNT(*) AS total_file_events,                   -- total file activities
    COUNT(DISTINCT pc) AS unique_pcs_used,           -- distinct machines used
    COUNT(DISTINCT filename) AS unique_files,        -- unique files accessed
    COUNT(DISTINCT content) AS unique_file_types,    -- unique file categories / content keywords

    -- File type counts
    SUM(CASE WHEN filename LIKE '%.exe'  THEN 1 ELSE 0 END) AS exe_files_accessed,
    SUM(CASE WHEN filename LIKE '%.zip'  THEN 1 ELSE 0 END) AS zip_files_accessed,
    SUM(CASE WHEN filename LIKE '%.pdf'  THEN 1 ELSE 0 END) AS pdf_files_accessed,
    SUM(CASE WHEN filename LIKE '%.docx' THEN 1 ELSE 0 END) AS docx_files_accessed,
    SUM(CASE WHEN filename LIKE '%.xlsx' THEN 1 ELSE 0 END) AS xlsx_files_accessed,

    -- Time-based features
    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0
        END
    ) AS after_hours_file_events,

    SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0
        END
    ) AS weekend_file_events,

    -- Ratios for time-based behavior
    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%H', strptime(date, '%m/%d/%Y %H:%M:%S')) NOT BETWEEN '08' AND '18'
            THEN 1 ELSE 0
        END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS after_hours_ratio,

    CAST(SUM(
        CASE
            WHEN CAST(strptime(date, '%m/%d/%Y %H:%M:%S') AS TIMESTAMP) IS NOT NULL
                 AND strftime('%w', strptime(date, '%m/%d/%Y %H:%M:%S')) IN ('0','6')
            THEN 1 ELSE 0
        END
    ) AS FLOAT) / NULLIF(COUNT(*),0) AS weekend_ratio,

    -- File type ratios
    CAST(SUM(CASE WHEN filename LIKE '%.exe'  THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*),0) AS exe_ratio,
    CAST(SUM(CASE WHEN filename LIKE '%.zip'  THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*),0) AS zip_ratio,
    CAST(SUM(CASE WHEN filename LIKE '%.pdf'  THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*),0) AS pdf_ratio,
    CAST(SUM(CASE WHEN filename LIKE '%.docx' THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*),0) AS docx_ratio,
    CAST(SUM(CASE WHEN filename LIKE '%.xlsx' THEN 1 ELSE 0 END) AS FLOAT) / NULLIF(COUNT(*),0) AS xlsx_ratio

FROM file
GROUP BY user;

    """ 
).df()

print(res)

   Count
0    264


In [44]:
table_preview = con.execute("""
SELECT * FROM file_user LIMIT 10 
""").df()

print(table_preview)

      user  total_file_events  unique_pcs_used  unique_files  \
0  DAL0673                352                1           352   
1  CLB0774               7246                1          7246   
2  BDI0533               2458                1          2458   
3  SAA0999               8468                1          8468   
4  MLM0950               2752                1          2752   
5  BJM0111               7841                1          7841   
6  HSB0196              11627                1         11627   
7  ICH0294               2509                1          2509   
8  WXH0872                826                1           826   
9  HWW0436               1245                1          1245   

   unique_file_types  exe_files_accessed  zip_files_accessed  \
0                335                 1.0                21.0   
1               6871                46.0               368.0   
2               2341                14.0               124.0   
3               8041                66.

#### Modify the structure of psychometric table (doesn't need any aggregation as it is already in the req format) 

In [47]:
res = con.execute(
    """
    CREATE TABLE psychometric_user AS
    SELECT
        user_id AS user,
        employee_name,
        O AS openness,
        C AS conscientiousness,
        E AS extraversion,
        A AS agreeableness,
        N AS neuroticism
    FROM psychometric;

    """ 
).df()

print(res)

   Count
0   1000


In [48]:
table_preview = con.execute("""
SELECT * FROM psychometric_user LIMIT 10 
""").df()

print(table_preview)

      user            employee_name  openness  conscientiousness  \
0  CEL0561         Calvin Edan Love        40                 39   
1  CRD0624  Christine Reagan Deleon        26                 22   
2  JFC0557    Jade Felicia Caldwell        22                 16   
3  ASD0577   Aquila Stewart Dejesus        40                 48   
4  MAR0955        Micah Abdul Rojas        36                 44   
5  GRM0868  Gail Rhiannon Mcconnell        21                 25   
6  AAL0706         April Alika Levy        37                 14   
7  RVC0232      Rama Vielka Clayton        34                 20   
8  TCD0009       Tasha Casey Dalton        44                 28   
9  ASM0575     Aurora Sarah Manning        42                 25   

   extraversion  agreeableness  neuroticism  
0            36             19           40  
1            17             39           32  
2            23             40           33  
3            36             14           37  
4            23      

#### Aggregate all the 18 monnths of LDAP tables into one unified LDAP table

In [63]:
res = con.execute(
    """
CREATE TABLE ldap_all AS
SELECT *, '2009-12' AS month FROM "LDAP_2009-12"
UNION ALL
SELECT *, '2010-01' AS month FROM "LDAP_2010-01"
UNION ALL
SELECT *, '2010-02' AS month FROM "LDAP_2010-02"
UNION ALL
SELECT *, '2010-03' AS month FROM "LDAP_2010-03"
UNION ALL
SELECT *, '2010-04' AS month FROM "LDAP_2010-04"
UNION ALL
SELECT *, '2010-05' AS month FROM "LDAP_2010-05"
UNION ALL
SELECT *, '2010-06' AS month FROM "LDAP_2010-06"
UNION ALL
SELECT *, '2010-07' AS month FROM "LDAP_2010-07"
UNION ALL
SELECT *, '2010-08' AS month FROM "LDAP_2010-08"
UNION ALL
SELECT *, '2010-09' AS month FROM "LDAP_2010-09"
UNION ALL
SELECT *, '2010-10' AS month FROM "LDAP_2010-10"
UNION ALL
SELECT *, '2010-11' AS month FROM "LDAP_2010-11"
UNION ALL
SELECT *, '2010-12' AS month FROM "LDAP_2010-12"
UNION ALL
SELECT *, '2011-01' AS month FROM "LDAP_2011-01"
UNION ALL
SELECT *, '2011-02' AS month FROM "LDAP_2011-02"
UNION ALL
SELECT *, '2011-03' AS month FROM "LDAP_2011-03"
UNION ALL
SELECT *, '2011-04' AS month FROM "LDAP_2011-04"
UNION ALL
SELECT *, '2011-05' AS month FROM "LDAP_2011-05";


    """ 
).df()

print(res)

   Count
0  16743


In [64]:
table_preview = con.execute("""
SELECT * FROM ldap_all LIMIT 10 
""").df()

print(table_preview)

             employee_name  user_id                             email  \
0         Calvin Edan Love  CEL0561         Calvin.Edan.Love@dtaa.com   
1  Christine Reagan Deleon  CRD0624  Christine.Reagan.Deleon@dtaa.com   
2    Jade Felicia Caldwell  JFC0557    Jade.Felicia.Caldwell@dtaa.com   
3   Aquila Stewart Dejesus  ASD0577   Aquila.Stewart.Dejesus@dtaa.com   
4        Micah Abdul Rojas  MAR0955        Micah.Abdul.Rojas@dtaa.com   
5  Gail Rhiannon Mcconnell  GRM0868  Gail.Rhiannon.Mcconnell@dtaa.com   
6         April Alika Levy  AAL0706         April.Alika.Levy@dtaa.com   
7      Rama Vielka Clayton  RVC0232      Rama.Vielka.Clayton@dtaa.com   
8       Tasha Casey Dalton  TCD0009       Tasha.Casey.Dalton@dtaa.com   
9     Aurora Sarah Manning  ASM0575     Aurora.Sarah.Manning@dtaa.com   

                      role  business_unit             functional_unit  \
0       ComputerProgrammer              1  2 - ResearchAndEngineering   
1                 Salesman              1       5 

#### Aggregate ldap_all table features grouped by user 

In [65]:
res = con.execute(
    """
    CREATE TABLE ldap_user AS
    SELECT
        user_id AS user,
        MAX(employee_name) AS employee_name,
    
        -- Counts of distinct attributes over 18 months
        COUNT(DISTINCT role) AS unique_roles,
        COUNT(DISTINCT business_unit) AS unique_business_units,
        COUNT(DISTINCT functional_unit) AS unique_functional_units,
        COUNT(DISTINCT department) AS unique_departments,
        COUNT(DISTINCT team) AS unique_teams,
        COUNT(DISTINCT supervisor) AS unique_supervisors,
    
        -- Flags for any changes during 18 months
        CASE WHEN COUNT(DISTINCT role) > 1 THEN 1 ELSE 0 END AS role_changed,
        CASE WHEN COUNT(DISTINCT department) > 1 THEN 1 ELSE 0 END AS department_changed,
        CASE WHEN COUNT(DISTINCT team) > 1 THEN 1 ELSE 0 END AS team_changed
    
    FROM ldap_all
    GROUP BY user_id;
    

    """ 
).df()

print(res)

   Count
0   1000


In [73]:
table_preview = con.execute("""
SELECT * FROM ldap_user LIMIT 10 
""").df()

print(table_preview)

      user           employee_name  unique_roles  unique_business_units  \
0  ASD0577  Aquila Stewart Dejesus             1                      1   
1  MDM0625  Melyssa Deirdre Morgan             1                      1   
2  BER0314  Bianca Elizabeth Roach             1                      1   
3  YCE0983   Yardley Cairo Elliott             1                      1   
4  GHL0460    Geoffrey Harlan Lara             1                      1   
5  OQH0183      Olivia Quynn Hardy             1                      1   
6  KSP0357     Kennedy Shad Parker             1                      1   
7  FEB0306    Flynn Edward Brennan             1                      1   
8  NKH0807   Nicolas Kadeem Hayden             1                      1   
9  LDM0587      Lila Dana Mcintyre             1                      1   

   unique_functional_units  unique_departments  unique_teams  \
0                        1                   1             1   
1                        1                   1

#### Merging all the aggregated tables into one master table for ML model training

In [85]:
res = con.execute(
    """
CREATE TABLE user_features AS
SELECT
    l.user AS user_id,
    l.total_logon_events,
    l.logon_count,
    l.logoff_count,
    l.unique_pcs_used AS logon_unique_pcs,
    l.after_hours_logons,
    l.weekend_logons,
    l.logon_ratio,
    l.after_hours_ratio AS logon_after_hours_ratio,
    l.weekend_ratio AS logon_weekend_ratio,

    d.total_device_events,
    d.connect_count,
    d.disconnect_count,
    d.unique_pcs_used AS device_unique_pcs,
    d.after_hours_connects,
    d.weekend_connects,
    d.connect_ratio,
    d.after_hours_ratio AS device_after_hours_ratio,
    d.weekend_ratio AS device_weekend_ratio,

    h.total_http_events,
    h.unique_pcs_used AS http_unique_pcs,
    h.unique_urls_visited,
    h.after_hours_http,
    h.weekend_http,
    h.after_hours_ratio AS http_after_hours_ratio,
    h.weekend_ratio AS http_weekend_ratio,

    e.total_emails,
    e.unique_pcs_used AS email_unique_pcs,
    e.unique_recipients,
    e.unique_cc,
    e.unique_bcc,
    e.unique_senders,
    e.total_email_size,
    e.avg_email_size,
    e.total_attachments,
    e.avg_attachments,
    e.emails_with_attachments,

    f.total_file_events,
    f.unique_pcs_used AS file_unique_pcs,
    f.unique_files,
    f.unique_file_types,
    f.exe_files_accessed,
    f.zip_files_accessed,
    f.pdf_files_accessed,
    f.docx_files_accessed,
    f.xlsx_files_accessed,
    f.after_hours_file_events,
    f.weekend_file_events,
    f.after_hours_ratio AS file_after_hours_ratio,
    f.weekend_ratio AS file_weekend_ratio,
    f.exe_ratio,
    f.zip_ratio,
    f.pdf_ratio,
    f.docx_ratio,
    f.xlsx_ratio,

    p.employee_name AS psychometric_employee_name,
    p.openness,
    p.conscientiousness,
    p.extraversion,
    p.agreeableness,
    p.neuroticism,

    ldap.employee_name AS ldap_employee_name,
    ldap.unique_roles,
    ldap.unique_business_units,
    ldap.unique_functional_units,
    ldap.unique_departments,
    ldap.unique_teams,
    ldap.unique_supervisors,
    ldap.role_changed,
    ldap.department_changed,
    ldap.team_changed

FROM logon_user l
LEFT JOIN device_user d ON l.user = d.user
LEFT JOIN http_user h ON l.user = h.user
LEFT JOIN email_user e ON l.user = e.user
LEFT JOIN file_user f ON l.user = f.user
LEFT JOIN psychometric_user p ON l.user = p.user
LEFT JOIN ldap_user ldap ON l.user = ldap.user;


    """ 
).df()

print(res)

   Count
0   1000


In [90]:
table_preview = con.execute("""
SELECT * FROM user_features LIMIT 5
""").df()

print(table_preview.columns)

Index(['user_id', 'total_logon_events', 'logon_count', 'logoff_count',
       'logon_unique_pcs', 'after_hours_logons', 'weekend_logons',
       'logon_ratio', 'logon_after_hours_ratio', 'logon_weekend_ratio',
       'total_device_events', 'connect_count', 'disconnect_count',
       'device_unique_pcs', 'after_hours_connects', 'weekend_connects',
       'connect_ratio', 'device_after_hours_ratio', 'device_weekend_ratio',
       'total_http_events', 'http_unique_pcs', 'unique_urls_visited',
       'after_hours_http', 'weekend_http', 'http_after_hours_ratio',
       'http_weekend_ratio', 'total_emails', 'email_unique_pcs',
       'unique_recipients', 'unique_cc', 'unique_bcc', 'unique_senders',
       'total_email_size', 'avg_email_size', 'total_attachments',
       'avg_attachments', 'emails_with_attachments', 'total_file_events',
       'file_unique_pcs', 'unique_files', 'unique_file_types',
       'exe_files_accessed', 'zip_files_accessed', 'pdf_files_accessed',
       'docx_files_acc