In [0]:
#install required pandas SQL library
pip install pandasql

In [0]:
import glob #library to walk directory paths
import re #Regex library
import os    #library to join path names

In [0]:
leaker_path =  './drive/My Drive/DatosInsider/answers/r4.2-1'
thief_path = './drive/My Drive/DatosInsider/answers/r4.2-2'
sabotage_path = './drive/My Drive/DatosInsider/answers/r4.2-3'
act_path = './drive/My Drive/DatosInsider/r4.2/' #with activity info
dwh_path ='./drive/My Drive/DatosInsider/DWH_tables/' #final repository path

In [0]:
#Get all domains used by the identified insiders

def get_insider_domains(path):
  all_files = glob.glob(path + "/*.csv")
  all_insider_domains=set([])
  #read all files in the path and separate into two kind of DF, one for http
  #and one for email  
  for filename in all_files:
    insider_file = open(filename, "r")    
    for line in insider_file:
      fields=line.split(",")   
      if fields[0]=="http":                  
        all_insider_domains.add(re.findall('^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)',fields[5])[0])            
      elif fields[0]== "email":
        for idx in range(5,9): 
          if fields[idx] != '':    
            emails  =fields[idx].split(';')
            for email in emails:
                all_insider_domains.add(email.split('@')[1])           
                #all_insider_domains.add(fields[idx].split('@')[1])              
    insider_file.close()
    try:
      all_insider_domains.remove('dtaa.com')  #Domain of the same fictional company
    except:
      continue
  return all_insider_domains

#Leaker files contain URL that insider used to shared information
all_leaker_domains=get_insider_domains(leaker_path)

#Thief files contain the URL that the employees surfed looking for employment
# from a job websit or from the competitors
# also emails sent to the competitors
all_thief_domains=get_insider_domains(thief_path)

#sabotage files include logs to malware or keylogger
all_sabotage_domains=get_insider_domains(sabotage_path)

In [0]:
#Get all domains for all users

#http.csv
#* Fields: id, date, user, pc, url, content
http_file = open(os.path.join(act_path,'/http.csv'), "r")
all_users_domains=set([])
for line in http_file:
  fields=line.split(",")
  all_users_domains.add(re.findall('^(?:https?:\/\/)?(?:[^@\/\n]+@)?(?:www\.)?([^:\/?\n]+)',fields[4])[0])      
http_file.close()    

#email.csv
#* Fields: id, date, user, pc, to, cc, bcc, from, size, attachment_count, content
email_file = open(os.path.join(act_path,'email.csv'), "r")
for line in email_file:
  fields=line.split(",")
  for idx in range(4,8): 
    try:               
      emails  =fields[idx].split(';')
      for email in emails:
        all_users_domains.add(email.split('@')[1])
    except:
      continue

email_file.close()   

#Remove incorrect domain names
try:
  all_users_domains.remove('dtaa.com') #Domain of the same fictional company
  all_users_domains.remove(' ') 
  all_users_domains.remove('') 
except:
  print("Error")

Error


In [0]:
import pandas as pd
from pandasql import sqldf

def add_domain_info(domain_df, bl_domains_df):  
  results=pd.merge(domain_df, bl_domains_df,left_on='domain', right_on='name', how='left')
  results.drop_duplicates('domain', keep='first',inplace=True)
  miss_domain=results[results.isnull().any(axis=1)]
  results=results.dropna()
  results.reset_index(drop=True, inplace=True)
  q = """select ms.domain, bl.* from miss_domain ms left join bl_domains_df bl on bl.name like '%.' || ms.domain || '%' ;"""
  #pysqldf = lambda q: sqldf(q, locals())
  #df=pysqldf(q)
  df=sqldf(q, locals())
  df.drop_duplicates('domain', keep='first',inplace=True)
  results=results.append(df, ignore_index=True)
  return results

#Load Black list domain from file
bl_domains=pd.read_csv(os.path.join(dwh_path,'BL_domains.csv'))

all_leaker_domains= pd.DataFrame(all_leaker_domains, columns=['domain'])
all_leaker_domains= add_domain_info(all_leaker_domains,bl_domains)
all_leaker_domains.to_csv(os.path.join(dwh_path,'all_leaker_domains_classified.csv'), index=False)

all_thief_domains= pd.DataFrame(all_thief_domains, columns=['domain'])
all_thief_domains= add_domain_info(all_thief_domains,bl_domains)
all_thief_domains.to_csv(os.path.join(dwh_path,'/all_thief_domains_classified.csv'), index=False)

all_sabotage_domains= pd.DataFrame(all_sabotage_domains, columns=['domain'])
all_sabotage_domains=add_domain_info(all_sabotage_domains,bl_domains)
all_sabotage_domains.to_csv(os.path.join(dwh_path,'/all_sabotage_domains_classified.csv'), index=False)

all_users_domains= pd.DataFrame(all_users_domains, columns=['domain'])
all_users_domains= add_domain_info(all_users_domains,bl_domains)
#all_users_domains.to_csv('./drive/My Drive/DatosInsider/DWH_tables/all_users_domains_classified.csv', index=False)

#Final catalogue was manually updated by adding an extra column 'Competitor' were it was inferred from the data
# that planes and technology websites are competitors