# Administering and Securing Organizations


## Identifying Patterns and Anomalies

### Set Up the Environment

In [None]:
# Silence every Python warning for the rest of the session.
# NOTE: this also hides deprecation warnings raised by the libraries used below.
import warnings
warnings.simplefilter('ignore')

In [None]:
import os
import datetime
import tempfile
import pandas as pd
import numpy as np
import arcgis
from arcgis.gis import GIS
import seaborn as sns
import matplotlib.pyplot as plt
from arcgis._impl.common._utils import chunks

### Access the Organization

In [None]:
# Connect to the organization through a stored credential profile.
# NOTE(review): 'your_online_profile' looks like a placeholder name — replace with a real profile.
gis = GIS(profile='your_online_profile')

### Accessing Event Feature Layer as SeDF

In [None]:
# Find the feature layer item that stores the previously processed events.
results = gis.content.search("Threat Detection Data", item_type="Feature Layer")
if not results:
    # Same exception type as indexing an empty list, but with an actionable message.
    raise IndexError('No Feature Layer item matching "Threat Detection Data" was found')
item = results[0]
item

##### Acquire the Last Date in SeDF

In [None]:
# Flag is set here but not read anywhere else in the visible notebook.
clean_up = True
# First layer of the item; later cells query it and send edits to it.
lyrs = item.layers[0]

In [None]:
# Load every stored event as a DataFrame. On failure fall back to None, which
# later cells treat as "nothing stored yet".
try:
    sdf_storage = lyrs.query(as_df=True)
except Exception as query_error:
    # Report the failure instead of hiding it: a silent fallback would make
    # the notebook re-download and re-add the full history window.
    print(f"Could not read stored events, continuing without them: {query_error}")
    sdf_storage = None

In [None]:
import copy  # kept: the download cell below still calls copy.deepcopy

# End of the download window: the current (naive, local) time.
# NOTE(review): confirm the stored `created` values use the same clock/timezone.
end_date_orig = datetime.datetime.now()
end_date = end_date_orig  # datetime objects are immutable, so no copy is needed

# Start of the window: the newest stored event, or 200 days back when there
# is no usable stored date.
try:
    last_date = sdf_storage['created'].max()
except (TypeError, KeyError, ValueError):
    # No stored frame (None), no `created` column, or values that cannot be compared.
    last_date = None

if isinstance(last_date, pd.Timestamp):
    # pd.NaT (empty or all-null column) is not a Timestamp, so it falls through
    # to the default window instead of poisoning the date arithmetic below.
    start_date_orig = last_date.to_pydatetime()
else:
    start_date_orig = end_date_orig - datetime.timedelta(days=200)
start_date = start_date_orig

##### Download CSV Data for Each Day 

- Downloads the data for each day not in the Feature Layer

In [None]:
# Download the organization's history one day at a time, walking backwards
# from `end_date_orig`, and keep each day's CSV as a DataFrame in `data`.
data = []
days = (end_date_orig - start_date_orig).days
print(days)
if days == 0:
    # Less than one whole day since the start date: still request one day.
    days = 1
# NOTE(review): `.days` truncates, so the partial day beyond `days` whole days
# is never requested, and the days == 0 case requests a full day — confirm
# that neither gaps nor overlaps with stored events matter here.
end_date = end_date_orig
for i in range(1, days + 1):
    start_date = end_date_orig - datetime.timedelta(days=i)

    fp = gis.admin.history(start_date, to_date=end_date, all_events=True, data_format='csv')
    end_date = start_date  # the next (older) window ends where this one started
    try:
        # `infer_datetime_format` is deprecated in pandas 2 and had no effect here.
        data.append(pd.read_csv(fp, parse_dates=True, dtype={'ip': str}))
    finally:
        # Always delete the downloaded file, even if parsing fails.
        os.remove(fp)

##### Merge and Clean up Data

- Perform simple data cleanups to ensure data is readable and useful

In [None]:
# Combine the per-day frames into a single table.
data = data[0] if len(data) == 1 else pd.concat(data)

# `created` is parsed as epoch milliseconds.
data.created = pd.to_datetime(data.created, unit='ms')
# Lower-case every column name except the two reserved upper-case ones.
data.columns = [c if c in ('SHAPE', 'OBJECTID') else c.lower() for c in data.columns]

# Label each event with a time-of-day block (inclusive hour ranges, 0-23).
hour = data['created'].dt.hour
time_blocks = [
    (0, 1, "LATE NIGHT"),
    (2, 5, "OVERNIGHT"),
    (6, 9, "MORNING"),
    (10, 13, "MID-DAY"),
    (14, 17, "AFTERNOON"),
    (18, 21, "EVENING"),
    (22, 23, "LATE NIGHT"),
]
criteria = [hour.between(first, last) for first, last, _ in time_blocks]
blocks = [label for _, _, label in time_blocks]
# The default is the string "0": an integer default shares no dtype with the
# string labels on current NumPy, and older NumPy coerced it to "0" anyway.
data['time_block'] = np.select(criteria, blocks, "0")
# `.dt.weekday_name` was removed in pandas 1.0; `.dt.day_name()` returns the same names.
data['day_of_week'] = data.created.dt.day_name()

# Backup Data
path = r"./data"
os.makedirs(path, exist_ok=True)  # the backup folder may not exist yet
backup_file = datetime.datetime.now().strftime('%Y%m%d_%H_%M_history.csv')
data.to_csv(f"{path}/{backup_file}")

In [None]:
# Restore the window start to the value computed before the download loop.
start_date = start_date_orig
# Keep only events whose IP is a non-blank string, with a fresh 0..n-1 index.
df_ips_only = (
    data.loc[data['ip'].str.strip().str.len().gt(0)]
    .copy()
    .reset_index(drop=True)
)
# Every column except the raw IP, used when previewing the table.
cols_show = [name for name in df_ips_only.columns if name != 'ip']

In [None]:
# Preview the first rows without the IP column.
df_ips_only[cols_show].head()

## GeoLocating IP Addresses

#### Geocoding IPs

- Leverage the `geoip2` Python library

In [None]:
import geoip2
from geoip2 import database
# Open the local GeoLite2 City database file; `reader` is passed to geocode_ip below.
reader = database.Reader(r"./spatial_data/GeoLite2-City.mmdb")

def geocode_ip(ips, reader):
    """Yield one location tuple per IP address, looked up via ``reader.city``.

    Args:
        ips: Iterable of IP address strings.
        reader: Object exposing ``city(ip)``; the notebook passes a
            ``geoip2.database.Reader``.

    Yields:
        ``(ip, continent_name, country_name, country_iso_code, latitude,
        longitude, registered_country_iso_code, postal_code)``. When the
        lookup fails for an address, the seven location fields are ``None``,
        so exactly one tuple is produced per input IP.
    """
    for ip in ips:
        try:
            res = reader.city(ip)
            row = (ip, res.continent.name, res.country.name,
                   res.country.iso_code, res.location.latitude,
                   res.location.longitude, res.registered_country.iso_code,
                   res.postal.code)
        except Exception:
            # Best effort: an address that cannot be resolved still gets a row.
            row = (ip,) + (None,) * 7
        # Yield outside the try block so that closing the generator
        # (GeneratorExit) is never swallowed by the handler above.
        yield row

### Geocode

In [None]:
# Look up each distinct address once and tabulate the results.
ips = df_ips_only['ip'].unique()
columns = ['ip', 'continent', "country", "isocode", "lat", "long", "reg_cntry", "postal"]
records = list(geocode_ip(ips, reader))
df_ips = pd.DataFrame(records, columns=columns)
# Preview every column except the leading IP column.
df_ips.iloc[:, 1:].head()

### GeoSpatial Distribution of IP Addresses

In [None]:
# Build a spatial DataFrame from the long/lat columns and plot it.
# NOTE(review): IPs that failed to geocode carry lat/long of None — confirm from_xy tolerates them.
sdf = pd.DataFrame.spatial.from_xy(df_ips, x_column="long", y_column="lat")
sdf.spatial.plot()

## Data Wrangling

Reshape the data into a format that can be exported to a file geodatabase (FGDB) and uploaded to the portal.

#### Combine the Records with the Geolocations

Use `pd.merge` to combine the two dataframes into a single dataset

In [None]:
# Trim surrounding whitespace from the geocoded IP strings.
# NOTE(review): the merge in the next cell joins against `sdf` (built earlier from
# df_ips) and df_ips_only.ip is not stripped — confirm this strip reaches the join keys.
df_ips.ip = df_ips.ip.str.strip()

In [None]:
# Left-join the location columns onto every event by IP; all event rows are
# kept, and the result gets a fresh 0..n-1 index.
df_j = df_ips_only.merge(sdf, how='left', on='ip').reset_index(drop=True)

#### Clean up field names

In [None]:
import numpy as np
# Give every joined row a sequential, 1-based OBJECTID.
df_j['OBJECTID'] = np.arange(1, len(df_j)+ 1)

In [None]:
# Preview the joined events-plus-locations table.
df_j.head()

#### Examine Event Time Blocks

In [None]:
# Bar chart of event counts per time block. `kind` must be passed by keyword:
# Series.plot rejects positional arguments in pandas >= 1.0.
df_j['time_block'].value_counts().plot(kind='bar')

#### Examine the Spatial Distribution of the Portal's Users

In [None]:
# Bar chart of event counts per country. `kind` must be passed by keyword:
# Series.plot rejects positional arguments in pandas >= 1.0.
sns.set(font_scale=1.4)
(df_j['country']
 .value_counts()
 .plot(kind="bar", cmap='viridis', alpha=0.75, rot=45))

## Detect Outliers

The goal is to find potential odd or uncommon locations that do not reflect common patterns within our data.

### Reduce the Dimensions

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn import decomposition

In [None]:
# Stack the stored events on top of the newly downloaded ones. When nothing
# is stored, use an empty frame with the new table's columns as the stored set.
if sdf_storage is None:
    sdf_storage = pd.DataFrame([], columns=df_j.columns)
    sdf_merged = df_j.copy()
else:
    sdf_merged = pd.concat([sdf_storage, df_j], sort=False)

In [None]:
# Encode the categorical columns as integer labels. The single encoder is
# refit for each column, so afterwards it holds the country classes.
lb_make = LabelEncoder()
sdf_merged['encoded_action'] = lb_make.fit_transform(sdf_merged['action'])
sdf_merged['encoded_day_of_week'] = lb_make.fit_transform(sdf_merged['day_of_week'])
# Missing countries are encoded as the empty string.
sdf_merged['encoded_country'] = lb_make.fit_transform(
    np.where(sdf_merged['country'].isnull(), "", sdf_merged['country'].values)
)

# Reduce location, weekday and country to two principal components.
# 'encoded_action' is computed above but is not part of the PCA input.
pca = decomposition.PCA(n_components=2)
data = sdf_merged[['lat', 'long', 'encoded_day_of_week', 'encoded_country']].values
reduce_n2 = pca.fit(data).transform(data)

### Identify the Outliers

##### Use Isolation Forests

Isolation Forest is an outlier detection technique that isolates anomalies directly instead of modeling what normal observations look like.

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

In [None]:
# Fit the forest on a random training split, then label every row.
# IsolationForest.predict returns -1 for outliers and 1 for inliers.
X_train, X_test = train_test_split(reduce_n2)
# `behaviour` is omitted: it was deprecated and then removed from
# scikit-learn, and passing it raises TypeError on current versions.
clf = IsolationForest(max_samples='auto',
                      contamination=.01)
# The model must be fitted before predict() can be called.
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_outliers = clf.predict(reduce_n2)
sdf_merged['outliers_new'] = y_pred_outliers

In [None]:
# Count how many rows received each prediction label.
sdf_merged.outliers_new.value_counts()

### Prepare Data for Feature Layer

In [None]:
# For the rows that came from storage, compare the previous outlier flag with
# the new prediction; True marks a stored row whose flag changed.
stored_rows = sdf_merged[:len(sdf_storage)]
if 'outliers' in stored_rows.columns:
    previous_flags = stored_rows['outliers'].fillna(0).astype(int)
else:
    # No stored data means no previous `outliers` column; treat every
    # previous flag as 0 instead of failing on the missing column.
    previous_flags = pd.Series(0, index=stored_rows.index, dtype=int)
q_updates = ~(previous_flags == stored_rows['outliers_new'].astype(int))
# The new predictions become the current outlier flags for all rows.
sdf_merged['outliers'] = sdf_merged.outliers_new

In [None]:
# Remove helper columns that were only needed for outlier detection.
# NOTE(review): 'encoded_country' is created alongside these but is not dropped — confirm that is intended.
sdf_merged = sdf_merged.drop(columns=['outliers_new', 'encoded_day_of_week', 'encoded_action'])

### Load Data to Feature Layer

In [None]:
# Split the merged table into features to update (stored rows whose flag
# changed) and features to add (rows beyond the stored ones). OBJECTID is
# removed from the attributes of every feature that will be added.
if sdf_storage is not None and len(sdf_storage) > 0:
    stored_count = len(sdf_storage)
    changed_rows = sdf_merged[:stored_count][q_updates].copy()
    added_rows = sdf_merged[stored_count:].copy()
    fs_updates = [feat for feat in changed_rows.spatial.to_featureset()]
    fs_new = [feat for feat in added_rows.spatial.to_featureset()]
    for feat in fs_new:
        feat.as_dict['attributes'].pop("OBJECTID")
else:
    # Nothing stored: there is nothing to update and every row is new.
    fs_updates = []
    fs_new = [feat.as_dict for feat in sdf_merged.copy().spatial.to_featureset()]
    for feat_dict in fs_new:
        feat_dict['attributes'].pop("OBJECTID")

### Send Updates to Layer

In [None]:
# Send the changed features in batches of 500, storing each response under
# its batch number.
resp = {}
for i, chunk in enumerate(chunks(fs_updates, 500)):
    resp[i] = lyrs.edit_features(updates=chunk)

### Add New Records

In [None]:
# Add the new features in batches of 500, storing each response under its
# batch number.
resp_add = {}
for i, chunk in enumerate(chunks(fs_new, 500)):
    # Send only the current batch. Passing the whole `fs_new` list here would
    # re-add every record once per batch.
    resp_add[i] = lyrs.edit_features(adds=chunk)