In [1]:
import pandas as pd
import numpy as np

# Terror data

In [48]:
terror_encoding = 'ISO-8859-1'
df_terror = pd.read_csv('src/data/globalterrorism_2020.csv', encoding=terror_encoding, low_memory=False)

In [49]:
def clean_date(df):
    df[df['imonth'] == 0] = None
    df[df['iday'] == 0] = None
    df_cleaned = df.drop(columns=['approxdate', 'extended', 'resolution'])
    return df_cleaned

print(df_terror.shape)
df_terror = clean_date(df_terror)
print(df_terror.shape)

(209706, 135)
(209706, 132)


In [50]:
def clean_incident_information(df):
    df_cleaned = df.drop(columns=['doubtterr', 'alternative', 'alternative_txt'])

    # check all criterias are met
    criterion = lambda a, b, c: df_cleaned[(df_cleaned['crit1'] == a) & (df_cleaned['crit2'] == b) & (df_cleaned['crit3'] == c)].shape[0]
    assert (criterion(0, 0, 0) == 0 
            & criterion(1, 0, 0) == 0 
            & criterion(0, 1, 0) == 0 
            & criterion(0, 0, 1) == 0), 'At least 2 criterias must be satisfied'

    return df_cleaned

print(df_terror.shape)
df_terror = clean_incident_information(df_terror)
print(df_terror.shape)

(209706, 132)
(209706, 129)


In [51]:
def clean_incident_location(df):
    # remove all attacks without/invalid location
    df = df.query('-90 <= latitude <= 90 & -180 <= longitude <= 180')

    # keep all attacks without location but set invalid location to None
    #df[(df['latitude'] < -90) | (df['latitude'] > 90)] = None
    #df[(df['longitude'] < -180) | (df['longitude'] > 180)] = None

    df_cleaned = df.drop(columns=['provstate', 'city', 'vicinity', 'location', 'specificity'])
    return df_cleaned

print(df_terror.shape)
df_terror = clean_incident_location(df_terror)
print(df_terror.shape)

(209706, 129)
(204210, 124)


In [52]:
def clean_attack_information(df):
    df_cleaned = df.drop(columns=['attacktype2', 'attacktype2_txt', 'attacktype3', 'attacktype3_txt'])
    return df_cleaned

print(df_terror.shape)
df_terror = clean_attack_information(df_terror)
print(df_terror.shape)

(204210, 124)
(204210, 120)


In [53]:
def clean_weapon_information(df):
    # drop non-primary weapon information
    df_cleaned = df.drop(columns=['weaptype2', 'weaptype2_txt', 'weapsubtype2', 'weapsubtype2_txt',
                                  'weaptype3', 'weaptype3_txt', 'weapsubtype3', 'weapsubtype3_txt',
                                  'weaptype4', 'weaptype4_txt', 'weapsubtype4', 'weapsubtype4_txt'])
    
    # keep weapon details?
    #df_cleaned = df_cleaned.drop(columns=['weapdetail'])
    return df_cleaned

print(df_terror.shape)
df_terror = clean_weapon_information(df_terror)
print(df_terror.shape)

(204210, 120)
(204210, 108)


In [None]:
# mangler
def clean_target_victim_information(df):
    df_cleaned = df
    return df_cleaned

print(df_terror.shape)
df_terror = clean_target_victim_information(df_terror)
print(df_terror.shape)

In [None]:
# mangler
def clean_perpetrator_information(df):
    df_cleaned = df
    return df_cleaned

print(df_terror.shape)
df_terror = clean_perpetrator_information(df_terror)
print(df_terror.shape)

In [None]:
# mangler
def clean_casualities_and_consequences(df):
    df_cleaned = df
    return df_cleaned

print(df_terror.shape)
df_terror = clean_casualities_and_consequences(df_terror)
print(df_terror.shape)

In [None]:
# mangler
def clean_additional_information_and_sources(df):
    df_cleaned = df
    return df_cleaned

print(df_terror.shape)
df_terror = clean_additional_information_and_sources(df_terror)
print(df_terror.shape)

In [None]:
df_terror.to_csv('src/data/globalterrorism_2020_cleaned.csv', encoding=terror_encoding)

# Country data