In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

  from collections import Sequence


### 1. Importing, Cleaning and General Overview:

In [2]:
df = pd.read_excel('globalterrorismdb_0718dist.xlsx', 
                   usecols = 'A:D, F, H:M, S:W, AA:AD, AI:AN, AP, BG, BM:BN, BQ:BR, CD:CG, DA',
                   dtype = {'summary':str, 'motive':str})                                                                                         

In [3]:
# Renaming for usability:

df = pd.DataFrame(df.rename({'eventid':'event_id', 'iyear':'year', 'imonth':'month',
             'iday':'day', 'extended':'24+hrs', 'provstate':'region_2', 'doubtterr':'doubt', 
             'attacktype1':'attack_1', 'attacktype1_txt':'attack_1txt','attacktype2':'attack_2', 
             'targtype1':'target_1', 'targtype1_txt':'target_1txt', 'targsubtype1':'sub_target',
             'targsubtype1_txt':'sub_targettxt', 'target1':'specific_target',
             'natlty1':'victim_nationality', 'natlty1_txt':'victim_nationalitytxt', 'gname':'group_name', 
             'guncertain1':'group_attrib_crtainty', 'individual':'unaffil_individ', 'weaptype1':'weapon', 
             'weaptype1_txt':'weapontxt', 'weapsubtype1':'sub_weapon', 'weapsubtype1_txt':'sub_weapontxt'}, axis = 1))

In [4]:
# Cleaning up some of the null values in our object-type columns:
# (not dropping them but setting them to 'No Data'):

df['motive'] = df['motive'].where(df['motive'].apply(lambda x: x != 'nan'), other = 'No Data')
df['summary'] = df['summary'].where(df['summary'].apply(lambda x: x != 'nan'), other = 'No Data')

### 2. Preliminary Exploration and Visualization:

##### A. Isolating relevant data - Regex:

> Here, we want to try to isolate the rows whose summaries relate directly to what we want to analyze:

In [25]:
# Creating the dyber_df Dataframe:
# Regex to isolate qualifying data (This set is more flexible in its Regex):
import re
import gc

print(df.summary.str.contains('cyb+|social|inform+|network+|twit+|facebook|fake|electio+|internet|ads|adv+|priv+|hac+|breac+|emai+',
                         flags = re.IGNORECASE, regex = True).value_counts())

False    172205
True       9486
Name: summary, dtype: int64


In [26]:
# Creating a mask which represents the results from our search above:

mask = df.summary.str.contains(r'(cyb+|social|inform+|network+|twit+|facebook|fake|electio+|internet|ads|adv+|priv+|hac+|breac+|emai+)', 
                           flags = re.IGNORECASE)

# Creating a new, filtered dataframe with this specific information:

cyber_df = pd.DataFrame(df[mask])

### 3. Wranglin':

##### Dropping inconsequential NA Values:

In [27]:
# Since making dummies of the entire dataframe didn't quite work, looking at doing some customized dummy columns
# and then using the get_dummies method for the rest:

# Running into NA issues, so dropping na where na values are under 50:

cyber_df = pd.DataFrame(cyber_df.dropna(subset = ['city', 'group_attrib_crtainty',
                                                        'specific_target']))

##### Wrangling the more considerable nulls:

In [28]:
# Taking out the NA the mean of the remaining values and 
# setting the values we did not consider with the mean of the others: 

# sub_target:

sub_target_mean = cyber_df['sub_target'].mean(skipna = True)

cyber_df['sub_target'] = cyber_df['sub_target'].fillna(sub_target_mean, inplace = True)

# sub_targettxt:
# Following the same pattern as above:

cyber_df['sub_targettxt'] = cyber_df['sub_targettxt'].fillna('No Data')

# victim_nationality:

cyber_df['victim_nationality'] = cyber_df['victim_nationalitytxt'].fillna('No Data')

# sub_weapon and sub_weapontxt:
# Filling in the NA with the mean:

sub_weapon_mean = cyber_df['sub_weapon'].mean(skipna = True)

cyber_df['sub_weapon'] = cyber_df['sub_weapon'].fillna(sub_weapon_mean, inplace = True)

# Filling the NA with 'No Data'

cyber_df['sub_weapontxt'] = cyber_df['sub_weapontxt'].fillna('No Data')

#### Manual Dummies:

> Still a lot of unique values - will try dropping the ones with unmanageable numbers for our model. Will try grouping with the others

##### weapon and weapon_txt:

In [29]:
# Weapon:
# Grouping related values together after looking at value-counts:

# Grouping Chemical and Biological Weapons together:
cyber_df['weapon'] = cyber_df['weapon'].replace(1, 2)

# Putting Fake and 'Other' types of weapons into the same category:
cyber_df['weapon'] = cyber_df['weapon'].replace(12, 7)

# Putting vehicles into category 11 (sabotage equipment):
cyber_df['weapon'] = cyber_df['weapon'].replace(10, 11)

In [30]:
# Weapon_txt:

# Grouping Biological and Chemical Weapons:
cyber_df['weapontxt'] = cyber_df['weapontxt'].replace('Chemical', 'Bio-Chem')
cyber_df['weapontxt'] = cyber_df['weapontxt'].replace('Biological', 'Bio-Chem')

# Grouping Fake and Other together:
cyber_df['weapontxt'] = cyber_df['weapontxt'].replace('Fake Weapons', 'Fake/Other')
cyber_df['weapontxt'] = cyber_df['weapontxt'].replace('Other', 'Fake/Other')

# Aggregating Vehicle into Sabotage Equipment:
cyber_df['weapontxt'] = cyber_df['weapontxt'].replace(
    'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)', 'Sabotage Equipment')


##### sub_target and sub_targettxt

In [31]:
# Sub_target:

# I didn't want to isolate all of this, else it would reflect the target column exactly - 
# but made some sub-groupings:

industrial = [1, 5, 9, 10, 12]
white_collar = [2, 3, 4, 6, 7, 8, 11, 13, 112]
gov_figure1 = [14, 18]
gov_figure2 = [15, 16, 17, 19]
police_facilities = [22, 23, 24, 26]
mil_facilities = [27, 28, 30, 31, 32, 35, 36, 39]
mil_personnel = [29, 33, 34]
pilots_planes = [42, 43]
gov_diplomatic = [45, 46, 47]
educational = [48, 49, 50]
food_water = [51, 52]
media = [53, 54, 55, 56]
religious = [69, 85, 86, 87]
political = [83, 84, 109, 110, 111]
mass_socio = [65, 68, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105]
util_telecom = [88, 89, 90, 91, 92, 106, 107, 108]

# Sub_target Assignment:

cyber_df['sub_target'].replace({'sub_target': {'industrial':1, 'white_collar':2, 
                                       'gov_figure1':14,'gov_figure2':15, 
                                       'police_facilities':22, 'mil_facilities':27,
                                       'mil_personnel':29, 'pilots_planes':42, 
                                       'gov_diplomatic':45, 'educational': 48, 
                                       'food_water':51, 'media':53, 'religious':69,
                                       'political':83, 'mass_socio': 65,
                                       'util_telecom': 88}}, inplace = True)

In [33]:
# Sub_targettxt:

industrial = ['Gas/Oil/Electric', 'Industrial/Textiles/Factory', 'Farm/Ranch', 'Mining', 'Construction']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in industrial), other = 'Industrial')


white_collar = ['Restaurant/Bar/Café', 'Bank/Commerce', 'Multinational Corporation', 
                'Medical/Pharmaceutical', 'Retail/Grocery/Bakery (including cell phone shops and generic shops)', 
                'Hotel/Resort', 'Entertainment/Cultural/Stadium/Casino', 'Private Security Company/Firm',
                'Legal Services']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in white_collar), other = 'White_Collar')


gov_figure1 = ['Judges/Attorneys/Courts', 'Government Personnel (excluding police, military)']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in gov_figure1), other = 'Gov_Figure1')

gov_figure2 = ['Politician or Political Party Movement/Meeting/Rally', 'Royalty', 'Head of State',
               'Election-Related']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in gov_figure2), other = 'Gov_Figure2')


police_facilities = ['Police Buildings (Headquarters/Stations/School)', 
                     'Police Patrol (including vehicles and convoys)', 
                     'Police Checkpoint', 'Prison/Jail']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in gov_figure2), 
                                                other = 'Police_Facilities')

military_facilities = ['Military Barracks/Base/Headquarters/Checkpost', 'Military Recruiting Station/Academy',
                       'Military Weaponry', 'Military Aircraft', 'Military Maritime', 
                       'Military Transportation/Vehicle (excluding convoys)', 'Military Checkpoint',
                       'Paramilitary']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in military_facilities), 
                                                other = 'Military_Facilities')


military_personnel = ['Military Unit/Patrol/Convoy', 'Non-combatant Personnel', 
                      'Military Personnel (soldiers, troops, officers, forces)']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in military_personnel), 
                                                other = 'Military_Personnel')


pilots_aircraft = ['Aircraft (not at an airport)', 'Airline Officer/Personnel']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in military_personnel), 
                                                other = 'Pilots_Aircraft')

gov_diplomatic = [' Diplomatic Personnel (outside of embassy, consulate)', 'Embassy/Consulate', 
                  'International Organization (peacekeeper, aid agency, compound)']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in gov_diplomatic), 
                                                other = 'Gov_Diplomatic')

educational = ['Teacher/Professor/Instructor', 'School/University/Educational Building', 
               'Other Personnel']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in educational), 
                                                other = 'Educational')

food_water = ['Food Supply', 'Water Supply']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in food_water), 
                                                other = 'Food_Water')

media = ['Newspaper Journalist/Staff/Facility', 'Radio Journalist/Staff/Facility', 
         'Television Journalist/Staff/Facility', 'Other (including online news agencies)']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in media), 
                                                other = 'Media')


religious = ['Religion Identified', 'Religious Figure', 'Place of Worship', 
             'Affiliated Institution']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in military_personnel), 
                                                other = 'Pilots_Aircraft')


political = ['Protester', 'Political Party Member/Rally', 'Party Official/Candidate/Other Personnel', 
             'Party Office/Facility', 'Rally']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in political), 
                                                other = 'Political')

mass_socio = ['Refugee (including Camps/IDP/Asylum Seekers)', 'Named Civilian', 'Student', 
              'Race/Ethnicity Identified', 'Farmer', 'Vehicles/Transportation', 'Marketplace/Plaza/Square', 'Village/City/Town/Suburb', 
              'House/Apartment/Residence', 'Laborer (General)/Occupation Identified', 'Procession/Gathering (funeral, wedding, birthday, religious)', 
              'Public Areas (e.g., Public garden, parking lot, garage, beach, public buildings, camps)',
             'Memorial/Cemetery/Monument', 'Museum/Cultural Center/Cultural House', 'Labor Union Related', 
              'Tourism Travel Agency', 'Tour Bus/Van/Vehicle', 'Tourist', 'Other Facility', 'Train/Train Tracks/ Trolley', 'Bus Station/Stop', 'Subway', 
              'Bridge/Car Tunnel', 'Highway/Road/Toll/Traffic Signal', 'Taxi/Rickshaw']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in mass_socio), 
                                                other = 'Mass_Social')



util_telecom = ['Radio', 'Television', 'Telephone/Telegraph', 'Internet Infrastructure', 
                'Multiple Telecommunication Targets', 'Gas', 'Electricity', 'Oil']
df['sub_targettxt'] = df['sub_targettxt'].where(df['sub_targettxt'].apply(lambda x: x not in util_telecom), 
                                                other = 'Util_Telecom')

In [38]:
# Trying Group_name a bit and grouping them by Ideaology.
# Separatists, Leftists and Rightists:

# Palestinian Separatists:
cyber_df['group_name'] = cyber_df['group_name'].replace('Hamas (Islamic Resistance Movement)', 
                                                        'Palestinian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Palestinian Islamic Jihad (PIJ)', 
                                                        'Palestinian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Popular Front for the Liberation of Palestine (PFLP)',
                                                        'Palestinian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Popular Resistance Committees',
                                                        'Palestinian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Fatah',
                                                        'Palestinian_Separatists')

# Militants:
cyber_df['group_name'] = cyber_df['group_name'].replace('Militants',
                                                        'Militant_Gunmen_Groups')
cyber_df['group_name'] = cyber_df['group_name'].replace('Gunmen',
                                                        'Militant_Gunmen_Groups')

In [39]:
# Asian Separatists:

cyber_df['group_name'] = cyber_df['group_name'].replace('Abu Sayyaf Group (ASG)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Colonel Karuna Faction',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Eastern Turkistan Islamic Movement (ETIM)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Free Aceh Movement (GAM)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Janatantrik Terai Mukti Morcha (Jtmm)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Janatantrik Terai Mukti Morcha- Goit (Jtmm-G)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Janatantrik Terai Mukti Morcha- Jwala Singh (Jtmm-J)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Janatantrik Terai Mukti Morcha- Rajan Mukti (Jtmm-R)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Liberation Tigers of Tamil Eelam (LTTE)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Moro Islamic Liberation Front (MILF)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Runda Kumpulan Kecil (Rkk)',
                                                        'Asian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Terai Army',
                                                        'Asian_Separatists')


In [54]:
# Middle Eastern Separatists
cyber_df['group_name'] = cyber_df['group_name'].replace('Supreme Council For Islamic Revolution In Iraq (Sciri)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Southern Mobility Movement (Yemen)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Riyadus-Salikhin Reconnaissance And Sabotage Battalion Of Chechen Martyrs',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Haqqani Network',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Harkatul Jihad-E-Islami',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Hizbul Mujahideen (Hm)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Jaish-E-Mohammad (Jem)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Kurdistan Free Life Party',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Kurdistan Workers Party (PKK)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Lashkar-E-Balochistan',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Lashkar-E-Taiba (Let)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Ansar Al-Islam',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Ansar Al-Sharia (Libya)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Baloch Liberation Army (BLA)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Baloch Liberation Front (Blf)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Baloch Republican Army (BRA)',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Caucasus Emirate',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Free Syrian Army',
                                                        'Middle_Eastern_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Chechen Rebels ',
                                                        'Middle_Eastern_Separatists')

In [55]:
# Indian Separatists
cyber_df['group_name'] = cyber_df['group_name'].replace('Black Widows',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Dima Halao Daoga (Dhd)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Garo National Liberation Army',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Kangleipak Communist Party (KCP)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Karbi Longri North Cachar Liberation Front (Klnlf)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('National Democratic Front of Bodoland (NDFB)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('National Liberation Front of Tripura (NLFT)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('National Socialist Council of Nagaland-Isak-Muivah (NSCN-IM)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('People\'s Liberation Army (PLA)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('People\'s Revolutionary Party of Kangleipak (PREPAK)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('United Liberation Front of Assam (ULFA)',
                                                        'Indian_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('United National Liberation Front (UNLF)',
                                                        'Indian_Separatists')

In [56]:
# Irish Separatists
cyber_df['group_name'] = cyber_df['group_name'].replace('Real Irish Republican Army (RIRA)',
                                                        'Irish_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Oglaigh Na Heireann',
                                                        'Irish_Separatists')
cyber_df['group_name'] = cyber_df['group_name'].replace('Irish Republican Army (IRA)',
                                                        'rish_Separatists')


In [57]:
# Farc left and right
cyber_df['group_name'] = cyber_df['group_name'].replace('National Liberation Army of Colombia (ELN)',
                                                        'FARC_Leftist')
cyber_df['group_name'] = cyber_df['group_name'].replace('Popular Liberation Army (EPL)',
                                                        'FARC_Leftist')
cyber_df['group_name'] = cyber_df['group_name'].replace('Revolutionary Armed Forces of Colombia (FARC)',
                                                        'FARC_Leftist')

cyber_df['group_name'] = cyber_df['group_name'].replace('United Self Defense Units of Colombia (AUC)',
                                                        'FARC_Rightist')
  

In [58]:
# Middle Eastern Religious:
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Gama\'at Al-Islamiyya (IG)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Nusrah Front',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Qa\'ida',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Qa\'ida in the Arabian Peninsula (AQAP)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Qa\'ida in the Lands of the Islamic Maghreb (AQLIM)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Al-Shabaab',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Ansar Al-Islam',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Ansar Al-Sharia (Libya)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Asa\'Ib Ahl Al-Haqq',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Caucasus Emirate',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Eritrean Islamic Jihad Movement (EIJM)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Great Eastern Islamic Raiders Front (Ibda-C)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Hizbul Al Islam (Somalia)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Islamic Courts Union (ICU)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Islamic State of Iraq and al Sham (ISIS)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Islamic Movement of Uzbekistan (IMU)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Jamiat Ul-Mujahedin (Jum)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Jundallah',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Mahdi Army',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Taliban',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Tehrik-i-Taliban Pakistan (TTP)',
                                                        'Middle_Eastern_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Muslim extremists ',
                                                        'Middle_Eastern_Religious')


In [59]:
# Asian and African Religious:

cyber_df['group_name'] = cyber_df['group_name'].replace('Students Islamic Movement of India (Simi)',
                                                        'Asian/Asian-Pacific_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Ranbir Sena',
                                                        'Asian/Asian-Pacific_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Jemaah Islamiya (JI)',
                                                        'Asian/Asian-Pacific_Religious')


cyber_df['group_name'] = cyber_df['group_name'].replace('Movement for Oneness and Jihad in West Africa (MUJAO)',
                                                        'African_Religious')
cyber_df['group_name'] = cyber_df['group_name'].replace('Lord\'s Resistance Army (LRA)',
                                                        'African_Religious')


In [60]:
cyber_df['group_name'].value_counts()

Unknown                                                                       5266
Middle_Eastern_Religious                                                       948
Communist Party of India - Maoist (CPI-Maoist)                                 400
Maoists                                                                        231
Islamic State of Iraq and the Levant (ISIL)                                    190
Indian_Separatists                                                             166
FARC_Leftist                                                                   125
Middle_Eastern_Separatists                                                     114
New People's Army (NPA)                                                        111
Boko Haram                                                                      88
Asian_Separatists                                                               86
Al-Qaida in Iraq                                                                79
Kurd

In [None]:
# Trying City Names by Religion or Tribal Associations:
sunni_cities = ['Mosul', 'Kirkuk', 'Sanandaj', 'Ramadi', 'Trabzone', 'Diarbekir', 
                'Damascus', 'Gwadar', 'Zahedan', 'Kandahar', 'Khiva']
cyber_data1['city'] = cyber_data1['city'].apply(lambda x: x not in sunni_cities), 
                                                other = 'Sunni_Cities')

In [None]:
cyber_data1['city'] = cyber_data1['city'].str.replace(r'''(?isx)(.*)|(.*)|(.*)|()|
                                    (.*)|()|(.*)|(.*)|(.*)|
                                    (.*)|(.*)(.*)|(.*)''', 'Shia Cities')

In [None]:
cyber_data1['city'] = cyber_data1['city'].str.replace(r'''(?isx)()|()|()|()|
                                    (.*)|(.*)|(.*)|(.*)|
                                    (.*)''', '')

In [None]:
cyber_data1['city'] = cyber_data1['city'].str.replace(r'''(?isx)()|()|()|()|
                                    (.*)|(.*)|(.*)|(.*)|
                                    (.*)''', '')

In [None]:
cyber_data1['city'] = cyber_data1['city'].str.replace(r'''(?isx)()|()|()|()|
                                    (.*)|(.*)|(.*)|(.*)|
                                    (.*)''', '')