In [1]:
import re
import pandas as pd
import numpy as np
import gspread
import matplotlib.pyplot as plt
import seaborn as sns
from fuzzywuzzy import fuzz, process

pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')



In [2]:
gc = gspread.service_account(filename='/home/gbotemi/Documents/gbotemi_bolarinwa/project-1/secret.json') # use your google service file account here
file = "https://docs.google.com/spreadsheets/d/1SUlcukpgxf6pfFQbj6DKQZXSlOhyBOSuBdbmQ2ZbhTM/edit?usp=sharing"

sh = gc.open_by_url(file)
worksheet = sh.get_worksheet(1)

In [3]:
df = pd.DataFrame(worksheet.get_all_records())

In [4]:
data = df.copy() # store a copy of the original dataframe

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   S#                       496 non-null    int64 
 1   Date                     496 non-null    object
 2   Islamic Date             496 non-null    object
 3   Blast Day Type           496 non-null    object
 4   Holiday Type             496 non-null    object
 5   Time                     496 non-null    object
 6   City                     496 non-null    object
 7   Latitude                 496 non-null    object
 8   Longitude                496 non-null    object
 9   Province                 496 non-null    object
 10  Location                 496 non-null    object
 11  Location Category        496 non-null    object
 12  Location Sensitivity     496 non-null    object
 13  Open/Closed Space        496 non-null    object
 14  Influencing Event/Event  496 non-null    o

In [6]:
data.head()

Unnamed: 0,S#,Date,Islamic Date,Blast Day Type,Holiday Type,Time,City,Latitude,Longitude,Province,Location,Location Category,Location Sensitivity,Open/Closed Space,Influencing Event/Event,Target Type,Targeted Sect if any,Killed Min,Killed Max,Injured Min,Injured Max,No. of Suicide Blasts,Explosive Weight (max),Hospital Names,Temperature(C),Temperature(F)
0,1,Sunday-November 19-1995,25 Jumaada al-THaany 1416 A.H,Holiday,Weekend,,Islamabad,33.718,73.0718,Capital,Egyptian Embassy,Foreign,High,Closed,,Foreigner,,14.0,15,,60,2,,,15.835,60.503
1,2,Monday-November 6-2000,10 SHa`baan 1421 A.H,Working Day,,,Karachi,24.9918,66.9911,Sindh,office of Nawa-e-Waqt,Office Building,Low,Closed,,Media,,,3,,3,1,,,23.77,74.786
2,3,Wednesday-May 8-2002,25 safar 1423 A.H,Working Day,,7:45 AM,Karachi,24.9918,66.9911,Sindh,Pakistan Navy bus Parked outside Five Star She...,Hotel,Medium,Closed,,Foreigner,Christian,13.0,15,20.0,40,1,2.5 Kg,1.Jinnah Postgraduate Medical Center 2. Civil ...,31.46,88.628
3,4,Friday-June 14-2002,3 Raby` al-THaany 1423 A.H,Working Day,,11:10:00 AM,Karachi,24.9918,66.9911,Sindh,US Consulate Civil Lines Area,Foreign,High,Closed,,Foreigner,Christian,,12,,51,1,,,31.43,88.574
4,5,Friday-July 4-2003,4 Jumaada al-awal 1424 A.H,Working Day,,,Quetta,30.2095,67.0182,Baluchistan,Imambargah MeCongy Road Quetta,Religious,Medium,Closed,during Friday prayer,Religious,Shiite,44.0,47,,65,1,,1.CMH Quetta \n2.Civil Hospital 3. Boland Medi...,33.12,91.616


In [59]:
data = data.replace("", np.nan) # replace empty string with numpy nan values
data = data.replace("N/A", np.nan) # replace N/A string with numpy nan values
data = data.replace("NA", np.nan) # replace NA string with numpy nan values
data = data.replace("None", np.nan) # replace None str with numpy nan values
data.columns = data.columns.str.lower() # lower columns values

In [8]:
data['date'] = pd.to_datetime(data['date'], errors='coerce') # convert the date into a datatime object
data['time'] = pd.to_datetime(data['time'], errors='coerce').dt.time # convert the time into a datatime object and extract the time

In [9]:
# convert the explosive weight (max) object to a float by removing every letters
data["explosive weight (max)"] = data["explosive weight (max)"].replace(r"to", "-", regex=True)
data["explosive weight (max)"] = data["explosive weight (max)"].str.replace(r"[^0-9-]+", "", regex=True)
data["explosive weight (max)"] = data["explosive weight (max)"].replace(r"\d*-", "", regex=True).replace("", np.nan).astype("float")

In [10]:
# clean injured max columns by removing letters and symbols, then convert the output to a float.
data["injured max"] = data["injured max"].replace(r"[^A-z ]", "", regex=True)
data["injured max"] = data["injured max"].replace(r"[A-z]", "", regex=True)
data["injured max"] = data["injured max"].replace("", np.nan).astype("float")

In [11]:
data.columns

Index(['s#', 'date', 'islamic date', 'blast day type', 'holiday type', 'time',
       'city', 'latitude', 'longitude', 'province', 'location',
       'location category', 'location sensitivity', 'open/closed space',
       'influencing event/event', 'target type', 'targeted sect if any',
       'killed min', 'killed max', 'injured min', 'injured max',
       'no. of suicide blasts', 'explosive weight (max)', 'hospital names',
       'temperature(c)', 'temperature(f)'],
      dtype='object')

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 496 entries, 0 to 495
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   s#                       496 non-null    int64         
 1   date                     452 non-null    datetime64[ns]
 2   islamic date             342 non-null    object        
 3   blast day type           486 non-null    object        
 4   holiday type             72 non-null     object        
 5   time                     195 non-null    object        
 6   city                     496 non-null    object        
 7   latitude                 493 non-null    float64       
 8   longitude                493 non-null    float64       
 9   province                 496 non-null    object        
 10  location                 493 non-null    object        
 11  location category        460 non-null    object        
 12  location sensitivity     460 non-nul

## Using Fuzzy to deal with inconsistent object data

In [13]:
data["blast day type"].unique()

array(['Holiday', 'Working Day', nan, 'Weekend'], dtype=object)

In [67]:
def replace_txt_fuzzy(check_str: str, col: str, df: pd.DataFrame) -> pd.Series:
    """
    function to detect words similar to check_str in the column data using fuzzy process.
    replace check_str with the most similar word
    """
    unique_features = df[col].unique().tolist()
    if check_str in unique_features:
        unique_features.remove(check_str)
        
        closest_match = process.extractOne(check_str, unique_features)[0]
        df[col] = df[col].replace(check_str, closest_match)

    return df[col]
    

In [68]:
data["holiday type"].unique() 

array(['Weekend', nan, 'Christmas/birthday of Quaid-e-Azam', 'Ashura',
       'Eid Milad un-Nabi', 'Iqbal Day', 'Eid-ul-azha', 'Labour Day',
       'Eid-ul-Fitar', 'Pakistan Day', 'Defence Day', 'General Elections',
       'Eid Holidays', 'Ashura Holiday'], dtype=object)

In [69]:
holiday_values = ["Eid ul Azha Holiday", "Christmas/ birthday of Quaid-e-Azam"]

for holiday in holiday_values:
    replace_txt_fuzzy(holiday, "holiday type", data)

data["holiday type"].unique()

In [24]:
np.sort(data.city.unique().astype('str'))
# there is a lot of inconsistent values, using the replace_txt_fuzzy func will be useful here

array(['ATTOCK', 'Attock', 'Bajaur Agency', 'Bannu', 'Bhakkar', 'Buner',
       'Chakwal', 'Chaman', 'Charsadda', 'D. I Khan', 'D.G Khan',
       'D.I Khan', 'Dara Adam Khel', 'Dara Adam khel', 'Fateh Jang',
       'Ghallanai, Mohmand Agency ', 'Gujrat', 'Hangu', 'Haripur',
       'Hayatabad', 'Islamabad', 'Jacobabad', 'KURRAM AGENCY', 'Karachi',
       'Karak', 'Khanewal', 'Khuzdar', 'Khyber Agency', 'Kohat',
       'Kuram Agency', 'Lahore', 'Lakki Marwat', 'Lakki marwat',
       'Lasbela', 'Lower Dir', 'MULTAN', 'Malakand', 'Mansehra', 'Mardan',
       'Mohmand Agency', 'Mohmand agency', 'Mosal Kor, Mohmand Agency',
       'Multan', 'Muzaffarabad', 'North Waziristan', 'North waziristan',
       'Nowshehra', 'Orakzai Agency', 'Peshawar', 'Pishin', 'Poonch',
       'Quetta', 'Rawalpindi', 'Sargodha', 'Sehwan town',
       'Shabqadar-Charsadda', 'Shangla', 'Shikarpur', 'Sialkot',
       'South Waziristan', 'South waziristan', 'Sudhanoti', 'Sukkur',
       'Swabi', 'Swat', 'Taftan', 'Tan

In [25]:
city_values = ["ATTOCK", "D.G Khan", "D. I Khan", "Dara Adam khel", "Lakki marwat", "Mohmand agency", "MULTAN", "Mosal Kor, Mohmand Agency", "North waziristan", "South waziristan", ]

for city in city_values:
    replace_txt_fuzzy(city, "city", data)
    
np.sort(data.city.unique().astype('str'))

In [30]:
data.province.unique()

array(['Capital', 'Sindh', 'Baluchistan', 'Punjab', 'Fata', 'KPK', 'AJK',
       'Balochistan'], dtype=object)

In [44]:
replace_txt_fuzzy("FATA", "province", data)
data.province.unique()

array(['Capital', 'Sindh', 'Baluchistan', 'Punjab', 'Fata', 'KPK', 'AJK',
       'Balochistan'], dtype=object)

In [49]:
data["location"].unique()

array(['Egyptian Embassy', 'office of Nawa-e-Waqt',
       'Pakistan Navy bus Parked outside Five Star Sheraton Hotel',
       'US Consulate Civil Lines Area', 'Imambargah MeCongy Road Quetta',
       'Jhanda Chichi area rawalpindi',
       'Yadgar-i-Hussaini in Satellite Town',
       'Karachi Shia mosque Sindh Madrassatul Islam',
       'Imambargah off M.A Jinnah road near Numaish intersection',
       'checkpost in north waziristan-close to Afghanistan border',
       'Near military facility/official Residence of General Officer Commanding',
       'Mr Aziz election rally in jaffar village in fateh jang',
       'zainabia mosque-just a Km away from city police station on Raja road',
       'Jamia Masjid Kashmirian in Mochi Gate', 'Gandava Town Shrine',
       'Baidara village of Matta-District Swat', 'Shrine of Bari Imam',
       'Courtyard of an Imambargah(Madinatul Ilm ) in Gulshan e Iqbal',
       'Hangu-Tall road-Ashura Procession',
       'Infront of USA consulate near marriott

In [36]:
np.sort(data["location category"].unique().astype('str'))

array(['Airport', 'Bank', 'Civilian', 'Commercial/residence',
       'Educational', 'Foreign', 'Foreigner', 'Government',
       'Government Official', 'Government/Office Building', 'Highway',
       'Hospital', 'Hotel', 'Market', 'Military', 'Mobile',
       'Office Building', 'Park/Ground', 'Police', 'Religious',
       'Residence', 'Residential Building', 'Transport', 'foreign', 'nan'],
      dtype='<U26')

In [37]:
location_category_values = ["Foreigner", "Residence", "Government Official", "Government/Office Building", "foreign"]

for location in location_category_values:
    replace_txt_fuzzy(location, "location category", data)
    
np.sort(data["location category"].unique().astype('str'))

In [41]:
data["location sensitivity"].unique()

array(['High', 'Low', 'Medium', nan, 'low'], dtype=object)

In [43]:
replace_txt_fuzzy("low", "location sensitivity", data)
data["location sensitivity"].unique()

array(['High', 'Low', 'Medium', nan], dtype=object)

In [45]:
data["open/closed space"].unique()

array(['Closed', 'Open', 'open', nan, 'closed', 'Open/Closed'],
      dtype=object)

In [46]:
open_closed_values = ["Closed", "Open"]

for open_close in open_closed_values:
    replace_txt_fuzzy(open_close, "open/closed space", data)
    
np.sort(data["open/closed space"].unique().astype('str'))

In [48]:
data["influencing event/event"].unique()

array([nan, 'during Friday prayer',
       "president's/chief of army staff convoy passing from there",
       'maghrib prayer was in process', 'Friday prayer was in progress',
       'Army', 'Election rally', 'Friday prayer',
       'Jamia Masjid /Shia Mosque', 'Urs Ceremony',
       'Shia Muslim Annual \nCongregation ', 'evening prayer',
       'Ashura Day',
       'Terrorists attacked convoy of david foy-american diplomat.',
       'Eid Miladun Nabi', 'Army convoy', 'Recruits parade', 'police van',
       'Military convoy', 'Indian High Commission Function',
       'Muharram procession about to start', 'Muharram procession',
       'Rally Charsadda', 'rammed the vehicle into FC vehicle',
       'Frontier Constabulary convoy', 'recruitment process in progress',
       'police bus going from Kabal to Matta',
       'Chief Justice Iftikhar muhammad ch had to address overthere',
       'chinese engineers were going from Karachi to Hub',
       'military convoy', 'truck carrying security

In [52]:
np.sort(data["target type"].unique().astype("str"))

array(['Anti-Militants', 'Army', 'Children/Women', 'Civilian',
       'Civilian & Police', 'Civilian Judges', 'Foreigner',
       'Frontier Corps', 'Government Official', 'Government official',
       'Judges & lawyers', 'Media', 'Military', 'Police',
       'Police & Rangers', 'Rangers', 'Religious', 'Shia sect', 'Unknown',
       'advocates (lawyers)', 'civilian', 'foreigner', 'nan', 'police',
       'religious'], dtype='<U19')

In [56]:
target_type_values = ["Government official", "foreigner", "advocates (lawyers)", "police", "civilian", "religious"]

for target_type in target_type_values:
    replace_txt_fuzzy(target_type, "target type", data)
    
np.sort(data["target type"].unique().astype('str'))

In [60]:
data["targeted sect if any"].unique()

array([nan, 'Christian', 'Shiite', 'shiite', 'Shiite/sunni', 'Sunni',
       'Jews', 'Ahmedi'], dtype=object)

In [61]:
replace_txt_fuzzy("shiite", "targeted sect if any", data)
data["targeted sect if any"].unique()

array([nan, 'Christian', 'Shiite', 'Shiite/sunni', 'Sunni', 'Jews',
       'Ahmedi'], dtype=object)

In [62]:
data["hospital names"].unique()

array([nan,
       '1.Jinnah Postgraduate Medical Center 2. Civil Hospital Karachi 3. PN Shifa',
       '1.CMH Quetta \n2.Civil Hospital 3. Boland Medical Complex',
       '1.District headquarters \nHospital ', 'Civil hospital',
       'Civil hospital-Liaquat National hospital-Jinnah Postgraduate Medical Center',
       'CMH Kohat',
       '1.District Headquarters hospital-rwp 2.PIMS Hospital 3. Tehsil headquarters hospital-Fateh Jang',
       'Allama Iqbal Memorial DHQ Hospital-Sardar Begum Hospital-and numner of other hospitals in Sialkot-Daska-Gujranwala and Lahore',
       'Mayo Hospital',
       '1. PIMS 2.Polyclinic 3.CDA Hospital 4.federal govt services 5.rwp generl hosp',
       '1. Patel Hospital 2. JMPC', 'Hangu Hospital',
       '1. Liaquat National Hospital 2. Civil Hospital KHI 3. JPMC 4. Abbasi Shaheed Hospital',
       '1. Combined Military 2. Hospital Peshawar',
       '1. CMH Mardan\n2. CMH Peshawar', '1.poly Clinic Hospital',
       '1.Lady Reading hospital', '1.DIK H

In [63]:
data.head()

Unnamed: 0,s#,date,islamic date,blast day type,holiday type,time,city,latitude,longitude,province,location,location category,location sensitivity,open/closed space,influencing event/event,target type,targeted sect if any,killed min,killed max,injured min,injured max,no. of suicide blasts,explosive weight (max),hospital names,temperature(c),temperature(f)
0,1,1995-11-19,25 Jumaada al-THaany 1416 A.H,Holiday,Weekend,NaT,Islamabad,33.718,73.0718,Capital,Egyptian Embassy,Foreign,High,closed,,Foreigner,,14.0,15.0,,60.0,2.0,,,15.835,60.503
1,2,2000-11-06,10 SHa`baan 1421 A.H,Working Day,,NaT,Karachi,24.9918,66.9911,Sindh,office of Nawa-e-Waqt,Office Building,Low,closed,,Media,,,3.0,,3.0,1.0,,,23.77,74.786
2,3,2002-05-08,25 safar 1423 A.H,Working Day,,07:45:00,Karachi,24.9918,66.9911,Sindh,Pakistan Navy bus Parked outside Five Star She...,Hotel,Medium,closed,,Foreigner,Christian,13.0,15.0,20.0,40.0,1.0,25.0,1.Jinnah Postgraduate Medical Center 2. Civil ...,31.46,88.628
3,4,2002-06-14,3 Raby` al-THaany 1423 A.H,Working Day,,11:10:00,Karachi,24.9918,66.9911,Sindh,US Consulate Civil Lines Area,Foreign,High,closed,,Foreigner,Christian,,12.0,,51.0,1.0,,,31.43,88.574
4,5,2003-07-04,4 Jumaada al-awal 1424 A.H,Working Day,,NaT,Quetta,30.2095,67.0182,Baluchistan,Imambargah MeCongy Road Quetta,Religious,Medium,closed,during Friday prayer,Religious,Shiite,44.0,47.0,,65.0,1.0,,1.CMH Quetta \n2.Civil Hospital 3. Boland Medi...,33.12,91.616


In [64]:
data.to_csv("data/modified_inconsistent_data.csv", index=False) # export cleaned data