In [1]:
import pandas as pd
import numpy as np

In [2]:
data=pd.read_csv('Egypt_terr.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,eventid,iyear,imonth,iday,extended,country,country_txt,region,region_txt,...,weaptype1,weaptype1_txt,nkill,property,ishostkid,dbsource,INT_LOG,INT_IDEO,INT_MISC,INT_ANY
0,133,197003140002,1970,3,14,0,60,Egypt,10,Middle East & North Africa,...,6,Explosives,0.0,1,0.0,PGIS,-9,-9,0,-9
1,1092,197111280001,1971,11,28,0,60,Egypt,10,Middle East & North Africa,...,5,Firearms,1.0,0,0.0,PGIS,1,1,0,1
2,1181,197202190001,1972,2,19,0,60,Egypt,10,Middle East & North Africa,...,6,Explosives,0.0,0,1.0,Hijacking DB,0,1,1,1
3,5603,197712050001,1977,12,5,0,60,Egypt,10,Middle East & North Africa,...,5,Firearms,1.0,0,0.0,PGIS,-9,-9,1,1
4,6797,197809300002,1978,9,30,0,60,Egypt,10,Middle East & North Africa,...,6,Explosives,0.0,1,0.0,PGIS,1,0,1,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2479 entries, 0 to 2478
Data columns (total 45 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        2479 non-null   int64  
 1   eventid           2479 non-null   int64  
 2   iyear             2479 non-null   int64  
 3   imonth            2479 non-null   int64  
 4   iday              2479 non-null   int64  
 5   extended          2479 non-null   int64  
 6   country           2479 non-null   int64  
 7   country_txt       2479 non-null   object 
 8   region            2479 non-null   int64  
 9   region_txt        2479 non-null   object 
 10  provstate         2461 non-null   object 
 11  city              2478 non-null   object 
 12  latitude          2458 non-null   float64
 13  longitude         2458 non-null   float64
 14  specificity       2479 non-null   float64
 15  vicinity          2479 non-null   int64  
 16  crit1             2479 non-null   int64  


In [61]:
def missingvaluesdf():

    missing_values=pd.DataFrame(data.isna().sum())

    filt=(missing_values.iloc[:,0]>0)
    return missing_values[filt]

Let's explore more in detail the missing data

In [5]:
data.loc[data.provstate.isna()==True][['provstate','city']]

Unnamed: 0,provstate,city
5,,Unknown
7,,Alexandria
21,,Giza
58,,Hurghada
66,,Beni Suef
72,,Damietta
109,,Luxor
112,,Faiyum
125,,Port Said
163,,Mir


In [6]:
#Having the provstate for each city would be a great help. Let's see if we can use the rest of the dataset for it
provcity=data[['provstate','city']].dropna(axis=0,how='any')


In [7]:
provcity.drop_duplicates(keep='first',ignore_index=True,inplace=True)

In [8]:
provcity.nunique()

provstate     43
city         327
dtype: int64

In [9]:
dict_city=provcity.set_index('city').to_dict()['provstate']

In [10]:
len(dict_city)

327

In [20]:
#Now we have a dictionary {city: provstate}, we should be able to fill in the missing values in provstate
data['provstate']=data['provstate'].fillna(data['city'].apply(lambda x: dict_city.get(x)))

In [21]:
missing_values=pd.DataFrame(data.isna().sum())

filt=(missing_values.iloc[:,0]>0)
missing_values[filt]

Unnamed: 0,0
provstate,4
city,1
latitude,21
longitude,21
targsubtype1,238
targsubtype1_txt,238
target1,5
nkill,33


In [26]:
#4 values missing now in provstate. Let's see if we can fill them in manually, or if we just drop them
data.loc[data['provstate'].isna()==True][['city','latitude','longitude']]

Unnamed: 0,city,latitude,longitude
163,Mir,27.44169,30.746817
219,Abu Mawas,27.641389,30.849444
245,Izbat Sayk Basha,30.758611,31.735833
248,Idfa,26.571904,31.638356


In [31]:
#4 keys are missing, we will add them manually and update our dictionary
dict_city2={'Mir':'Asyut','Abu Mawas':'Minya','Izbat Sayk Basha':'Al Sharqia','Idfa':'Sohag'}
dict_city.update(dict_city2)

In [32]:
data['provstate']=data['provstate'].fillna(data['city'].apply(lambda x: dict_city.get(x)))

In [33]:
missing_values=pd.DataFrame(data.isna().sum())

filt=(missing_values.iloc[:,0]>0)
missing_values[filt]

Unnamed: 0,0
city,1
latitude,21
longitude,21
targsubtype1,238
targsubtype1_txt,238
target1,5
nkill,33


In [35]:
data.loc[data['city'].isna()==True][['city','latitude','longitude']]

Unnamed: 0,city,latitude,longitude
496,,30.608472,33.617577


In [37]:
#The city is Al Hasna, let's fill it manually
data['city']=data['city'].fillna(value='Al Hasna')

In [38]:

missing_values=pd.DataFrame(data.isna().sum())

filt=(missing_values.iloc[:,0]>0)
missing_values[filt]

Unnamed: 0,0
latitude,21
longitude,21
targsubtype1,238
targsubtype1_txt,238
target1,5
nkill,33


In [40]:
#Let's check latitude and longitude
data.loc[data.latitude.isna()==True]['city']

68             Unknown
76             Unknown
79             Unknown
138        Upper Egypt
214     Southern Egypt
215     Southern Egypt
291         Edka-Hener
292              Hener
299              Esfay
582            Unknown
724            Unknown
725            Unknown
770            Unknown
779            Unknown
1057           Unknown
1086           Unknown
1172           Unknown
1173           Unknown
1610           Unknown
1620           Unknown
1836           Unknown
Name: city, dtype: object

In [43]:
#Even the cities are unknown. However, we don't know for sure it will prevent us from analyzing the dataset. We should maybe first 
#replace with "0" before analyzing locations

data[['latitude','longitude']]=data[['latitude','longitude']].fillna(value=0)

In [44]:
missing_values=pd.DataFrame(data.isna().sum())

filt=(missing_values.iloc[:,0]>0)
missing_values[filt]

Unnamed: 0,0
targsubtype1,238
targsubtype1_txt,238
target1,5
nkill,33


In [50]:
#Let's check targsubtype1 and targsubtype1_txt
data.loc[data.targsubtype1.isna()==True][['targtype1','targtype1_txt','targsubtype1','targsubtype1_txt']]

Unnamed: 0,targtype1,targtype1_txt,targsubtype1,targsubtype1_txt
18,20,Unknown,,
45,2,Government (General),,
111,8,Educational Institution,,
273,17,Terrorists/Non-State Militia,,
276,17,Terrorists/Non-State Militia,,
...,...,...,...,...
2422,20,Unknown,,
2435,20,Unknown,,
2458,20,Unknown,,
2467,20,Unknown,,


In [56]:
data.loc[data.targtype1_txt=='Unknown'][['targtype1','targtype1_txt','targsubtype1','targsubtype1_txt']]['targtype1'].unique()

array([20], dtype=int64)

In [57]:
#The targtype 20 doesn't seem very useful, let's drop it
data=data.loc[data.targtype1!=20]

In [62]:
missingvaluesdf()

Unnamed: 0,0
targsubtype1,27
targsubtype1_txt,27
target1,5
nkill,32


In [63]:
data.shape

(2268, 45)