# Data Cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load the Data of Terrorism dataset

In [2]:
df = pd.read_csv('data_of_terrorism.csv')
df.drop('Unnamed: 0.1', inplace=True, axis=1)
df.drop('Unnamed: 0', inplace=True, axis=1)
df.drop('Part of Multiple Incident?', inplace=True, axis=1)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 28 columns):
CITY                                         30000 non-null object
COUNTRY                                      30000 non-null object
Claimed Responsibility                       30000 non-null object
DATE                                         30000 non-null object
FATALITIES                                   30000 non-null object
Group Name of Perpetrator Group              30000 non-null object
Group Sub Name of Perpetrator Group          30000 non-null object
Hostages                                     30000 non-null object
INJURED                                      30000 non-null object
LOCATION DETAILS                             30000 non-null object
Name of Entity                               30000 non-null object
Nationality of Target                        30000 non-null object
Number of Perpetrator Fatalities             30000 non-null object
Number of Perpetrato

In [4]:
df = df.replace('Unknown', np.nan)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 28 columns):
CITY                                         28173 non-null object
COUNTRY                                      30000 non-null object
Claimed Responsibility                       30000 non-null object
DATE                                         30000 non-null object
FATALITIES                                   27985 non-null object
Group Name of Perpetrator Group              15252 non-null object
Group Sub Name of Perpetrator Group          1261 non-null object
Hostages                                     29963 non-null object
INJURED                                      27017 non-null object
LOCATION DETAILS                             13435 non-null object
Name of Entity                               23581 non-null object
Nationality of Target                        29847 non-null object
Number of Perpetrator Fatalities             29275 non-null object
Number of Perpetrator

In [6]:
print("Inside the data table there are", df.duplicated().sum() ,"duplicates")

Inside the data table there are 1386 duplicates


In [7]:
df = df.drop_duplicates().reset_index(drop=True).copy()

In [8]:
df.shape

(28614, 28)

### Check the amount of missing values in each column

In [9]:
for c in df.columns:
    missing = df[c].isna().sum()
    print("in column {} there are {:.3f}% missing values".format(c, missing*100/len(df)))

in column CITY there are 6.060% missing values
in column COUNTRY there are 0.000% missing values
in column Claimed Responsibility there are 0.000% missing values
in column DATE there are 0.000% missing values
in column FATALITIES there are 6.790% missing values
in column Group Name of Perpetrator Group there are 48.948% missing values
in column Group Sub Name of Perpetrator Group there are 95.645% missing values
in column Hostages there are 0.101% missing values
in column INJURED there are 10.128% missing values
in column LOCATION DETAILS there are 54.033% missing values
in column Name of Entity there are 20.619% missing values
in column Nationality of Target there are 0.531% missing values
in column Number of Perpetrator Fatalities there are 2.523% missing values
in column Number of Perpetrators there are 87.471% missing values
in column PERPETRATOR GROUP there are 48.948% missing values
in column PROVINCE/ADMINISTRATIVE REGION/U.S. STATE there are 0.706% missing values
in column Prop

### Create dictionary of terrorist cities

In [10]:
Terrorist_cities = dict()
for row in range(df.shape[0]):
    if (type(df.CITY[row]) ==  type("string")):                                    #   df.CITY[row] != np.nan
        Terrorist_cities.update({df.COUNTRY[row] : df.CITY[row]})
Terrorist_cities
        

{'Algeria': 'Bordj Menaiel',
 'Bahrain': 'Demistan',
 'Egypt': 'Arish',
 'Iran': 'Zabol',
 'Iraq': 'Mosul',
 'Israel': 'Afula',
 'Jordan': 'Maan',
 'Kuwait': 'Kuwait',
 'Lebanon': 'Jlala',
 'Libya': 'Benghazi',
 'Morocco': 'Casablanca',
 'Qatar': 'Doha',
 'Saudi Arabia': 'Hafr al-Batin',
 'Syria': 'Damascus',
 'Tunisia': 'Sousse',
 'Turkey': 'Cizre',
 'United Arab Emirates': 'Abu Dhabi',
 'West Bank and Gaza Strip': 'Huwwara',
 'Western Sahara': 'Tifariti',
 'Yemen': 'Ahwar'}

In [11]:
for row in range(df.shape[0]):
    if (type(df.CITY[row]) !=  type("string")):                                       #   df.CITY[row] == np.nan
        df.CITY[row] = Terrorist_cities[df.COUNTRY[row]]
        

In [12]:
df =df.drop(columns=['Total Number of Fatalities','Total Number of Injured','Number of Perpetrators',"Group Sub Name of Perpetrator Group"])


In [13]:
replace_map = {'No':0, 'Yes':1}

In [14]:
df.Hostages.replace(replace_map,inplace = True)

In [15]:
df.Hostages = df.Hostages.fillna(df.Hostages.mode()[0])

In [16]:
df.Ransom.replace(replace_map,inplace = True)

In [17]:
df.Ransom = df.Ransom.fillna(df.Ransom.mode()[0])

In [18]:
df["Property Damage"].replace(replace_map,inplace = True)

In [19]:
df["Property Damage"] = df["Property Damage"].fillna(df["Property Damage"].mode()[0])

In [20]:
df.FATALITIES = df.FATALITIES.fillna(df.FATALITIES.median())

In [21]:
df.INJURED  = df.INJURED.fillna(df.INJURED.median())

In [22]:
df['Number of Perpetrator Fatalities'] = df['Number of Perpetrator Fatalities'].fillna(df['Number of Perpetrator Fatalities'].median())

In [23]:
df["TARGET TYPE"] = df["TARGET TYPE"].fillna(df["TARGET TYPE"].mode()[0])

In [24]:

for row in range(df.shape[0]):
    if ( type(df["Nationality of Target"][row]) != type("string")):                                       #   df["Nationality of Target"][row] == np.nan
        df["Nationality of Target"][row] = df.COUNTRY[row]

# df["Nationality of Target"][row] = df["Nationality of Target"].fillna(df.COUNTRY[row])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [25]:
df["Specific Description"] = df["Specific Description"].fillna(df["Specific Description"].mode()[0])

In [26]:
df["Property Damage"] = df["Property Damage"].fillna(df["Property Damage"].mode()[0])

In [27]:
df["Name of Entity"] = df["Name of Entity"].fillna(df["Name of Entity"].mode()[0])

In [28]:
df["Type of Attack"] = df["Type of Attack"].fillna(df["Type of Attack"].mode()[0])

In [29]:
df["Weapon Sub_type"] = df["Weapon Sub_type"].fillna(df["Weapon Sub_type"].mode()[0])

In [30]:
df["Weapon Type"] = df["Weapon Type"].fillna(df["Weapon Type"].mode()[0])

In [31]:
for row in range(df.shape[0]):
    if ( type(df["PROVINCE/ADMINISTRATIVE REGION/U.S. STATE"][row]) != type("string")):                                       #   df["Nationality of Target"][row] == np.nan
        df["PROVINCE/ADMINISTRATIVE REGION/U.S. STATE"][row] = df.COUNTRY[row]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28614 entries, 0 to 28613
Data columns (total 24 columns):
CITY                                         28614 non-null object
COUNTRY                                      28614 non-null object
Claimed Responsibility                       28614 non-null object
DATE                                         28614 non-null object
FATALITIES                                   28614 non-null object
Group Name of Perpetrator Group              14608 non-null object
Hostages                                     28614 non-null float64
INJURED                                      28614 non-null object
LOCATION DETAILS                             13153 non-null object
Name of Entity                               28614 non-null object
Nationality of Target                        28614 non-null object
Number of Perpetrator Fatalities             28614 non-null object
PERPETRATOR GROUP                            14608 non-null object
PROVINCE/ADMINISTRA

In [33]:
df.to_csv('data_of_terrorism_after_cleaning.csv')
