In [32]:
import os
import tkinter as tk
import pandas as pd
import geopandas as gpd
import folium
import matplotlib.pyplot as plt
from shapely.geometry import mapping
from tkinter import filedialog



In [33]:
# Load the CSV file chosen through a Tkinter file dialog (pattern from the Tkinter user guide).
root = tk.Tk()
root.withdraw()  # hide the empty root window so only the dialog is shown
try:
    file = filedialog.askopenfilename()
finally:
    # Always dispose of the hidden root window; otherwise it stays alive in the kernel.
    root.destroy()

# An empty result means the dialog was cancelled; fail with a clear message
# instead of letting read_csv choke on an empty path.
if not file:
    raise FileNotFoundError("No CSV file was selected in the file dialog.")

# The file is decoded as latin1.
df = pd.read_csv(file, encoding='latin1')

In [34]:
# Preview the loaded dataframe and report its size before any cleaning is done.
print("First 10 rows of the data:")
print(df.iloc[:10])

print("\n-----------------------------\n")

# Take both dimensions at once. The row count is the baseline for seeing how
# many records later cleaning steps remove; the column count shows the width
# of the dataset.
row_count, col_count = df.shape
print("Total number of rows:", row_count)
print("Total number of columns:", col_count)

First 10 rows of the data:
        eventid  iyear  imonth  iday approxdate  extended  resolution  \
0  1.970010e+11   1970       1     0        NaN         0         NaN   
1  1.970030e+11   1970       3    31        NaN         1  03/04/1970   
2  1.971110e+11   1971      11    20        NaN         0         NaN   
3  1.973040e+11   1973       4    25        NaN         0         NaN   
4  1.973080e+11   1973       8    29        NaN         0         NaN   
5  1.974070e+11   1974       7    15        NaN         0         NaN   
6  1.974080e+11   1974       8    15        NaN         0         NaN   
7  1.974110e+11   1974      11    14        NaN         0         NaN   
8  1.974110e+11   1974      11    22        NaN         0         NaN   
9  1.975020e+11   1975       2    28        NaN         0         NaN   

   country  country_txt  region  ... provstate      city   latitude  \
0      101        Japan       4  ...   Fukouka   Fukouka  33.580412   
1      101        Japan    

In [35]:
# Combine the three separate date columns (year, month, day) into a single
# datetime column so later steps only have to work with one field.
date_sources = {'year': 'iyear', 'month': 'imonth', 'day': 'iday'}
if all(column in df for column in date_sources.values()):
    # errors='coerce': parts that do not form a valid calendar date
    # (for example a day of 0) become NaT instead of raising.
    df['date'] = pd.to_datetime(
        {unit: df[column] for unit, column in date_sources.items()},
        errors='coerce',
    )

In [36]:
# Preview the first 10 rows to confirm the combined date column is in the dataframe.
print(df.iloc[:10])

        eventid  iyear  imonth  iday approxdate  extended  resolution  \
0  1.970010e+11   1970       1     0        NaN         0         NaN   
1  1.970030e+11   1970       3    31        NaN         1  03/04/1970   
2  1.971110e+11   1971      11    20        NaN         0         NaN   
3  1.973040e+11   1973       4    25        NaN         0         NaN   
4  1.973080e+11   1973       8    29        NaN         0         NaN   
5  1.974070e+11   1974       7    15        NaN         0         NaN   
6  1.974080e+11   1974       8    15        NaN         0         NaN   
7  1.974110e+11   1974      11    14        NaN         0         NaN   
8  1.974110e+11   1974      11    22        NaN         0         NaN   
9  1.975020e+11   1975       2    28        NaN         0         NaN   

   country  country_txt  region  ...      city   latitude   longitude  \
0      101        Japan       4  ...   Fukouka  33.580412  130.396361   
1      101        Japan       4  ...   Fukouka  33

In [37]:
# Report how many rows (incidents) the dataframe currently holds.
print("\nTotal number of rows:", df.shape[0])


Total number of rows: 839


In [38]:
# Report how many columns the dataframe currently has.
col_count = df.shape[1]
print("Total number of columns:", col_count)

Total number of columns: 22


In [39]:
# Narrow the dataframe to the columns the project actually uses.
columns_to_keep = [
    'region_txt', 'country_txt', 'city',
    'date', 'latitude', 'longitude', 'summary',
]
df = df.loc[:, columns_to_keep]



In [40]:
# Report the column count after the selection step.
col_count = df.shape[1]
print("Total number of columns:", col_count)

# Preview the first 10 rows to confirm only the selected columns remain.
print(df.iloc[:10])

Total number of columns: 7
  region_txt  country_txt      city       date   latitude   longitude summary
0  East Asia        Japan   Fukouka        NaT  33.580412  130.396361     NaN
1  East Asia        Japan   Fukouka 1970-03-31  33.580412  130.396361     NaN
2  East Asia       Taiwan   Unknown 1971-11-20  23.583333  119.583330     NaN
3  East Asia        Japan  Yokosuka 1973-04-25  35.281341  139.672200     NaN
4  East Asia        Japan     Tokyo 1973-08-29  35.689125  139.747742     NaN
5  East Asia        Japan     Itami 1974-07-15  34.784306  135.400947     NaN
6  East Asia  South Korea     Seoul 1974-08-15  37.566535  126.977969     NaN
7  East Asia        Japan     Tokyo 1974-11-14  35.689125  139.747742     NaN
8  East Asia        Japan   Fukouka 1974-11-22  33.580412  130.396361     NaN
9  East Asia        Japan     Tokyo 1975-02-28  35.689125  139.747742     NaN


In [41]:
# A point can only be plotted when both coordinates are present, so discard
# rows where either latitude or longitude is missing.
coordinate_columns = ['latitude', 'longitude']
if all(name in df for name in coordinate_columns):
    df = df.dropna(subset=coordinate_columns)

In [42]:
# Report how many rows (incidents) remain after removing rows without coordinates.
print("\nTotal number of rows:", len(df.index))


Total number of rows: 839


In [43]:
# Normalise text to make later string handling easier: every object-dtype
# column is lowercased and has leading/trailing whitespace stripped.
# Columns of any other dtype are left untouched.
text_columns = [name for name in df.columns if df[name].dtype == object]
for name in text_columns:
    lowered = df[name].str.lower()
    df[name] = lowered.str.strip()

In [44]:
# Drop rows that are exact duplicates across all remaining columns.
# Reassigning (rather than inplace=True) matches the other cleaning cells and
# avoids mutating a frame that was derived by slicing an earlier one.
df = df.drop_duplicates()

# Report the row count again; the difference from the previous count is the
# number of duplicate records that were removed.
print("\nTotal number of rows:", len(df))


Total number of rows: 694
