In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import hvplot.pandas

# Extend width of the results
pd.set_option('max_colwidth', 400)

### Bear Attacks data retrieval

In [2]:
# Read the data into a Pandas DataFrame
bear_data_raw_df = pd.read_csv("Resources/source/bear_attacks.csv")
bear_data_raw_df.head()

Unnamed: 0,Date,Location,Details,Bear,Latitude,Longitude,Name,Age,Gender
0,"August 23, 2018","Lyon Inlet, Nunavut","Three men from Naujaat, whose boat had broken down, were having tea on the morning of August 23 when a female bear and a cub surprised them. Leo Ijjangiaq fired a rifle to scare the bear but it attacked Laurent Junior Uttak before killing Darryl Kaunak. The mother and cub were killed. During the next three days, prior to being rescued, more bears approached the two survivors and at least one m...",Polar,66.53416,-83.88217,Darryl Kaunak,33,male
1,"July 3, 2018","Sentry Island, Nunavut","A polar bear approached a man and his children on Sentry Island. The man, identified as 31-year-old Aaron Gibbons from Arviat, put himself between the children and the bear and was attacked, causing fatal injuries. The bear was killed by other people who were also in the area.[84][85]",Polar,61.16723,-93.85015,Aaron Gibbons,31,male
2,"July 9, 1999","near Rankin Inlet, Nunavut","Amitnak was mauled after trying to distract a bear that attacked and injured two other people at a Hudson Bay camp.[161] She was later awarded a posthumous medal of bravery by then-Governor-General of Canada, Adrienne Clarkson.[162]",Polar,62.808913,-92.087741,Hattie Amitnak,64,female
3,"December 8, 1990","Point Lay, Alaska","While Stalker was walking with his girlfriend, he was chased and consumed in the middle of the town. The bear was shot and killed near Stalker's corpse.[163]",Polar,69.7428,-163.01125,Carl Stalker,28,male
4,"November 29, 1983","Churchill, Manitoba","Mutanen was attacked and dragged on a street in Churchill. The bear was part of an annual migration to Hudson Bay. Due to a lack of ice on the bay, the bear wandered into the town.[188]",Polar,58.767755,-94.163998,Thomas Mutanen,46,male


In [3]:
# Get a brief summary of the shark_data_raw_df DataFrame
bear_data_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       156 non-null    object 
 1   Location   156 non-null    object 
 2   Details    156 non-null    object 
 3   Bear       156 non-null    object 
 4   Latitude   156 non-null    float64
 5   Longitude  156 non-null    float64
 6   Name       155 non-null    object 
 7   Age        155 non-null    object 
 8   Gender     155 non-null    object 
dtypes: float64(2), object(7)
memory usage: 11.1+ KB


In [4]:
# Brief summary of the shark_data_raw_df DataFrame
bear_data_raw_df.describe()

Unnamed: 0,Latitude,Longitude
count,156.0,156.0
mean,51.014942,-113.889579
std,8.825507,22.993165
min,28.647838,-163.01125
25%,45.25021,-124.686183
50%,49.814079,-113.780475
75%,58.41667,-105.525439
max,69.7428,-66.825422


### Bear Data Cleaning

In [5]:
# Copy the raw data to a new DataFrame
bear_data_df = bear_data_raw_df.copy()
bear_data_df.head()

Unnamed: 0,Date,Location,Details,Bear,Latitude,Longitude,Name,Age,Gender
0,"August 23, 2018","Lyon Inlet, Nunavut","Three men from Naujaat, whose boat had broken down, were having tea on the morning of August 23 when a female bear and a cub surprised them. Leo Ijjangiaq fired a rifle to scare the bear but it attacked Laurent Junior Uttak before killing Darryl Kaunak. The mother and cub were killed. During the next three days, prior to being rescued, more bears approached the two survivors and at least one m...",Polar,66.53416,-83.88217,Darryl Kaunak,33,male
1,"July 3, 2018","Sentry Island, Nunavut","A polar bear approached a man and his children on Sentry Island. The man, identified as 31-year-old Aaron Gibbons from Arviat, put himself between the children and the bear and was attacked, causing fatal injuries. The bear was killed by other people who were also in the area.[84][85]",Polar,61.16723,-93.85015,Aaron Gibbons,31,male
2,"July 9, 1999","near Rankin Inlet, Nunavut","Amitnak was mauled after trying to distract a bear that attacked and injured two other people at a Hudson Bay camp.[161] She was later awarded a posthumous medal of bravery by then-Governor-General of Canada, Adrienne Clarkson.[162]",Polar,62.808913,-92.087741,Hattie Amitnak,64,female
3,"December 8, 1990","Point Lay, Alaska","While Stalker was walking with his girlfriend, he was chased and consumed in the middle of the town. The bear was shot and killed near Stalker's corpse.[163]",Polar,69.7428,-163.01125,Carl Stalker,28,male
4,"November 29, 1983","Churchill, Manitoba","Mutanen was attacked and dragged on a street in Churchill. The bear was part of an annual migration to Hudson Bay. Due to a lack of ice on the bay, the bear wandered into the town.[188]",Polar,58.767755,-94.163998,Thomas Mutanen,46,male


In [6]:
bear_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       156 non-null    object 
 1   Location   156 non-null    object 
 2   Details    156 non-null    object 
 3   Bear       156 non-null    object 
 4   Latitude   156 non-null    float64
 5   Longitude  156 non-null    float64
 6   Name       155 non-null    object 
 7   Age        155 non-null    object 
 8   Gender     155 non-null    object 
dtypes: float64(2), object(7)
memory usage: 11.1+ KB


In [7]:
# Change headers to lowercase and replace spaces with underscores
bear_data_df.columns = bear_data_df.columns.str.lower().str.replace(" ", "_")
bear_data_df.head()

Unnamed: 0,date,location,details,bear,latitude,longitude,name,age,gender
0,"August 23, 2018","Lyon Inlet, Nunavut","Three men from Naujaat, whose boat had broken down, were having tea on the morning of August 23 when a female bear and a cub surprised them. Leo Ijjangiaq fired a rifle to scare the bear but it attacked Laurent Junior Uttak before killing Darryl Kaunak. The mother and cub were killed. During the next three days, prior to being rescued, more bears approached the two survivors and at least one m...",Polar,66.53416,-83.88217,Darryl Kaunak,33,male
1,"July 3, 2018","Sentry Island, Nunavut","A polar bear approached a man and his children on Sentry Island. The man, identified as 31-year-old Aaron Gibbons from Arviat, put himself between the children and the bear and was attacked, causing fatal injuries. The bear was killed by other people who were also in the area.[84][85]",Polar,61.16723,-93.85015,Aaron Gibbons,31,male
2,"July 9, 1999","near Rankin Inlet, Nunavut","Amitnak was mauled after trying to distract a bear that attacked and injured two other people at a Hudson Bay camp.[161] She was later awarded a posthumous medal of bravery by then-Governor-General of Canada, Adrienne Clarkson.[162]",Polar,62.808913,-92.087741,Hattie Amitnak,64,female
3,"December 8, 1990","Point Lay, Alaska","While Stalker was walking with his girlfriend, he was chased and consumed in the middle of the town. The bear was shot and killed near Stalker's corpse.[163]",Polar,69.7428,-163.01125,Carl Stalker,28,male
4,"November 29, 1983","Churchill, Manitoba","Mutanen was attacked and dragged on a street in Churchill. The bear was part of an annual migration to Hudson Bay. Due to a lack of ice on the bay, the bear wandered into the town.[188]",Polar,58.767755,-94.163998,Thomas Mutanen,46,male


#### Clean Date

In [8]:
# Verify the original date values
print(bear_data_df.date.unique())

['August 23, 2018' 'July 3, 2018' 'July 9, 1999' 'December 8, 1990'
 'November 29, 1983' 'January 5, 1975' 'November 17, 1968'
 'September 1, 2019' 'June 19, 2017' 'June 18, 2017' 'May 10, 2015'
 'September 21, 2014' 'May 7, 2014' 'June 6, 2013' 'July 25, 2011'
 'June 2011' 'August 7, 2009' 'May 30, 2008' 'July 20, 2007'
 'June 17, 2007' 'April 13, 2006' 'September 6, 2005' 'August 26, 2005'
 'June 14, 2005' 'September 29, 2002' 'September 1, 2002'
 'August 19, 2002' 'August 18, 2001' 'June 3, 2001' 'July 2, 2000'
 'May 21, 2000' 'August 14, 1997' 'June 14, 1996' 'September 16, 1994'
 'August 10, 1993' 'July 8, 1992' 'June 14, 1992' 'October 11, 1991'
 'May 26, 1991' 'May 29, 1985' 'July 6, 1983' 'May 27, 1983'
 'May 21, 1983' 'August 14, 1980' 'July 18, 1980' 'May 13, 1978'
 'May 16, 1974' 'July 25, 1971' 'October 1, 1968' 'August 8, 1967'
 'July 2, 1965' 'September 17, 1964' 'August 16, 1963' 'September 6, 1959'
 'August 12, 1958' 'November 19, 1952' 'July 7, 1948' 'November 23, 1943

In [9]:
# Replace specific text patters in the 'Date' column (e.g. 'Reported ')
bear_data_df['date'] = bear_data_df['date'].str.replace('Reported ', '', regex=False)

# Convert 'Date' to datetime
bear_data_df['date'] = pd.to_datetime(bear_data_df['date'], errors='coerce', infer_datetime_format=True)

# Function to extract year, month, and day from datetime
def extract_date_components(date):
    if pd.notnull(date):
        return pd.Series([date.year, date.month, date.day])
    else:
        return pd.Series([np.nan, np.nan, np.nan])

# Apply the function to the 'Date' column and create new columns for year, month, and day
bear_data_df[['year', 'month', 'day']] = bear_data_df['date'].apply(extract_date_components)

# Display the cleaned DataFrame with Year, Month, and Day
print(bear_data_df[['date', 'year', 'month', 'day']].head())

# Verify the distinct values in the new columns
distinct_years = bear_data_df['year'].unique()
distinct_months = bear_data_df['month'].unique()
distinct_days = bear_data_df['day'].unique()

print(f"Distinct Years: {distinct_years}")
print(f"Distinct Months: {distinct_months}")
print(f"Distinct Days: {distinct_days}")

date_string = bear_data_df['month'].astype(str) + '-' + bear_data_df['day'].astype(str) + '-' + bear_data_df['year'].astype(str)

date_string = date_string.replace('nan-nan-nan', np.nan)
date_string = date_string.str.replace('.0', '')

print(date_string.unique)

        date    year  month   day
0 2018-08-23  2018.0    8.0  23.0
1 2018-07-03  2018.0    7.0   3.0
2 1999-07-09  1999.0    7.0   9.0
3 1990-12-08  1990.0   12.0   8.0
4 1983-11-29  1983.0   11.0  29.0
Distinct Years: [2018. 1999. 1990. 1983. 1975. 1968. 2019. 2017. 2015. 2014. 2013. 2011.
   nan 2009. 2008. 2007. 2006. 2005. 2002. 2001. 2000. 1997. 1996. 1994.
 1993. 1992. 1991. 1985. 1980. 1978. 1974. 1971. 1967. 1965. 1964. 1963.
 1959. 1958. 1952. 1948. 1943. 1906. 1901. 1883. 2016. 2012. 2010. 2003.
 1998. 1995. 1988. 1987. 1986. 1984. 1979. 1977. 1976. 1973. 1972. 1970.
 1956. 1955. 1942. 1929. 1922. 1916. 1892. 1863. 1853. 1854. 1837.]
Distinct Months: [ 8.  7. 12. 11.  1.  9.  6.  5. nan  4. 10.  2.]
Distinct Days: [23.  3.  9.  8. 29.  5. 17.  1. 19. 18. 10. 21.  7.  6. 25. nan 30. 20.
 13. 26. 14.  2. 16. 11. 27. 12. 24.  4. 15. 28. 22.]
<bound method Series.unique of 0       8-23-2018
1        7-3-2018
2        7-9-1999
3       12-8-1990
4      11-29-1983
          ...    

  bear_data_df['date'] = pd.to_datetime(bear_data_df['date'], errors='coerce', infer_datetime_format=True)


In [10]:
# Convert date to datetime
bear_data_df['date'] = pd.to_datetime(date_string, errors='coerce', format='%m-%d-%Y')
bear_data_df['date'].unique()

<DatetimeArray>
['2018-08-23 00:00:00', '2018-07-03 00:00:00', '1999-07-09 00:00:00',
 '1990-12-08 00:00:00', '1983-11-29 00:00:00', '1975-01-05 00:00:00',
 '1968-11-17 00:00:00', '2019-09-01 00:00:00', '2017-06-19 00:00:00',
 '2017-06-18 00:00:00',
 ...
 '1955-09-19 00:00:00', '1942-08-23 00:00:00', '1929-09-12 00:00:00',
 '1922-06-12 00:00:00', '1916-09-08 00:00:00', '1892-09-02 00:00:00',
 '1863-08-30 00:00:00', '1853-12-19 00:00:00', '1854-10-27 00:00:00',
 '1837-10-17 00:00:00']
Length: 130, dtype: datetime64[ns]

#### Clean age

In [11]:
# Verify the original values
bear_data_df.age.unique()

array([' 33', ' 31', ' 64', ' 28', ' 46', ' 18', ' 19', ' 62', ' 27',
       ' 16', ' 22', ' 36', ' 61', ' 72', ' 74', ' 70', ' 11', ' 6',
       ' 30', ' 69', ' 71', ' 77', ' 5 months', ' 93', ' 24', ' 50',
       ' 56', ' 37', ' 53', ' 4', ' 20', ' 32', ' 48', ' 12', ' 55',
       ' 26', ' 44', ' 10', ' 15', ' 51', ' 7', ' 3', ' 5', ' 52',
       '\xa0?', ' 82', '" 8', ' 10 months', ' 38', ' 63', ' 42', ' 54',
       ' 49', ' 59', ' 57', ' 60', ' 58', ' 35', ' 41', ' 65', ' 40',
       ' 45', ' 29', ' 25', ' 23', nan, ' 43', ' 68'], dtype=object)

In [12]:
# Function to clean and convert age values
def clean_age(value):
    # Remove leading/trailing whitespace
    value = str(value).strip()
    
    # Handle special cases
    if 'month' in value or '?' in value or 'unknown' in value.lower():
        return np.nan
    
    # Convert to numeric, setting errors='coerce' to handle non-numeric values
    try:
        # Convert value to numeric, forcing errors to NaN
        return pd.to_numeric(value, errors='coerce')
    except Exception as e:
        print(f"Error processing age: {value} - {e}")
        return np.nan

# Apply the function to the 'Age' column
bear_data_df['age'] = bear_data_df['age'].apply(clean_age)

# Display the cleaned DataFrame with the 'Age' column
print(bear_data_df[['age']].head())

# Verify the distinct values in the 'Age' column
bear_data_df['age'].unique()

    age
0  33.0
1  31.0
2  64.0
3  28.0
4  46.0


array([33., 31., 64., 28., 46., 18., 19., 62., 27., 16., 22., 36., 61.,
       72., 74., 70., 11.,  6., 30., 69., 71., 77., nan, 93., 24., 50.,
       56., 37., 53.,  4., 20., 32., 48., 12., 55., 26., 44., 10., 15.,
       51.,  7.,  3.,  5., 52., 82., 38., 63., 42., 54., 49., 59., 57.,
       60., 58., 35., 41., 65., 40., 45., 29., 25., 23., 43., 68.])

#### Clean gender

In [13]:
# Verify the original values
bear_data_df['gender'].unique()

array([' male', ' female', nan], dtype=object)

In [14]:
# Define mapping for gender
gender_mapping = {
    ' male': 'M',
    ' female': 'F',
    '': 'U',  
    np.nan: 'U'  # Handles NaN values
}
# Replace values in the 'Gender' column using the mapping
bear_data_df['gender'] = bear_data_df['gender'].replace(gender_mapping)

# Rename 'Gender' to 'Sex'
bear_data_df.rename(columns={'gender': 'sex'}, inplace=True)

# Verify the updated values
bear_data_df['sex'].unique()

array(['M', 'F', 'U'], dtype=object)

### Limit years to match years in Shark Attacks

In [15]:
# Limit the years in bear data from 1900 to 2019
bear_data_df = bear_data_df[(bear_data_df['year'] >= 1900) & (bear_data_df['year'] <= 2019)]
bear_data_df['year'].sort_values(ascending=False).unique()


array([2019., 2018., 2017., 2016., 2015., 2014., 2013., 2012., 2011.,
       2010., 2009., 2008., 2007., 2006., 2005., 2003., 2002., 2001.,
       2000., 1999., 1998., 1997., 1996., 1995., 1994., 1993., 1992.,
       1991., 1990., 1988., 1987., 1986., 1985., 1984., 1983., 1980.,
       1979., 1978., 1977., 1976., 1975., 1974., 1973., 1972., 1971.,
       1970., 1968., 1967., 1965., 1964., 1963., 1959., 1958., 1956.,
       1955., 1952., 1948., 1943., 1942., 1929., 1922., 1916., 1906.,
       1901.])

### Display a map of Bear Attacks

In [16]:
%%capture --no-display

# Configure the map plot
bear_attack = bear_data_df[["date", "latitude", "longitude"]]

# Convert date to string
bear_attack["date"] = bear_attack["date"].dt.strftime('%Y-%m-%d')

# Configure the map plot
bear_map = bear_attack.hvplot.points(
    x="longitude",
    y="latitude",
    geo=True,
    tiles="OSM",
    color = "date",
    frame_width = 700,
    frame_height = 500
)

# Display the map
bear_map

### Save Bear Attacks dataframe into CSV file in the Resources directory

In [17]:
bear_data_df.to_csv("Resources/bear_attacks_clean_data.csv", index=False)