In [34]:
# Install the necessary Python packages
!pip install pandas numpy



In [35]:
import pandas as pd
import numpy as np

# Relative path of the input CSV file
csv_file = '../data/earthquake_1995-2023.csv'

# Read the whole file into memory as a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=csv_file)

In [36]:
# Preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177,25.0,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",,Vanuatu
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679,40.0,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",,
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,green,0,899,us,70,1.634,28.0,mww,171.371,-38.1911,-70.3731,"Loncopué, Argentina",South America,Argentina
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,green,1,860,us,173,0.907,36.0,mww,32.571,54.3844,-160.699,"Sand Point, Alaska",,
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.0,54.49,-160.796,Alaska Peninsula,,


In [37]:
# Keep only the rows whose 'continent' entry is missing
null_continent_rows = df.loc[df['continent'].isna()]


# Display the first five of those rows
null_continent_rows.head(n=5)

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude,location,continent,country
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,green,0,657,us,114,7.177,25.0,mww,192.955,-13.8814,167.158,"Sola, Vanuatu",,Vanuatu
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,yellow,0,775,us,92,0.679,40.0,mww,69.727,12.814,-88.1265,"Intipucá, El Salvador",,
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,green,1,860,us,173,0.907,36.0,mww,32.571,54.3844,-160.699,"Sand Point, Alaska",,
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,,1,820,at,79,0.879451,172.8,Mi,21.0,54.49,-160.796,Alaska Peninsula,,
5,"M 6.6 - 277 km NNE of Codrington, Antigua and ...",6.6,10-07-2023 20:28,5,4,green,1,802,us,95,2.454,37.0,mww,10.0,20.0196,-61.0955,"Codrington, Antigua and Barbuda",,


In [38]:
# Summarise column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1000 non-null   object 
 1   magnitude  1000 non-null   float64
 2   date_time  1000 non-null   object 
 3   cdi        1000 non-null   int64  
 4   mmi        1000 non-null   int64  
 5   alert      449 non-null    object 
 6   tsunami    1000 non-null   int64  
 7   sig        1000 non-null   int64  
 8   net        1000 non-null   object 
 9   nst        1000 non-null   int64  
 10  dmin       1000 non-null   float64
 11  gap        1000 non-null   float64
 12  magType    1000 non-null   object 
 13  depth      1000 non-null   float64
 14  latitude   1000 non-null   float64
 15  longitude  1000 non-null   float64
 16  location   994 non-null    object 
 17  continent  284 non-null    object 
 18  country    651 non-null    object 
dtypes: float64(6), int64(5), object(8)
memory usage: 

In [39]:
# Drop the 'continent', 'country' and 'location' columns.
# The result is rebound to `df` and errors='ignore' is passed so that running
# this cell a second time is a no-op instead of raising KeyError because the
# columns are already gone.
df = df.drop(columns=['continent', 'country', 'location'], errors='ignore')

In [40]:
# Alert value -> integer code; a missing alert (NaN) is encoded as 0
# NOTE(review): the codes are not monotonic in the usual
# green < yellow < orange < red ordering (red=1, orange=3) — confirm intended
colors = dict(zip((np.nan, "red", "yellow", "orange", "green"), range(5)))

In [41]:
# Function to map the values
def map_alert_value(value, valid_colors):
    """Translate one alert value into its code from ``valid_colors``.

    Args:
        value: A single alert entry (e.g. a colour string, or NaN when the
            entry is missing).
        valid_colors: Mapping from alert value to code. It may contain a NaN
            key that supplies the code for missing entries.

    Returns:
        The code mapped to ``value``, or the string ``'unknown'`` when the
        mapping has no entry for it.
    """
    # NaN never compares equal to itself, so a plain dict lookup only succeeds
    # when `value` is the very same object as the NaN key. Match NaN by value
    # instead so every NaN (np.nan, float('nan'), np.float64 NaN) is handled.
    if isinstance(value, float) and value != value:
        for key, code in valid_colors.items():
            if isinstance(key, float) and key != key:
                return code
        return 'unknown'
    return valid_colors.get(value, 'unknown')

In [42]:
# Replace every 'alert' entry with its code from `colors`.
# Note: values without an entry in `colors` (including already-encoded
# integers if this cell is run twice) come back as 'unknown'.
df['alert'] = df['alert'].apply(map_alert_value, args=(colors,))

In [43]:
import json

# magType labels; their position in this list fixes their integer code
values = ['mww', 'Mi', 'mwc', 'mwb', 'ml', 'mw', 'ms', 'mb', 'md']

# Pair every label with its position in the list (0 .. n-1)
enum_mapping = dict(zip(values, range(len(values))))

# Pretty-printed JSON rendering of the mapping (optional, for demonstration)
enum_mapping_json = json.dumps(enum_mapping, indent=4)

# Look up a single magType label in the supplied mapping
def map_mag_type_value(value, valid_mapping):
    """Return the code for ``value``, or ``'unknown'`` if it is not mapped.

    Args:
        value: A single magType entry.
        valid_mapping: Mapping from magType label to its code.

    Returns:
        The mapped code when ``value`` is a key of ``valid_mapping``;
        otherwise the string ``'unknown'``.
    """
    if value in valid_mapping:
        return valid_mapping[value]
    return 'unknown'

# Replace every 'magType' entry with its code from `enum_mapping`;
# labels missing from the mapping come back as 'unknown'
df['magType'] = df['magType'].apply(map_mag_type_value, args=(enum_mapping,))

In [44]:
# Round both coordinate columns to zero decimal places in one assignment
# (the fractional part of each coordinate is discarded)
df[['latitude', 'longitude']] = df[['latitude', 'longitude']].round(0)

In [45]:
# Preview the first five rows after the encoding and rounding steps
df.head(n=5)

Unnamed: 0,title,magnitude,date_time,cdi,mmi,alert,tsunami,sig,net,nst,dmin,gap,magType,depth,latitude,longitude
0,"M 6.5 - 42 km W of Sola, Vanuatu",6.5,16-08-2023 12:47,7,4,4,0,657,us,114,7.177,25.0,0,192.955,-14.0,167.0
1,"M 6.5 - 43 km S of Intipucá, El Salvador",6.5,19-07-2023 00:22,8,6,2,0,775,us,92,0.679,40.0,0,69.727,13.0,-88.0
2,"M 6.6 - 25 km ESE of Loncopué, Argentina",6.6,17-07-2023 03:05,7,5,4,0,899,us,70,1.634,28.0,0,171.371,-38.0,-70.0
3,"M 7.2 - 98 km S of Sand Point, Alaska",7.2,16-07-2023 06:48,6,6,4,1,860,us,173,0.907,36.0,0,32.571,54.0,-161.0
4,M 7.3 - Alaska Peninsula,7.3,16-07-2023 06:48,0,5,0,1,820,at,79,0.879451,172.8,1,21.0,54.0,-161.0


In [46]:
# Re-check column dtypes and non-null counts after the transformations above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   title      1000 non-null   object 
 1   magnitude  1000 non-null   float64
 2   date_time  1000 non-null   object 
 3   cdi        1000 non-null   int64  
 4   mmi        1000 non-null   int64  
 5   alert      1000 non-null   int64  
 6   tsunami    1000 non-null   int64  
 7   sig        1000 non-null   int64  
 8   net        1000 non-null   object 
 9   nst        1000 non-null   int64  
 10  dmin       1000 non-null   float64
 11  gap        1000 non-null   float64
 12  magType    1000 non-null   int64  
 13  depth      1000 non-null   float64
 14  latitude   1000 non-null   float64
 15  longitude  1000 non-null   float64
dtypes: float64(6), int64(7), object(3)
memory usage: 125.1+ KB


In [47]:
# Write the transformed frame to a new CSV, leaving the row index out of the file
df.to_csv(path_or_buf='../data/normalized_data.csv', index=False)