In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Importing Atlantic Hurricane data

In [None]:
# Location of the raw Atlantic hurricane CSV.
# NOTE(review): this is an absolute path on one machine; it has to be adjusted
# before the notebook can run anywhere else.
ATLANTIC_CSV_PATH = '/Users/danielbrechner/hurrdat_co2/assets/atlantic.csv'

# Load the raw records and preview the first rows.
df = pd.read_csv(ATLANTIC_CSV_PATH)
df.head()

The naming convention has been applied to hurricanes since 1950. This might indicate better data quality from that year onward, so we will use hurricane data from 1950 through the most recent year in the dataset.

In [None]:
# Keep only records dated 1950 or later. 'Date' is compared as a number of the
# form YYYYMMDD, so 19500000 sorts before every date in 1950.
# .copy() makes `hurricanes` an independent frame: later cells assign columns
# onto it, and doing that on a bare slice of `df` triggers
# SettingWithCopyWarning and leaves it unclear which frame is modified.
hurricanes = df[df['Date'] >= 19500000].copy()
hurricanes.head()

Dropping unnecessary columns

In [None]:
# Columns that are removed before the analysis: the storm ID, time of day, event
# flag, and the wind columns split by quadrant (NE/SE/SW/NW).
columns_to_drop = [
    'ID', 'Time', 'Event',
    'Low Wind NE', 'Low Wind SE', 'Low Wind SW', 'Low Wind NW',
    'Moderate Wind NE', 'Moderate Wind SE', 'Moderate Wind SW', 'Moderate Wind NW',
    'High Wind NE', 'High Wind SE', 'High Wind SW', 'High Wind NW',
]

# drop() returns a new frame. With inplace=True it returns None instead, so
# assigning that result back would replace `hurricanes` with None. `columns=`
# already selects the axis, so `axis=1` is not needed.
hurricanes = hurricanes.drop(columns=columns_to_drop)


In [None]:
# Preview the first rows of the frame after the column drop.
hurricanes.head()

In [None]:
# Summarise column dtypes and non-null counts.
hurricanes.info()

Converting to datetime 

In [None]:
# 'Date' holds integers of the form YYYYMMDD (it is compared against 19500000
# when the frame is filtered), so the matching format is '%Y%m%d'; a dashed
# format such as '%Y-%m-%d' cannot parse these values.
# .dt.date keeps only the calendar date, as plain Python date objects.
# Plain column assignment replaces the column instead of trying to write the
# new values into the existing integer column.
hurricanes['Date'] = pd.to_datetime(hurricanes['Date'], format='%Y%m%d').dt.date

In [None]:
# Inspect the dtype of every column, in particular 'Date' after the conversion.
hurricanes.dtypes

In [None]:
# '.dt.date' yields plain Python date objects, which pandas stores with dtype
# 'object' rather than as a pandas datetime column. Passing the column through
# 'pd.to_datetime' once more gives it a proper datetime dtype.
date_values = hurricanes['Date']
hurricanes['Date'] = pd.to_datetime(date_values)


No null values are found, but negative values such as -999 in the wind column represent missing data. Let's replace these negative values with NaN.

In [None]:
# Measurement columns in which a negative number stands for a missing reading.
cols_to_check = ['Maximum Wind', 'Minimum Pressure']

# True wherever one of those columns holds a negative value.
readings = hurricanes[cols_to_check]
negative_mask = readings < 0

# mask() swaps every flagged entry for NaN and leaves the rest untouched.
hurricanes[cols_to_check] = readings.mask(negative_mask, np.nan)


In [None]:
# Sanity check: after the replacement step no negative readings should remain.
# Boolean frame, True wherever either column still holds a negative value.
negative_values = hurricanes[['Maximum Wind', 'Minimum Pressure']] < 0

# Single flag: does any negative value remain at all?
negative_exist = negative_values.any().any()

# Summing the flag itself can only give 0 or 1, so count the flagged cells
# directly to report a real number of negative values.
negative_count = int(negative_values.sum().sum())
print("The number of negative values in the Maximum Wind and Minimum Pressure columns are:", negative_count)

Creating a new column for years and months

In [None]:
# Split the datetime 'Date' column into separate calendar year and month columns.
date_parts = hurricanes['Date'].dt
hurricanes['Year'] = date_parts.year
hurricanes['Month'] = date_parts.month

# Preview the frame with the two new columns.
hurricanes.head()

In [None]:
# For every storm name, pick the row with the highest 'Maximum Wind'.
# Rows without a wind reading can never be the maximum; leaving them out before
# grouping means a name whose readings are all NaN simply drops out, instead of
# idxmax producing a NaN label (or raising) that the .loc lookup cannot resolve.
wind_known = hurricanes.dropna(subset=['Maximum Wind'])
strongest_row_labels = wind_known.groupby('Name')['Maximum Wind'].idxmax()

# Look the winning rows up in the full frame, keeping just name and wind speed.
max_wind_speeds = hurricanes.loc[strongest_row_labels, ['Name', 'Maximum Wind']]
max_wind_speeds


Dropping duplicate storms

In [None]:
# Keep one row per storm name: the record with the highest 'Maximum Wind'.
# The source frame is `hurricanes`; no variable named `atlantic` is defined in
# this notebook. Sorting and de-duplicating are chained so that keep="first"
# acts on the sorted rows. De-duplicating the unsorted frame would keep each
# name's first record in file order and throw the sort away. sort_values puts
# NaN wind values last, so they are only kept when a name has no valid reading.
atlantic_df = (
    hurricanes
    .sort_values(by='Maximum Wind', ascending=False)
    .drop_duplicates(subset='Name', keep="first")
)
atlantic_df.head()

In [None]:
# (rows, columns) of the de-duplicated frame.
atlantic_df.shape

In [None]:
# Put the de-duplicated records in chronological order (ascending is the
# default sort direction) and preview the earliest ones.
chronological = atlantic_df.sort_values(by=["Date"])
atlantic_df = chronological
atlantic_df.head()