In [None]:
import calendar

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objs as go
from IPython.display import Image, display


# Importing Atlantic Hurricane data

### Feature 1. Read TWO data files (JSON,CSV, Excel, etc.).

In [None]:
# Read the first data file (CO2 data, judging by the file name) into `co2`
# and preview its first rows.
co2 = pd.read_csv('assets/co2_small.csv')
co2.head()


In [None]:
# Read the second data file (the Atlantic hurricane records) into `df`
# and preview its first rows.
df = pd.read_csv('assets/atlantic.csv')
df.head()

In [None]:
# (rows, columns) of the raw hurricane table, before any filtering.
df.shape

The naming convention was applied to hurricanes starting in 1950. This might indicate better data quality, so we will use hurricane data from 1950 through the most recent year in the dataset.

In [None]:
# Keep only observations from 1950 onward. `Date` is compared as an integer,
# so 19500000 acts as a lower bound for every date in 1950 or later.
# .copy() makes `hurricanes` an independent frame: later cells assign new
# columns to it, and doing that on a bare boolean-indexed slice of `df`
# triggers pandas' SettingWithCopyWarning.
hurricanes = df[df['Date'] >= 19500000].copy()
hurricanes.head()

In [None]:
# (rows, columns) remaining after the 1950+ filter.
hurricanes.shape

Dropping unnecessary columns

In [None]:
# Remove the identifier, time, event and wind-radius columns; none of them is
# referenced by the cells that follow.
hurricanes = hurricanes.drop(
    columns=[
        'ID', 'Time', 'Event',
        'Low Wind NE', 'Low Wind SE', 'Low Wind SW', 'Low Wind NW',
        'Moderate Wind NE', 'Moderate Wind SE', 'Moderate Wind SW', 'Moderate Wind NW',
        'High Wind NE', 'High Wind SE', 'High Wind SW', 'High Wind NW',
    ]
)

In [None]:
# Preview the frame after the column drop.
hurricanes.head()

In [None]:
# Column dtypes and non-null counts, used to decide which conversions are needed.
hurricanes.info()

Converting to datetime 

In [None]:
# `Date` is stored as a separator-free YYYYMMDD number, so the format string
# must not contain spaces ('%Y %m %d' does not describe these values and only
# parses through a lenient ISO fallback in some pandas versions).
# The dtype guard keeps the cell re-runnable: once the column is datetime,
# casting it to str again would no longer match the format.
if not pd.api.types.is_datetime64_any_dtype(hurricanes['Date']):
    hurricanes = hurricanes.assign(
        Date=pd.to_datetime(hurricanes['Date'].astype(str), format='%Y%m%d')
    )
# Preview only; displaying the whole frame adds nothing over the first rows.
hurricanes.head()

Adding separate year and month columns

In [None]:
# Derive calendar year and month from the datetime `Date` column using the
# vectorized `.dt` accessor instead of a per-row Python lambda.
# `assign` returns a new frame, so nothing is written into a slice of `df`.
hurricanes = hurricanes.assign(
    Year=hurricanes['Date'].dt.year,
    Month=hurricanes['Date'].dt.month,
)
hurricanes.head()

In [None]:
# Distinct years present after the 1950+ filter.
print(hurricanes['Year'].unique())

In [None]:
# Distinct months present in the data (shows which months have no records).
print(hurricanes['Month'].unique())

According to this dataset, no values for March exist

In [None]:
# Confirm the dtypes after the date conversion and the new Year/Month columns.
hurricanes.dtypes

No null values are found, but negative values such as -999 (e.g., for wind) represent missing data. Let's replace these values with NaN.

In [None]:
# Negative readings in these two columns are placeholders for "missing";
# keep every non-negative value and turn the rest into NaN.
cols_to_check = ['Maximum Wind', 'Minimum Pressure']
negative_mask = hurricanes[cols_to_check].lt(0)
hurricanes[cols_to_check] = hurricanes[cols_to_check].where(~negative_mask, np.nan)


In [None]:
# Count the negative cells that remain in the two columns. The previous version
# reduced the comparison with .any() first, so the printed "number" could only
# ever be 0 or 1 regardless of how many negative values were present.
negative_count = int(
    (hurricanes[['Maximum Wind', 'Minimum Pressure']] < 0).sum().sum()
)
# Kept as a plain boolean flag for any later cell that checks it.
negative_exist = negative_count > 0
print("The number of negative values in the Maximum Wind and Minimum Pressure columns are:", negative_count)

Dropping hemisphere tags from coordinates

In [None]:
def _to_signed_degrees(coords, negative_hemisphere):
    """Convert strings such as '28.0N' / '94.8W' to signed decimal degrees.

    The trailing hemisphere letter is removed and, when it equals
    `negative_hemisphere` ('S' for latitude, 'W' for longitude), the value is
    negated. Simply dropping the letter would make western and eastern
    longitudes (and northern and southern latitudes) indistinguishable.

    A column that is already numeric is returned unchanged so the cell can be
    re-run safely.
    """
    if pd.api.types.is_numeric_dtype(coords):
        return coords
    coords = coords.str.strip()
    degrees = coords.str[:-1].astype(float)
    is_negative = coords.str[-1].str.upper() == negative_hemisphere
    return degrees.where(~is_negative, -degrees)


hurricanes = hurricanes.assign(
    Latitude=_to_signed_degrees(hurricanes['Latitude'], 'S'),
    Longitude=_to_signed_degrees(hurricanes['Longitude'], 'W'),
)

In [None]:
# Spot-check that both coordinate columns are now plain numbers.
print(hurricanes[['Latitude', 'Longitude']].head())

Grouping storms by Max Wind Speed

In [None]:
# One row per storm: the observation at which that storm reached its peak wind.
# Storm names are reused across seasons, so grouping by `Name` alone would merge
# different storms that share a name; grouping by (Name, Year) keeps them apart.
# Rows without a wind reading are excluded first so that a group consisting
# only of NaN cannot yield a missing label for the .loc lookup.
wind_observed = hurricanes.dropna(subset=['Maximum Wind'])
peak_row_labels = wind_observed.groupby(['Name', 'Year'])['Maximum Wind'].idxmax()
max_wind_speeds = hurricanes.loc[
    peak_row_labels,
    ['Name', 'Maximum Wind', 'Year', 'Month', 'Latitude', 'Longitude'],
]
max_wind_speeds.head()


In [None]:
# Persist the per-storm peak-wind table (written before the category column is
# added); the row index is not written to the file.
max_wind_speeds.to_csv('assets/hurr_wind_speed.csv', index=False)

# Classifying hurricanes by the Saffir-Simpson Hurricane Wind Scale (category 1-5)

In [None]:
# Inclusive lower bound of each category; the last bin is open-ended.
# Half-open bins [74, 96), [96, 111), ... leave no gaps, whereas the previous
# pairwise ranges (74-95, 96-110, ...) left values such as 95.5 unclassified.
# NOTE(review): 74/96/111/130/157 are the mph thresholds of the Saffir-Simpson
# scale - confirm that 'Maximum Wind' is recorded in mph and not in knots.
wind_bins = [74, 96, 111, 130, 157, np.inf]
wind_categories = [1, 2, 3, 4, 5]

# Vectorized binning replaces the row-by-row iterrows() loop. Winds below the
# first bound (or missing) get NaN and are dropped in the next step.
# The result is cast to object so the column has the same dtype as before
# (plain ints alongside missing values).
max_wind_speeds = max_wind_speeds.assign(
    category=pd.cut(
        max_wind_speeds['Maximum Wind'],
        bins=wind_bins,
        labels=wind_categories,
        right=False,
    ).astype(object)
)

max_wind_speeds.head()

Dropping hurricanes under category 1

In [None]:
# Keep only storms that received a category. Rebinding the name instead of
# using inplace=True avoids mutating a frame that earlier cells displayed and
# keeps the step chainable.
max_wind_speeds = max_wind_speeds.dropna(subset=['category'])

In [None]:
# Display the categorized storms that remain after the drop.
max_wind_speeds

### Feature 3. Visualize your data. Make 3 matplotlib or seaborn visualizations to display your data.

Looking at the distribution of hurricanes by category 

In [None]:
# Number of storms in each category, drawn as one bar per category.
category_counts = max_wind_speeds["category"].value_counts()

fig = px.bar(
    x=category_counts.index,
    y=category_counts.to_numpy(),
    labels=dict(x="Category", y="Count"),
)
fig.show()


Interestingly, there are more Category 3 hurricanes than Category 2

Saving max_wind_speeds DF to a new csv

In [None]:
# Persist the categorized per-storm table; the row index is not written.
max_wind_speeds.to_csv('assets/hurr_categories.csv', index=False)

In [None]:
# Distinct category values present in the table.
max_wind_speeds["category"].unique()

Sorting the categories for plotting

In [None]:
category_order = [1, 2, 3, 4, 5]

color_scale = [
    (0, 'blue'),
    (0.25, 'green'),
    (0.5, 'yellow'),
    (0.75, 'orange'),
    (1, 'red')
]

# The previous call combined `color_continuous_scale` (used only for numeric
# colour columns) with `category_orders` (used only for discrete ones), so one
# of the two was always ignored. Colour by an explicit string label instead and
# give every category the colour of its stop in `color_scale`.
category_labels = [str(category) for category in category_order]
category_colors = dict(zip(category_labels, (color for _, color in color_scale)))

# Plot a derived copy so `max_wind_speeds` itself keeps its original dtype.
scatter_data = max_wind_speeds.dropna(subset=['category']).assign(
    category=lambda frame: frame['category'].astype(int).astype(str)
)

fig = px.scatter(
    scatter_data,
    x="Year",
    y="Maximum Wind",
    color="category",
    color_discrete_map=category_colors,
    category_orders={"category": category_labels},
    title="Maximum wind speed of each hurricane by year and category",
)

fig.show()

Looking at the distribution of categories by month

In [None]:
category_order = [1, 2, 3, 4, 5]

color_scale = [
    (0, 'blue'),
    (0.25, 'green'),
    (0.5, 'yellow'),
    (0.75, 'orange'),
    (1, 'red')
]

# `color_discrete_sequence` expects plain colour strings; passing the
# (position, colour) tuples of a continuous scale does not give one colour per
# category. Map each category label to the colour of its stop instead.
category_labels = [str(category) for category in category_order]
category_colors = dict(zip(category_labels, (color for _, color in color_scale)))

# Storm count for every (month, category) pair that occurs in the data.
grouped = max_wind_speeds.groupby(["Month", "category"]).size().reset_index(name="count")

month_names = [calendar.month_name[i] for i in range(1, 13)]

fig = px.bar(
    # String labels make Plotly Express treat the colour dimension as discrete;
    # `grouped` itself is left untouched.
    grouped.assign(category=grouped["category"].astype(int).astype(str)),
    x="Month",
    y="count",
    color="category",
    color_discrete_map=category_colors,
    category_orders={"category": category_labels},
    barmode="stack"
)

fig.update_layout(
    # Show month names instead of the numbers 1-12 on the x axis.
    xaxis=dict(
        ticktext=month_names,
        tickvals=list(range(1, 13))
    ),
    title={
        'text': "Distribution of categories by month",
        'y':0.95,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'
    }
)

fig.show()

We can see that the majority of hurricanes, and specifically the higher-intensity storms, occur between August and October.

In [None]:
# Peak wind of every categorized storm against the year it occurred.
x = max_wind_speeds['Year']
y = max_wind_speeds['Maximum Wind']

fig = go.Figure(data=[go.Scatter(x=x, y=y, mode='markers')])

# NOTE(review): the axis label states mph - confirm the unit of 'Maximum Wind'.
fig.update_layout(
    title='Maximum Wind Speeds for Each Hurricane By Year',
    xaxis_title='Year',
    yaxis_title='Maximum Wind Speed (mph)',
)

fig.show()

# Looking at the distribution of Storms per year

In [None]:
# Count the categorized storms in each year ...
storms_per_year = (
    max_wind_speeds
    .groupby('Year')
    .size()
    .reset_index(name='Number of storms')
)

# ... and attach that yearly count to every storm row of the same year.
max_wind_speeds_with_count = max_wind_speeds.merge(storms_per_year, on='Year')

max_wind_speeds_with_count.head()

In [None]:
# One bar per year showing how many categorized storms that year produced.
x = storms_per_year['Year']
y = storms_per_year['Number of storms']

fig = go.Figure(data=[go.Bar(x=x, y=y, marker_color='blue')])
fig.update_layout(
    title='Number of Hurricanes per Year',
    xaxis_title='Year',
    yaxis_title='Number of Hurricanes',
)

fig.show()

Since we are only looking at hurricane systems of category 1 and above, the bar graph above has missing years: those years did not produce any storm that reached the minimum intensity required for this analysis.

# Merging the datasets 

### Feature 2. Clean your data and perform a pandas merge with your two data sets, then calculate some new values based on the new data set.

In [None]:
# Show the columns of both frames before merging. A cell only renders its last
# expression, so the CO2 columns were previously never displayed; display()
# renders both.
display(co2.columns, max_wind_speeds.columns)

In [None]:
# Outer join on `Year`: years that appear in only one of the two frames are
# kept, with the other side's columns left empty.
merged_df = co2.merge(max_wind_speeds, how='outer', on='Year')
merged_df

In [None]:
# Render the pre-built chart image stored next to the notebook.
# `Image` is imported in the import cell at the top of the notebook so that all
# dependencies are declared in one place.
Image(filename='Sheet 1.png')

### Feature 5. Interpretation of your data.

The graph shows two sets of data: the first is the concentration of carbon dioxide (CO2) in the atmosphere over time, and the second is the number of hurricanes per year over the same time period. The graph indicates that the concentration of CO2 in the atmosphere has been steadily increasing over time, which is represented by the upward trend in the blue line.

The graph also shows the number of hurricanes per year, which is represented by the bars. The distribution of the number of hurricanes appears to be increasing overall, as the bars get taller over time. However, the distribution is not smooth and the bars do not follow the same pattern as the CO2 data.

The graph highlights that in recent years, from 1995 to 2015, there appears to be an increase in the number of powerful hurricane systems, which may be correlated to the rising concentration of CO2 levels. This can be seen in the taller bars during this time period, indicating an increase in the number of strong hurricanes.

Overall, the graph suggests that the concentration of CO2 in the atmosphere is steadily increasing, and that there may be a correlation between this increase and the number of powerful hurricane systems in recent years. However, it is important to note that the distribution of the number of hurricanes is not a direct reflection of CO2 concentrations, and other factors such as weather patterns and ocean currents may also play a role in the frequency and intensity of hurricanes.

### Feature 4. Best Practices. Utilize a virtual environment and include instructions in your README on how the user should set one up