<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Libraries" data-toc-modified-id="Libraries-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Libraries</a></span></li><li><span><a href="#General-info" data-toc-modified-id="General-info-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>General info</a></span></li><li><span><a href="#Visualization" data-toc-modified-id="Visualization-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Visualization</a></span><ul class="toc-item"><li><span><a href="#By-month" data-toc-modified-id="By-month-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>By month</a></span></li></ul></li><li><span><a href="#Preparing-data" data-toc-modified-id="Preparing-data-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Preparing data</a></span><ul class="toc-item"><li><span><a href="#Create-new-dataframe-with-an-index-for-each-month" data-toc-modified-id="Create-new-dataframe-with-an-index-for-each-month-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Create new dataframe with an index for each month</a></span></li><li><span><a href="#Merge-datasets" data-toc-modified-id="Merge-datasets-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Merge datasets</a></span></li></ul></li></ul></div>

# Libraries

In [41]:
import sys
sys.path.append("../")

import pandas as pd
import calendar
from datetime import datetime
import plotly.express as px
from plotly.subplots import make_subplots # to make subplots
import plotly.graph_objects as go # to make subplots

import src.cleaning as cl

# General info 

The temperature data represents temperature anomalies (differences from the mean/expected value) per month and per season (DJF=Dec-Feb, MAM=Mar-May, etc). We will not be working with absolute temperature data as in climate change studies, anomalies are more important than absolute temperature.

For more info [here](https://data.giss.nasa.gov/gistemp/)

https://towardsdatascience.com/time-series-analysis-and-climate-change-7bb4371021e

In [2]:
temp = pd.read_csv("../Data/SST_Global.csv")
temp.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Land-Ocean: Global Means
Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
1880,-.29,-.18,-.11,-.20,-.12,-.23,-.21,-.09,-.16,-.23,-.20,-.23,-.19,***,***,-.14,-.18,-.20
1881,-.16,-.17,.04,.04,.02,-.20,-.07,-.03,-.14,-.21,-.22,-.11,-.10,-.11,-.18,.03,-.10,-.19
1882,.14,.15,.03,-.19,-.16,-.26,-.21,-.06,-.10,-.25,-.16,-.25,-.11,-.10,.06,-.10,-.17,-.17
1883,-.32,-.39,-.13,-.17,-.20,-.13,-.08,-.15,-.21,-.14,-.22,-.16,-.19,-.20,-.32,-.17,-.12,-.19


In [3]:
df = cl.cleaning_temp_data(temp)

In [4]:
df.head(2)

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
1,1880,-0.29,-0.18,-0.11,-0.2,-0.12,-0.23,-0.21,-0.09,-0.16,-0.23,-0.2,-0.23,-0.19,***,***,-0.14,-0.18,-0.2
2,1881,-0.16,-0.17,0.04,0.04,0.02,-0.2,-0.07,-0.03,-0.14,-0.21,-0.22,-0.11,-0.1,-.11,-.18,0.03,-0.1,-0.19


In [None]:
df2 = df.groupby

# Visualization 

More documentation about axes [here](https://plotly.com/python/axes/)

To see the axis:
```python
fig.update_xaxes(showline = True, linewidth = 1, linecolor = "black")
```

## By month 

In [115]:
df.columns

Index(['Year', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
       'Oct', 'Nov', 'Dec', 'J-D', 'D-N', 'DJF', 'MAM', 'JJA', 'SON'],
      dtype='object')

In [118]:
fig = make_subplots(rows = 4, cols = 3, subplot_titles=('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep',
       'Oct', 'Nov', 'Dec'))
count = 1
count2 = 1
count3 = 1
count4 = 1
for i in df.columns[1:13]:
    if count <= 3:
        fig.add_trace(
            go.Scatter(x = df.Year, y = df[i]), row = 1, col = count)
        count += 1
    elif count > 3 and count2 <= 3:
        fig.add_trace(
            go.Scatter(x = df.Year, y = df[i]), row = 2, col = count2)
        count2 += 1
        
    elif count > 3 and count2 > 3 and count3 <= 3:
        fig.add_trace(
            go.Scatter(x = df.Year, y = df[i]), row = 3, col = count3)
        count3 +=1
    else:
        fig.add_trace(
            go.Scatter(x = df.Year, y = df[i]), row = 4, col = count4)
        count4 +=1
        
fig.update_layout(height=1000, width=1000, title_text="Evolution of temperature")
fig.update_layout(
    font_family="Garamond",
    font_color="black",
    font_size = 16,
    title_font_family="Times New Roman",
    title_font_color="green",
    legend_title_font_color="green"
)

# to change the line color and width
fig.update_traces(line=dict(color = "Black", width = 0.3))

# to change the axes 
fig.update_xaxes(tickangle = -45, ticks = "outside", showgrid = False, showline = True, linewidth = 1, linecolor = "black", mirror = True)
fig.update_yaxes(ticks = "outside", showgrid = False, showline = True, linewidth = 1, linecolor = "black", mirror = True)

fig.show()

# Preparing data 

## Create new dataframe with an index for each month

**Frequency Aliases**

Some of the most common are:

- "D" : Day
- "W" : Week
- "H" : Hour
- "T" : Minute
- "S" : Second
- "L" : Milisecond

In [6]:
# create the date range
date_rng = pd.date_range(start='1/1/1880', end='1/03/2019', freq='M')
date_rng[1]

Timestamp('1880-02-29 00:00:00', freq='M')

In [7]:
# Next create the empty DataFrame, which we will populate using the actual data
new_df = pd.DataFrame(date_rng, columns = ["date"])
new_df.head()

Unnamed: 0,date
0,1880-01-31
1,1880-02-29
2,1880-03-31
3,1880-04-30
4,1880-05-31


In [8]:
# Create a column for the anomoly values
new_df["Avg_anomalies"] = None
new_df.head()

Unnamed: 0,date,Avg_anomalies
0,1880-01-31,
1,1880-02-29,
2,1880-03-31,
3,1880-04-30,
4,1880-05-31,


In [9]:
# En times series la fecha debe estar como indice
new_df.set_index("date", inplace = True)
new_df.head()

Unnamed: 0_level_0,Avg_anomalies
date,Unnamed: 1_level_1
1880-01-31,
1880-02-29,
1880-03-31,
1880-04-30,
1880-05-31,


In [10]:
# puede que no sepamos que frecuencia tienen nuestros datos, en pandas tambien lo podemos inferir
dates = pd.date_range(start='1/1/1880', end='1/03/2019',periods=29)
freq = pd.infer_freq(dates)
print(freq)

43518H


## Merge datasets 

In [11]:
# First select only the data that we want
df.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,J-D,D-N,DJF,MAM,JJA,SON
1,1880,-0.29,-0.18,-0.11,-0.2,-0.12,-0.23,-0.21,-0.09,-0.16,-0.23,-0.2,-0.23,-0.19,***,***,-0.14,-0.18,-0.2
2,1881,-0.16,-0.17,0.04,0.04,0.02,-0.2,-0.07,-0.03,-0.14,-0.21,-0.22,-0.11,-0.1,-.11,-.18,0.03,-0.1,-0.19
3,1882,0.14,0.15,0.03,-0.19,-0.16,-0.26,-0.21,-0.06,-0.1,-0.25,-0.16,-0.25,-0.11,-.10,.06,-0.1,-0.17,-0.17
4,1883,-0.32,-0.39,-0.13,-0.17,-0.2,-0.13,-0.08,-0.15,-0.21,-0.14,-0.22,-0.16,-0.19,-.20,-.32,-0.17,-0.12,-0.19
5,1884,-0.16,-0.08,-0.37,-0.43,-0.37,-0.41,-0.35,-0.26,-0.27,-0.24,-0.3,-0.29,-0.29,-.28,-.13,-0.39,-0.34,-0.27


In [12]:
df2 = df.iloc[:,:13]
df2.head()

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
1,1880,-0.29,-0.18,-0.11,-0.2,-0.12,-0.23,-0.21,-0.09,-0.16,-0.23,-0.2,-0.23
2,1881,-0.16,-0.17,0.04,0.04,0.02,-0.2,-0.07,-0.03,-0.14,-0.21,-0.22,-0.11
3,1882,0.14,0.15,0.03,-0.19,-0.16,-0.26,-0.21,-0.06,-0.1,-0.25,-0.16,-0.25
4,1883,-0.32,-0.39,-0.13,-0.17,-0.2,-0.13,-0.08,-0.15,-0.21,-0.14,-0.22,-0.16
5,1884,-0.16,-0.08,-0.37,-0.43,-0.37,-0.41,-0.35,-0.26,-0.27,-0.24,-0.3,-0.29


In [13]:
def populate_df_with_anomolies_from_row(row):
    year = row['Year']
    # Anomaly values (they seem to be a mixture of strings and floats)
    monthly_anomolies = row.iloc[1:]
    print(monthly_anomolies)
    # Abbreviated month names (index names)
    months = monthly_anomolies.index
    print(months)
    for month in monthly_anomolies.index:
        # Get the last day for each month 
        last_day = calendar.monthrange(year,datetime.strptime(month, '%b').month)[1]
        # construct the index with which we can reference our new DataFrame (to populate) 
        date_index = datetime.strptime(f'{year} {month} {last_day}', '%Y %b %d')
        # Populate / set value @ above index, to anomaly value
        t.loc[date_index] = monthly_anomolies[month]

# Apply function to each row of raw data 
_ = df2.apply(lambda row: populate_df_with_anomolies_from_row(row), axis=1)

Jan    -.29
Feb    -.18
Mar    -.11
Apr    -.20
May    -.12
Jun    -.23
Jul    -.21
Aug    -.09
Sep    -.16
Oct    -.23
Nov    -.20
Dec    -.23
Name: 1, dtype: object
Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct',
       'Nov', 'Dec'],
      dtype='object')


TypeError: '<=' not supported between instances of 'int' and 'str'