# Import Modules

In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Reading in data
Pandas has multiple functions for reading in data in a wide array of formats. We will use the `read_csv()` function to read in data contained in a csv file. You can find functions for reading many different formats using Pandas [here](https://pandas.pydata.org/docs/reference/io.html).

In [2]:
# Read the csv file into a DataFrame using the read_csv() function
data = pd.read_csv("../Data/GlobalLandTemperaturesByCity.csv")
# Display the first 5 rows of data (the default) using the head() method
data.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


# Accessing Pandas DataFrame Attributes
Attributes in Python are variables that contain data specific to an initialized object. You access attributes using the "." notation, for example `object.attributeVariable`. Below we go through some of the commonly used attributes specific to initialized Pandas DataFrames, but you can view all the attributes available [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) after minimizing the DataFrame methods list. (The attributes begin at DataFrame.index.)

In [3]:
# Check the dimensions of the data, as (number of rows, number of columns),
# using the .shape attribute
data.shape

(8599212, 7)

In [4]:
# Check the column names using the .columns attribute (returns a pandas Index)
data.columns

Index(['dt', 'AverageTemperature', 'AverageTemperatureUncertainty', 'City',
       'Country', 'Latitude', 'Longitude'],
      dtype='object')

In [5]:
# Check each column's data type using the .dtypes attribute
data.dtypes

dt                                object
AverageTemperature               float64
AverageTemperatureUncertainty    float64
City                              object
Country                           object
Latitude                          object
Longitude                         object
dtype: object

# Selecting Subsets of Data
In Pandas there are several ways to filter and take different subsets of your data. Below we walk through some of the most common ways to do so, but you can learn more with the examples provided [here](https://pandas.pydata.org/docs/getting_started/intro_tutorials/03_subset_data.html).

In [6]:
# Select columns by passing a list [] of column names to the DataFrame;
# the result keeps the columns in the order given in the list
columns = ["City", "Country", "dt"]
data[columns].head()

Unnamed: 0,City,Country,dt
0,Århus,Denmark,1743-11-01
1,Århus,Denmark,1743-12-01
2,Århus,Denmark,1744-01-01
3,Århus,Denmark,1744-02-01
4,Århus,Denmark,1744-03-01


In [7]:
# Select a slice of the data by integer position using the iloc indexer
data.iloc[30:41, :] # Rows 30 -> 40 (41 not included) and all columns

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
30,1746-05-01,,,Århus,Denmark,57.05N,10.33E
31,1746-06-01,,,Århus,Denmark,57.05N,10.33E
32,1746-07-01,,,Århus,Denmark,57.05N,10.33E
33,1746-08-01,,,Århus,Denmark,57.05N,10.33E
34,1746-09-01,,,Århus,Denmark,57.05N,10.33E
35,1746-10-01,,,Århus,Denmark,57.05N,10.33E
36,1746-11-01,,,Århus,Denmark,57.05N,10.33E
37,1746-12-01,,,Århus,Denmark,57.05N,10.33E
38,1747-01-01,,,Århus,Denmark,57.05N,10.33E
39,1747-02-01,,,Århus,Denmark,57.05N,10.33E


# Converting Data Types
In Pandas we can convert data types to reflect the variables more accurately. One way of doing this is to use the provided Pandas module functions that take in a DataFrame column (also called a Pandas Series) and convert the data to the specified new data type. A second way is to use the `astype()` method. Below we will use the first method described, but you can view examples of using the second method [here](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html).

In [8]:
# Convert the string dates to datetime objects using the to_datetime function
data['dt'] = pd.to_datetime(data['dt'])
# Display the dtypes again to confirm that 'dt' is now a datetime column
data.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
City                                     object
Country                                  object
Latitude                                 object
Longitude                                object
dtype: object

In [9]:
# City and Country hold repeated labels, so store both as categorical columns
for categoricalColumn in ("City", "Country"):
    data[categoricalColumn] = data[categoricalColumn].astype("category")
# Display the dtypes again to confirm the conversion
data.dtypes

dt                               datetime64[ns]
AverageTemperature                      float64
AverageTemperatureUncertainty           float64
City                                   category
Country                                category
Latitude                                 object
Longitude                                object
dtype: object

In [10]:
# Clean up the latitude and longitude variables to be able to convert them to floats
def _to_signed_degrees(coords: pd.Series, positive_suffix: str) -> pd.Series:
    """Convert coordinate strings such as "57.05N" into signed floats.

    Parameters
    ----------
    coords : pd.Series
        Strings made of a decimal number followed by a one-letter hemisphere
        suffix (for example "57.05N" or "10.33E").
    positive_suffix : str
        Suffix that keeps the value positive ("N" for latitude, "E" for
        longitude). Any other suffix makes the value negative.

    Returns
    -------
    pd.Series
        Float values, negated wherever the suffix is not `positive_suffix`.
    """
    # Everything except the last character is the numeric part
    magnitude = coords.str[:-1].astype(float)
    # Keep the magnitude where the suffix matches, otherwise flip the sign
    return magnitude.where(coords.str[-1] == positive_suffix, -magnitude)


# Vectorised string operations avoid a Python-level loop over every row and
# the regular expression (whose "\d" was an invalid escape in a normal string)
data["Latitude"] = _to_signed_degrees(data["Latitude"], "N")
data["Longitude"] = _to_signed_degrees(data["Longitude"], "E")

In [11]:
# Display the first rows again to check the cleaned Latitude and Longitude columns
data.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05,10.33
1,1743-12-01,,,Århus,Denmark,57.05,10.33
2,1744-01-01,,,Århus,Denmark,57.05,10.33
3,1744-02-01,,,Århus,Denmark,57.05,10.33
4,1744-03-01,,,Århus,Denmark,57.05,10.33


# Generating Summary Statistics

In [12]:
# Descriptive statistics using the describe() method
# (the last two columns, Latitude and Longitude, are left out of the summary)
describeOptions = {"include": "all"}
# pandas 1.x needs datetime_is_numeric=True to summarise the datetime column
# with mean/min/quantiles/max. pandas 2.0 made that the default and removed
# the argument, so passing it there raises a TypeError.
if int(pd.__version__.split(".")[0]) < 2:
    describeOptions["datetime_is_numeric"] = True
data.iloc[:, :-2].describe(**describeOptions)

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country
count,8599212,8235082.0,8235082.0,8599212,8599212
unique,,,,3448,159
top,,,,Springfield,India
freq,,,,9545,1014906
mean,1907-10-21 07:06:00.828411904,16.72743,1.028575,,
min,1743-11-01 00:00:00,-42.704,0.034,,
25%,1860-06-01 00:00:00,10.299,0.337,,
50%,1911-09-01 00:00:00,18.831,0.591,,
75%,1962-09-01 00:00:00,25.21,1.349,,
max,2013-09-01 00:00:00,39.651,15.396,,


In [13]:
# More information: column dtypes, memory usage and the non-null count of each
# column (the total number of entries minus the non-null count gives the
# number of missing values)
data.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Non-Null Count    Dtype         
---  ------                         --------------    -----         
 0   dt                             8599212 non-null  datetime64[ns]
 1   AverageTemperature             8235082 non-null  float64       
 2   AverageTemperatureUncertainty  8235082 non-null  float64       
 3   City                           8599212 non-null  category      
 4   Country                        8599212 non-null  category      
 5   Latitude                       8599212 non-null  float64       
 6   Longitude                      8599212 non-null  float64       
dtypes: category(2), datetime64[ns](1), float64(4)
memory usage: 361.0 MB


# Generating Visualizations
We can create interactive figures relatively easily using plotly. Plotly has an express module for quick and simple figures and a graph objects module that can be used for more customizability. You can find a gallery of plotly figures [here](https://plotly.com/python/).

In [14]:
# Box plot comparing the temperature distributions of Denmark and Lebanon
isSelectedCountry = data["Country"].isin(["Denmark", "Lebanon"])
boxData = data[isSelectedCountry]
boxFig = px.box(
    boxData,
    y="AverageTemperature",
    color="Country",
    points="all",
    title="Average Temperature Plot",
)
boxFig

In [15]:
# Yearly mean temperature for Brazil: filter the rows, resample the monthly
# records by year on the "dt" column, then average within each year
brazilData = data[data["Country"]=="Brazil"]
lineData = (
    brazilData
        .resample("Y", on="dt")["AverageTemperature"]
        .mean()
        .reset_index()
)
lineFig = px.line(
    lineData,
    x="dt",
    y="AverageTemperature",
    title="Yearly Average Temperature in Brazil",
)
lineFig.update_layout(xaxis_title="Year", yaxis_title="Average Temperature")
lineFig

In [16]:
# Yearly average temperature of every United States city for the years after 2008
mapData = (
    data[(data["Country"]=="United States")&(data["dt"].dt.year > 2008)]
        .set_index("dt")
        # Country and City are categorical columns: observed=True restricts the
        # groups to the combinations present in the filtered rows instead of
        # relying on the default handling of unused categories
        .groupby(["Country", "City", "Latitude", "Longitude"], observed=True)
        .resample("Y")["AverageTemperature"]
        .mean()
        .reset_index()
)

# Filter the 2009 rows once and reuse them for every trace argument
mapData2009 = mapData[mapData["dt"].dt.year==2009]

mapFig = go.Figure()

# One marker per city, placed by longitude/latitude; the marker size grows
# with the city's average temperature
mapFig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = mapData2009["Longitude"],
        lat = mapData2009["Latitude"],
        text = ["City: {}<br>Avg Temp: {}".format(x, y) for x, y in zip(mapData2009["City"], mapData2009["AverageTemperature"])],
        marker = dict(
            size = 10 + mapData2009["AverageTemperature"].values,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ), name = "Avg Temp 2009"))

mapFig.update_layout(
        title_text = 'Average Temperature in the United States; Year of 2009',
        showlegend = True,
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )

mapFig.show()

# Bonus: Animated Figure

In [17]:
def _yearly_scattergeo(yearlyData, year):
    """Build the Scattergeo trace for a single year of `yearlyData`.

    Parameters
    ----------
    yearlyData : pd.DataFrame
        Frame with the columns "dt", "City", "Latitude", "Longitude" and
        "AverageTemperature" (the layout of `mapData`).
    year : int
        Calendar year whose rows are plotted.

    Returns
    -------
    go.Scattergeo
        One marker per row of the selected year, sized by
        10 + AverageTemperature and labelled with city and temperature.
    """
    # Filter once and reuse the rows for every trace argument
    yearRows = yearlyData[yearlyData["dt"].dt.year==year]
    return go.Scattergeo(
        locationmode = 'USA-states',
        lon = yearRows["Longitude"],
        lat = yearRows["Latitude"],
        text = ["City: {}<br>Avg Temp: {}".format(x, y) for x, y in zip(yearRows["City"], yearRows["AverageTemperature"])],
        marker = dict(
            size = 10 + yearRows["AverageTemperature"].values,
            line_color='rgb(40,40,40)',
            line_width=0.5,
            sizemode = 'area'
        ))


# Instantiate a dictionary that will hold our figure data and configurations
mapFig_dict = {
    "data": [],
    "layout": {},
    "frames": []
}

# Define the layout of the figure: a Play and a Pause button
mapFig_dict["layout"]["updatemenus"] = [
    {
        "buttons": [
            {
                # redraw=True: skipping the redraw is an optimisation meant for
                # cartesian scatter plots; geo traces are redrawn so that each
                # frame's markers are actually shown
                "args": [None, {"frame": {"duration": 500, "redraw": True},
                                "fromcurrent": True, "transition": {"duration": 300,
                                                                    "easing": "quadratic-in-out"}}],
                "label": "Play",
                "method": "animate"
            },
            {
                # Animating to [None] with zero durations stops the running animation
                "args": [[None], {"frame": {"duration": 0, "redraw": False},
                                  "mode": "immediate",
                                  "transition": {"duration": 0}}],
                "label": "Pause",
                "method": "animate"
            }
        ],
        "direction": "left",
        "pad": {"r": 10, "t": 87},
        "showactive": False,
        "type": "buttons",
        "x": 0.1,
        "xanchor": "right",
        "y": 0,
        "yanchor": "top"
    }
]

# Slider configuration; one step per year is appended in the loop below
sliders_dict = {
    "active": 0,
    "yanchor": "top",
    "xanchor": "left",
    "currentvalue": {
        "font": {"size": 20},
        "prefix": "Year:",
        "visible": True,
        "xanchor": "right"
    },
    "transition": {"duration": 300, "easing": "cubic-in-out"},
    "pad": {"b": 10, "t": 50},
    "len": 0.9,
    "x": 0.1,
    "y": 0,
    "steps": []
}

# Package the data by year
years = sorted(mapData["dt"].dt.year.unique())

## First year figure (shown before the animation starts)
# Derived from the data instead of hard-coded so it follows the filter used to build mapData
firstYear = years[0]
map_firstYear = _yearly_scattergeo(mapData, firstYear)
mapFig_dict["data"].append(map_firstYear)

## One frame and one slider step for every year (including the first one)
for year in years:
    byYearFig = {"data": [_yearly_scattergeo(mapData, year)], "name": str(year)}
    mapFig_dict["frames"].append(byYearFig)

    # Define the slider step that jumps to this year's frame (matched by frame name)
    slider_step = {"args": [
            [str(year)],
            {"frame": {"duration": 300, "redraw": True},
            "mode": "immediate",
            "transition": {"duration": 300}}
        ],
            "label": str(year),
            "method": "animate"}
    # Add each step to the configuration dictionary
    sliders_dict["steps"].append(slider_step)

# Add the slider to the figure layout
mapFig_dict["layout"]["sliders"] = [sliders_dict]
# Generate the figure
mapFig = go.Figure(mapFig_dict)
mapFig.update_layout(
        title_text = 'Average Temperature in the United States',
        showlegend = False,
        geo = dict(
            scope = 'usa',
            landcolor = 'rgb(217, 217, 217)',
        )
    )
mapFig