In [None]:
from IPython.display import HTML

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Apply matplotlib's 'fivethirtyeight' style sheet to the plots drawn below.
plt.style.use('fivethirtyeight')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline



In [None]:
# Load the two datasets used throughout this notebook.

# Read in the drinks data.
drinks = pd.read_csv('./data/drinks.csv')
# When reading the file pandas misinterprets the continent code "NA" as a null
# value, not "North America", so put the code back.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# emits a FutureWarning in pandas 2.x and no longer updates `drinks` once
# Copy-on-Write is enabled.
drinks['continent'] = drinks['continent'].fillna('NA')

# Read in the ufo data.
ufo = pd.read_csv('./data/ufo.csv')
# Don't dwell on these two lines - we'll look at datetimes in detail in DS4.
ufo['Time'] = pd.to_datetime(ufo['Time'])  # parse the raw Time column into datetimes
ufo['Year'] = ufo['Time'].dt.year          # keep just the year of each report

In [None]:
# Preview the first five rows of the ufo dataset to see what it contains.
ufo.head(n=5)

In [None]:
# Tally the ufo reports per year, then order the tallies by year
# (sort_index sorts on the year labels, not on the counts).
(
    ufo['Year']
    .value_counts()
    .sort_index()
)

In [None]:
# The same yearly counts as a line plot -- UFO sightings by year.
# Years have a natural order, so connecting the points with a line makes sense.
ufo['Year'].value_counts().sort_index().plot.line()


In [None]:
# Display the State column of the ufo data.
ufo.loc[:, 'State']

In [None]:
# COMMON MISTAKE: Don't use a line plot when the x-axis has no meaningful order!

# For example, the continents below are joined by a line that suggests a trend
#    where none exists ... it would be just as valid to plot them in any other order.

# So, a line plot is the wrong type of plot for this data.
# Always think about what you're plotting and whether it makes sense.

drinks['continent'].value_counts().plot(kind='line')

In [None]:
# The same counts as a bar plot -- a much better choice for unordered categories.
# figsize sets the figure's (width, height).
drinks['continent'].value_counts().plot.bar(figsize=(10, 6));

In [None]:
# Calculate the mean alcohol amounts for each continent.
# numeric_only=True averages only the numeric columns; since pandas 2.0 the
# default is numeric_only=False, so .mean() raises a TypeError if the frame
# holds any non-numeric column other than the grouping key.
drinks.groupby('continent').mean(numeric_only=True)

In [None]:
# Bar plot of one variable: mean total litres of pure alcohol per continent.
(
    drinks
    .groupby('continent')['total_litres_of_pure_alcohol']
    .mean()
    .plot.bar()
);

In [None]:
# Side-by-side bar plots - if we don't specify a column, pandas will plot bars (or lines) of all columns on the same axes.
# numeric_only=True averages only the numeric columns; since pandas 2.0 the
# default is numeric_only=False, so .mean() raises a TypeError if the frame
# holds any non-numeric column other than the grouping key.
drinks.groupby('continent').mean(numeric_only=True).plot(kind='bar');

In [None]:
# Histogram: shows how the values of a column are distributed.
# The column's values are split into "bins" along the x-axis, and the bar over
# each bin is as tall as the number of records that fall into that bin.
drinks['beer_servings'].plot.hist()

In [None]:
# More bins: the range of the data is divided into 20 evenly spaced bins.
drinks['beer_servings'].plot.hist(bins=20);

In [None]:
# Scatter plot: compare two variables, one on each axis.
drinks.plot.scatter(x='beer_servings', y='wine_servings');

In [None]:
# Same scatter plot with semi-transparent markers (alpha), which can help
# when many points are tightly clustered.
drinks.plot.scatter(x='beer_servings', y='wine_servings', alpha=0.3);

In [None]:
# VERY brief intro to another library for more advanced plotting: seaborn
# (already imported as `sns` in the imports cell at the top of the notebook,
# so it is not imported again here).
# The trailing semicolon hides the text repr of the returned object.
sns.pairplot(drinks);

In [None]:
# Pairwise correlations between the numeric columns, drawn as a heatmap.
# Non-numeric columns are dropped first: `continent` holds strings (e.g. 'NA'),
# and since pandas 2.0 DataFrame.corr() raises on non-numeric data instead of
# silently skipping it.
drinks_correlations = drinks.select_dtypes(include='number').corr()
sns.heatmap(drinks_correlations)