In [None]:
# Scratch cell: a bare expression on the last line is echoed as the cell's output
1+1

In [None]:
# An assignment shows no output; `b` stays available to the cells run afterwards
b = 2

In [None]:
# Uses `b` from the previous cell, so that cell has to be run first
b + b

# Data Analysis Notebook Introduction

This is a markdown cell: here I write formatted text to document my notebook.

In this cell I can add links to [how to improve your markdown writing](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#links)

![donkey and pig image](donkey_pig.jpg)

# NumPy

NumPy is a powerful library for numerical computing and a foundation for data analysis in Python.

In [2]:
import numpy as np # canonical import

In [None]:
# A plain nested Python list: two rows of three integers
my_list = [
    [1, 2, 3],
    [4, 5, 6],
]

# display() shows the type; the bare last line is echoed as the cell output
display(type(my_list))
my_list

In [None]:
# The same kind of nested data as a NumPy array.
# The single float (1.1) makes NumPy store every element as a float.
my_array = np.array([
    [1.1, 2, 3],
    [4, 5, 6],
])

display(type(my_array))
my_array

---
Exploring Attributes

In [None]:
# Number of axes (dimensions) of the array
my_array.ndim

In [None]:
# Length of the array along each axis, as a tuple
my_array.shape

In [None]:
# Total number of elements in the array
my_array.size

In [None]:
# Data type shared by all the elements of the array
my_array.dtype

## Slicing

🤔 How can we select specific values from a **ROW** ?

In [3]:
# 5x5 grid where each value encodes its position: tens digit = row, units digit = column
data_list = [[10 * row + col for col in range(5)] for row in range(5)]

# The same values as a NumPy array, echoed as the cell output
data_np = np.array(data_list)
data_np

array([[ 0,  1,  2,  3,  4],
       [10, 11, 12, 13, 14],
       [20, 21, 22, 23, 24],
       [30, 31, 32, 33, 34],
       [40, 41, 42, 43, 44]])

In [5]:
# pure Python: index the row first, then slice the resulting list (stop index 4 is excluded)
data_list[2][1:4]

[21, 22, 23]

In [6]:
# using NumPy: the row index and the column slice go inside a single pair of brackets
data_np[2,1:4]

array([21, 22, 23])

🤔 How can we select specific values from a **COLUMN** ?

In [7]:
# Pure Python: take the element at index 4 from every row except the first one
selection = [row[4] for row in data_list[1:]]

selection

[14, 24, 34, 44]

In [10]:
# using NumPy - column 4, but only for the rows from position 2 onward
# NOTE(review): the pure-Python loop above starts at row 1; data_np[1:, 4] would match it,
# and data_np[:, 4] would select the entire column.
data_np[2:,4]

array([24, 34, 44])

In [11]:
# Integers 0 through 9 (with a single argument, arange starts counting at 0)
array = np.arange(10)
array

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

---
General Syntax

array[**start:stop** *:step*]

In [12]:
# A stop value beyond the end of the array is clipped to the array length instead of raising an error
array[0:20:1]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

## Vectorized Operations

In [13]:
# Eight rows of two integers each
my_list = [
    [6, 5], [1, 3], [5, 6], [1, 4],
    [3, 7], [5, 8], [3, 5], [8, 4],
]

In [None]:
# Pure Python
# add elements 0 and 1 of each row (list)
sums = [row[0] + row[1] for row in my_list]

sums

In [None]:
# Using NumPy: convert the list of rows into a 2-D array
my_np = np.array(my_list)

In [None]:
# Echo the array to inspect it
my_np

In [None]:
# Element-wise addition of column 0 and column 1 - one result per row, no loop needed
my_np[:,0] + my_np[:,1]

In [None]:
# axis=1 sums across the columns: one total per row
my_np.sum(axis=1)

In [None]:
# axis=0 sums down the rows: one total per column
my_np.sum(axis=0)

In [None]:
# Other notation: the function form, equivalent to my_np.sum(axis=0)
np.sum(my_np, axis = 0)

In [None]:
# The scalar is added to every element; the result is a new array and my_np is not modified
my_np + 1

**How much faster is NumPy?**

In [None]:
# 2D-array of shape (10_000, 10_000) with random floats in the half-open interval [0, 1). That's 100M numbers!
# NOTE(review): this rebinds `my_array` (earlier a small 2x3 array), and no seed is set,
# so the values - and the totals computed below - differ on every run.
my_array = np.random.rand(10000, 10000)
# A nested-list copy of the same numbers is needed to time the pure-Python loop
array_list = my_array.tolist()
my_array.size

In [None]:
%%time
# Pure-Python baseline: visit every number one at a time with nested loops
total = 0

for row in array_list:
    for number in row:
        total += number

round(total, 2)

In [None]:
%%time
# The same total computed by NumPy in a single call on the array
round(np.sum(my_array), 2)

## Boolean Index

In [None]:
# Reminder of the current contents of my_np
my_np

In [None]:
# Element-wise comparison: returns an array of booleans with the same shape as my_np
my_np>4

In [None]:
# Using the boolean array as an index keeps only the elements where it is True (flattened to 1-D)
my_np[my_np > 4]

In [None]:
# Assignment through a boolean mask: every element greater than 4 is overwritten with 10
# NOTE: this modifies my_np in place, so cells run afterwards see the changed values
my_np[my_np > 4] = 10
my_np

# Pandas

Manipulating data of many different types

In [14]:
import pandas as pd # canonical import

## Data Series

Used to define a single column of data

In [15]:
# A Series can hold mixed values; mixing ints and a str gives the generic `object` dtype.
# A dict works too, its keys becoming the index labels:
# my_series = pd.Series({'id1': 1, 'id2': 2, 'id3': 'three'})
values = [1, 2, 'three']
my_series = pd.Series(values)

my_series

0        1
1        2
2    three
dtype: object

## DataFrame

A table made up of several columns, each of which is a Series

In [16]:
# Build the table column by column: each dict key is a column name and
# each list holds that column's values from the first row to the last
df = pd.DataFrame(
    {
        'col_a': [4, 5, 6],
        'col_b': [7, 8, 9],
        'col_c': [10, 11, 12],
    },
    index=['row_1', 'row_2', 'row_3'],
)

df

Unnamed: 0,col_a,col_b,col_c
row_1,4,7,10
row_2,5,8,11
row_3,6,9,12


In [17]:
# Slicing with plain brackets selects rows by integer position (end excluded),
# so this returns only the first row (the row at position 0)
# Shorter version of df.iloc[0:1]
df[0:1]

Unnamed: 0,col_a,col_b,col_c
row_1,4,7,10


In [18]:
# Uses label-based indexing - returns the rows labelled 'row_1' through 'row_3' (both ends included)
df.loc['row_1':'row_3']

Unnamed: 0,col_a,col_b,col_c
row_1,4,7,10
row_2,5,8,11
row_3,6,9,12


In [19]:
# .loc[rows, columns]: a label slice for the rows and a list of labels for the columns
df.loc['row_1':'row_3', ['col_a', 'col_b']]

Unnamed: 0,col_a,col_b
row_1,4,7
row_2,5,8
row_3,6,9


In [20]:
# The labels used by .loc are the values stored in the DataFrame's index
print(df.index)

Index(['row_1', 'row_2', 'row_3'], dtype='object')


In [21]:
# Two Series sharing the same index labels, ready to be combined into one DataFrame
# (a Series is echoed as plain text; a DataFrame is rendered as a nicer table)
fruit_ids = ['id1', 'id2', 'id3']
apples = pd.Series([1, 2, 3], index=fruit_ids)
oranges = pd.Series([4, 5, 6], index=fruit_ids)

oranges

id1    4
id2    5
id3    6
dtype: int64

---
Creating a DataFrame from two Series

In [None]:
# Each keyword becomes a column name; rows are aligned on the index labels of the Series
fruits = pd.DataFrame(dict(apples=apples, oranges=oranges))

fruits

In [None]:
# Single brackets with one column name return a Series
display(type(fruits['oranges']))

fruits['oranges']

In [None]:
# Double brackets [[ ]] (a list of column names inside the indexing brackets) return a DataFrame
display(type(fruits[['oranges']]))

fruits[['oranges']]

# Exploratory Data Analysis

Taking a look at some global data

In [None]:
# decimal=',' tells pandas that numbers in this file use a comma as the decimal separator
countries_df = pd.read_csv('countries.csv',decimal=',')

countries_df.head()

In [None]:
# Tip: typing pd.read<TAB> lists the matching functions - often quicker than searching the documentation
# pd.read<SHIFT+TAB> pops up the documentation tooltip

In [None]:
# (number of rows, number of columns)
countries_df.shape

In [None]:
# The column labels
countries_df.columns

In [None]:
# Even better for checking the columns: the dtype each one was parsed as.
# Only the last expression of a cell is echoed, so display() is needed to show this one.
display(countries_df.dtypes)

# info() prints the dtypes plus the non-null count of every column - handy when planning data cleaning
countries_df.info()

In [None]:
# Easier to read: the number of missing values in each column
countries_df.isnull().sum()

In [None]:
# By the way: isnull() alone returns a DataFrame of booleans (True where a value is missing) -
# the same idea as Boolean Indexing :) - so don't forget .sum() to get the counts
countries_df.isnull()

In [None]:
# Key descriptive statistics (count, mean, std, min, quartiles, max) - will be useful for ML!
countries_df.describe()

In [None]:
# First rows of the table (head() shows 5 by default)
countries_df.head()

In [None]:
# Last 10 rows of the table
countries_df.tail(10)

**Reading columns**

In [None]:
# Single brackets return a Series: it is one-dimensional and shown as plain text, so harder to read
display(type(countries_df['Country']))
countries_df['Country']

In [None]:
# Double brackets return a DataFrame, which is rendered as a table and is easier to read :)
# The inner brackets are a list naming the columns to keep
display(type(countries_df[['Country']]))
countries_df[['Country', 'Region']]

In [None]:
# Selecting specific rows by index label - note: with .loc the end label (15) is included
countries_df.loc[10:15,['Country', 'Region']]

In [None]:
# Remember that the default index starts from 0, not 1
countries_df.loc[0:5,['Country', 'Region']]

In [None]:
# Note: if you assign with = to a .loc selection, the selected rows are replaced with the value you assign

In [None]:
# .iloc selects by integer position and, unlike .loc, excludes the end of each slice
countries_df.iloc[10:15,0:2]

**Countries with more than a billion people**

In [None]:
# Comparing a column with a value gives a boolean Series: True for rows with more than a billion people
condition = countries_df['Population'] > 1_000_000_000
condition

In [None]:
# Filter with the boolean condition written inline; the shorter equivalent is
# countries_df[condition]
countries_df[countries_df['Population'] > 1_000_000_000]


**Countries in the Americas**

In [None]:
# Looking at the first 20 rows, I can see a pattern: these countries have "AMER" in the Region column
countries_df.head(20)

In [None]:
# Even better, and for confirmation: list the distinct Region values
# Notice that the values are padded with a lot of blank spaces!
countries_df['Region'].unique()

In [None]:
# Then I create a condition: True where the Region text contains 'AMER'.
# na=False treats a missing Region as "no match", so the mask stays purely boolean
# and can safely be used to filter the DataFrame.
american = countries_df['Region'].str.contains('AMER', na=False)
# To combine several conditions, wrap each one in parentheses and join them with & (and) or | (or)

In [None]:
# Apply the condition: keep only the rows where it is True
countries_df[american]

**European Countries**

In [None]:
# isin() looks for exact matches only - but the Region values (not the column names) aren't clean yet,
# so values padded with extra whitespace will not match
countries_df[countries_df['Region'].isin(['WESTERN EUROPE', 'EASTERN EUROPE'])]

In [None]:
# There is a lot of extra whitespace around the values... they need stripping!
countries_df['Region'].unique()

In [None]:
# Remove leading/trailing whitespace from every Region value and store the result back in the column
countries_df['Region'] = countries_df['Region'].str.strip()

In [None]:
# Alternative to .str.strip(), but slower:
# .map calls str.strip on each cell and the resulting values replace the ones in the column.
# na_action='ignore' leaves missing values untouched instead of raising a TypeError.
countries_df['Country'] = countries_df['Country'].map(str.strip, na_action='ignore')

In [None]:
# Same exact-match filter as before, now run against the stripped Region values
countries_df[countries_df['Region'].isin(['WESTERN EUROPE', 'EASTERN EUROPE'])]

In [None]:
# Check the distinct Region values again after stripping
countries_df['Region'].unique()

## Reindexing your DataFrame

In [None]:
# Replace the default index with the country names.
# inplace=True modifies countries_df itself (the previous index is gone) instead of returning a new DataFrame.
# The guard makes the cell safe to re-run: once 'Country' is the index it is no longer
# a column, and calling set_index('Country') a second time would raise a KeyError.
if countries_df.index.name != 'Country':
    countries_df.set_index('Country', inplace=True)

In [None]:
# Compare with the earlier head() output: country names are now the row labels (index)
countries_df.head()

In [None]:
# Done! With country names as the index, .loc can filter by label:
# all rows from 'France' through 'Germany' (both included), in index order
countries_df.loc['France':'Germany', ['Region', 'Population']]

## Sorting

In [None]:
# Note: this only displays a sorted copy; countries_df itself is unchanged
# To keep the sorted order in countries_df, pass inplace=True:
# countries_df.sort_index(ascending=False, inplace=True)
countries_df.sort_index(ascending=False)

In [None]:
# We can also sort by values: the five rows with the largest Population
# Several sort keys are possible too, each with its own direction:
# countries_df.sort_values(by=['Population','Region'],ascending=(False,True)).head(5)
countries_df.sort_values('Population',ascending=False).head(5)

In [None]:
# na_position='first' puts the rows with a missing GDP at the top of the result
countries_df.sort_values(by='GDP ($ per capita)', na_position='first').head(5)

## Group By

In [None]:
# groupby() alone returns a GroupBy object, not a result table -
# an aggregation (such as sum) still has to be applied to it
regions = countries_df.groupby('Region')

In [None]:
# Total population per region
regions[['Population']].sum()

In [None]:
# Total population and area per region, largest population first
region_totals = countries_df.groupby('Region')[['Population', 'Area (sq. mi.)']].sum()
region_totals.sort_values('Population', ascending=False)

## Plotting

In [None]:
# little taste!
%matplotlib inline
import matplotlib

In [None]:
# Ten countries with the highest GDP per capita, kept as a one-column DataFrame
gdp_column = 'GDP ($ per capita)'
top_10_countries = (
    countries_df[[gdp_column]]
    .sort_values(gdp_column, ascending=False)
    .head(10)
)

top_10_countries

In [None]:
# Bar chart of the top 10; a title and y-axis label let the figure stand on its own.
# DataFrame.plot returns the matplotlib Axes, which is used to set the label.
ax = top_10_countries.plot(kind='bar', title='Top 10 countries by GDP per capita')
# The trailing semicolon hides the text repr of the returned label object
ax.set_ylabel('GDP ($ per capita)');