In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## One dimensional data structures

1. Pandas: Series 
    - more features
    - built on top of NumPy arrays
2. Numpy: Array 
    - simpler
    - similar to a Python list
    
**Similarities:**
    - access elements by position: `a[0] -> 'AL'`
    - access a range of elements (slicing): `a[1:3] -> 'AK','AZ'`
    - use loops: `for x in a:`
**Differences:**
    1. NumPy: 
        - each element should have the same type;
        - includes convenient functions (mean(), std())
        - can be multi-dimensional

In [2]:
# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

In [3]:
# Accessing elements
print(countries[0])
print(countries[3])

Afghanistan
Angola


In [4]:
# Slicing
print(countries[0:3])
print(countries[:3])
print(countries[17:])
print(countries[:])

['Afghanistan' 'Albania' 'Algeria']
['Afghanistan' 'Albania' 'Algeria']
['Bhutan' 'Bolivia' 'Bosnia and Herzegovina']
['Afghanistan' 'Albania' 'Algeria' 'Angola' 'Argentina' 'Armenia'
 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain' 'Bangladesh'
 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Benin' 'Bhutan' 'Bolivia'
 'Bosnia and Herzegovina']


In [5]:
# Element types
print(countries.dtype)
print(employment.dtype)
print(np.array([0, 1, 2, 3]).dtype)
print(np.array([1.0, 1.5, 2.0, 2.5]).dtype)
print(np.array([True, False, True]).dtype)
print(np.array(['AL', 'AK', 'AZ', 'AR', 'CA']).dtype)

<U22
float64
int32
float64
bool
<U2


In [6]:
# Looping
# Iterating over an array directly yields its elements one at a time.
for country in countries:
    print('Examining country {}'.format(country))

# enumerate() supplies the position needed to read the matching entry
# of the parallel employment array.
for i, country in enumerate(countries):
    country_employment = employment[i]
    print('Country {} has employment {}'.format(country, country_employment))

Examining country Afghanistan
Examining country Albania
Examining country Algeria
Examining country Angola
Examining country Argentina
Examining country Armenia
Examining country Australia
Examining country Austria
Examining country Azerbaijan
Examining country Bahamas
Examining country Bahrain
Examining country Bangladesh
Examining country Barbados
Examining country Belarus
Examining country Belgium
Examining country Belize
Examining country Benin
Examining country Bhutan
Examining country Bolivia
Examining country Bosnia and Herzegovina
Country Afghanistan has employment 55.70000076
Country Albania has employment 51.40000153
Country Algeria has employment 50.5
Country Angola has employment 75.69999695
Country Argentina has employment 58.40000153
Country Armenia has employment 40.09999847
Country Australia has employment 61.5
Country Austria has employment 57.09999847
Country Azerbaijan has employment 60.90000153
Country Bahamas has employment 66.59999847
Country Bahrain has employmen

In [7]:
# Numpy functions
print(employment.mean())
print(employment.std())
print(employment.max())
print(employment.sum())

58.68500003850001
9.338269113687888
75.69999695
1173.70000077


In [8]:
countries

array(['Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
       'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
       'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
       'Belize', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina'],
      dtype='<U22')

In [9]:
employment

array([55.70000076, 51.40000153, 50.5       , 75.69999695, 58.40000153,
       40.09999847, 61.5       , 57.09999847, 60.90000153, 66.59999847,
       60.40000153, 68.09999847, 66.90000153, 53.40000153, 48.59999847,
       56.79999924, 71.59999847, 58.40000153, 70.40000153, 41.20000076])

In [10]:
# np.where(condition) returns a tuple holding one index array per dimension;
# [0] selects the positions along the only axis of this 1-D array.
max_positions = np.where(employment == employment.max())[0]
# The first position is enough: it names the (first) country with the maximum.
countries[max_positions[0]]

'Angola'

In [11]:
countries[employment.argmax()] 

'Angola'

In [12]:
employment.max()

75.69999695

In [13]:
def max_employment(countries, employment):
    '''
    Find the country with the highest employment.

    countries and employment are parallel NumPy arrays: the value at
    position k of employment belongs to the country at position k of
    countries.

    Returns a tuple (country_name, employment_value) for the largest
    value in employment.
    '''
    # argmax() gives the position of the largest value, which is also
    # the position of the matching name in the parallel array.
    top_position = employment.argmax()
    return countries[top_position], employment.max()

In [14]:
max_employment(countries, employment)

('Angola', 75.69999695)

## NumPy Supports Vectorized Operations

### Math Operations:

1. A vector is a list of numbers
2. Adding 2 vectors `+`= vector addition 
3. Multiplying by a Scalar `*`: `(1, 2, 3) * 3 = 3, 6, 9` 
4. Divide: `/`
5. Exponentiate: `**`

### Logical Operations

1. And: `&`
2. Or: `|`
3. Not: `~`

### Comparison Operations

1. Greater or equal: `>=`
2. Less or equal: `<=`
3. Equal: `==`
4. Not Equal: `!=`

In [15]:
# Arithmetic operations between 2 NumPy arrays
a = np.array([1, 2, 3, 4])
b = np.array([1, 2, 1, 2])

print(a + b)
print(a - b)
print(a * b)
print(a / b)
print(a ** b)

[2 4 4 6]
[0 0 2 2]
[1 4 3 8]
[1. 1. 3. 2.]
[ 1  4  3 16]


In [16]:
# Arithmetic operations between a NumPy array and a single number
a = np.array([1, 2, 3, 4])
b = 2

print(a + b)
print(a - b)
print(a * b)
print(a / b)
print(a ** b)

[3 4 5 6]
[-1  0  1  2]
[2 4 6 8]
[0.5 1.  1.5 2. ]
[ 1  4  9 16]


In [17]:
# Logical operations with NumPy arrays

a = np.array([True, True, False, False])
b = np.array([True, False, True, False])

print( a & b)
print( a | b)
print( ~a)

print( a & True)
print( a & False)

print( a | True)
print( a | False)

[ True False False False]
[ True  True  True False]
[False False  True  True]
[ True  True False False]
[False False False False]
[ True  True  True  True]
[ True  True False False]


In [18]:
# Comparison operations between 2 NumPy Arrays
a = np.array([1, 2, 3, 4, 5])
b = np.array([5, 4, 3, 2, 1])

print(a > b)
print(a >= b)
print(a < b)
print(a <= b)
print(a == b)
print(a != b)

[False False False  True  True]
[False False  True  True  True]
[ True  True False False False]
[ True  True  True False False]
[False False  True False False]
[ True  True False  True  True]


In [19]:
# Comparison operations between a NumPy array and a single number
a = np.array([1, 2, 3, 4])
b = 2

print(a > b)
print(a >= b)
print(a < b)
print(a <= b)
print(a == b)
print(a != b)

[False False  True  True]
[False  True  True  True]
[ True False False False]
[ True  True False False]
[False  True False False]
[ True False  True  True]


In [20]:
# First 20 countries with school completion data
countries = np.array([
       'Algeria', 'Argentina', 'Armenia', 'Aruba', 'Austria','Azerbaijan',
       'Bahamas', 'Barbados', 'Belarus', 'Belgium', 'Belize', 'Bolivia',
       'Botswana', 'Brunei', 'Bulgaria', 'Burkina Faso', 'Burundi',
       'Cambodia', 'Cameroon', 'Cape Verde'
])

# Female school completion rate in 2007 for those 20 countries
female_completion = np.array([
    97.35583,  104.62379,  103.02998,   95.14321,  103.69019,
    98.49185,  100.88828,   95.43974,   92.11484,   91.54804,
    95.98029,   98.22902,   96.12179,  119.28105,   97.84627,
    29.07386,   38.41644,   90.70509,   51.7478 ,   95.45072
])

# Male school completion rate in 2007 for those 20 countries
male_completion = np.array([
     95.47622,  100.66476,   99.7926 ,   91.48936,  103.22096,
     97.80458,  103.81398,   88.11736,   93.55611,   87.76347,
    102.45714,   98.73953,   92.22388,  115.3892 ,   98.70502,
     37.00692,   45.39401,   91.22084,   62.42028,   90.66958
])

In [21]:
(female_completion + male_completion)/2

array([ 96.416025, 102.644275, 101.41129 ,  93.316285, 103.455575,
        98.148215, 102.35113 ,  91.77855 ,  92.835475,  89.655755,
        99.218715,  98.484275,  94.172835, 117.335125,  98.275645,
        33.04039 ,  41.905225,  90.962965,  57.08404 ,  93.06015 ])

In [22]:
len(female_completion)

20

In [25]:
def overall_completion_rate(female_completion, male_completion):
    '''
    Compute the overall school completion rate for each country.

    Both arguments are NumPy arrays listing the female and male
    completion rates of the same countries in the same order.

    Returns a NumPy array with the element-wise average of the two.
    '''
    combined = female_completion + male_completion
    return combined / 2

In [26]:
overall_completion_rate(female_completion, male_completion)

array([ 96.416025, 102.644275, 101.41129 ,  93.316285, 103.455575,
        98.148215, 102.35113 ,  91.77855 ,  92.835475,  89.655755,
        99.218715,  98.484275,  94.172835, 117.335125,  98.275645,
        33.04039 ,  41.905225,  90.962965,  57.08404 ,  93.06015 ])

## Standardizing Data

**How does one data point compare to the rest?**

eg: employment in US vs other countries

### Convert each data point to number of standard deviations away from the mean

**2007:** 

- mean employment rate: 58.6%
- standard deviation: 10.5%
- US: 62.3%
- Mexico: 57.9%

**Difference:**

- US: 3.7 percentage points above the mean, or 0.35 sd
- Mexico: 0.7 percentage points below the mean, or -0.067 sd

In [27]:
# First 20 countries with employment data
countries = np.array([
    'Afghanistan', 'Albania', 'Algeria', 'Angola', 'Argentina',
    'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas',
    'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium',
    'Belize', 'Benin', 'Bhutan', 'Bolivia',
    'Bosnia and Herzegovina'
])

# Employment data in 2007 for those 20 countries
employment = np.array([
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076
])

In [28]:
# Summary statistics used to standardize the employment data.
# For a NumPy array, std() is the population standard deviation (ddof=0).
mean_employment = employment.mean()
sd_employment = employment.std()
mean_employment, sd_employment

(58.68500003850001, 9.338269113687888)

In [29]:
(employment - employment.mean()) / employment.std()

array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

In [30]:
def standardize_data(values):
    '''
    Return a standardized version of the given values.

    values is a NumPy array. Each element is converted to the number of
    standard deviations it lies away from the mean of values: positive
    for elements above the mean, negative for elements below it.

    The standard deviation is whatever values.std() returns (for a NumPy
    array, the population standard deviation). There is no guard against
    a standard deviation of zero.
    '''
    # Use the argument itself, not the notebook-level `employment` array,
    # so the function works for any data set passed in.
    standardized_values = (values - values.mean()) / values.std()
    return standardized_values

In [31]:
standardize_data(employment)

array([-0.31965231, -0.780123  , -0.87650077,  1.82207181, -0.03051941,
       -1.99019768,  0.30144772, -0.16973184,  0.23719615,  0.84758731,
        0.18365304,  1.00821665,  0.87971351, -0.56595055, -1.07996476,
       -0.20185762,  1.38301845, -0.03051941,  1.2545153 , -1.87240259])

## NumPy Index Arrays

Two arrays of the same length: the first can contain any type, and the second (the index array) contains booleans.

```
a = (1, 2, 3, 4, 5)
b = (F, F, T, T, T) -> index array
b = a > 2           -> builds the same index array with a vectorized comparison
a[b] -> (3, 4, 5)
a[a > 2] -> keep all elements of a that are greater than 2
```

In [32]:
# Using index arrays
a = np.array([1, 2, 3, 4])
b = np.array([True, True, False, False])

print(a[b])
print(a[np.array([True, False, True, False])])

[1 2]
[1 3]


In [33]:
# Creating the index array using vectorized operations
a = np.array([1, 2, 3, 2, 1])
b = (a >= 2)

print(a[b])
print(a[a >= 2])

[2 3 2]
[2 3 2]


In [34]:
# Creating the index array using vectorized operations on another array
a = np.array([1, 2, 3, 4, 5])
b = np.array([1, 2, 3, 2, 1])

print(b == 2)
print(a[b == 2])

[False  True False  True False]
[2 4]


In [35]:
def mean_time_for_paid_students(time_spent, days_to_cancel):
    '''
    Compute the mean classroom time of students who stayed enrolled
    for at least 7 days.

    Both arguments are NumPy arrays in the same student order:
    time_spent holds the time each student spent in the classroom, and
    days_to_cancel holds the number of days until each student canceled
    (integers only; every student is assumed to have canceled).

    Returns the mean of time_spent over the students whose
    days_to_cancel is greater than or equal to 7.
    '''
    # Boolean index array: True for students enrolled 7 days or longer.
    stayed_a_week = days_to_cancel >= 7
    paid_time = time_spent[stayed_a_week]
    return paid_time.mean()

In [36]:
# Time spent in the classroom in the first week for 20 students
time_spent = np.array([
       12.89697233,    0.        ,   64.55043217,    0.        ,
       24.2315615 ,   39.991625  ,    0.        ,    0.        ,
      147.20683783,    0.        ,    0.        ,    0.        ,
       45.18261617,  157.60454283,  133.2434615 ,   52.85000767,
        0.        ,   54.9204785 ,   26.78142417,    0.
])

# Days to cancel for 20 students
days_to_cancel = np.array([
      4,   5,  37,   3,  12,   4,  35,  38,   5,  37,   3,   3,  68,
     38,  98,   2, 249,   2, 127,  35
])

In [37]:
days_to_cancel[days_to_cancel >= 7]

array([ 37,  12,  35,  38,  37,  68,  38,  98, 249, 127,  35])

In [38]:
[days_to_cancel >= 7]

[array([False, False,  True, False,  True, False,  True,  True, False,
         True, False, False,  True,  True,  True, False,  True, False,
         True,  True])]

In [39]:
time_spent[days_to_cancel >= 7].mean()

41.05400348545454

In [40]:
mean_time_for_paid_students(time_spent, days_to_cancel)

41.05400348545454

## Difference + vs +=

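1. `+` creates a new array: `a = a + np.array([1, 1, 1, 1])` builds a new array and rebinds `a` to it, so any other name that refers to the original array still sees the old values.
2. `+=` modifies the original array: `a += np.array([1, 1, 1, 1])` updates the existing array, so every name that refers to it sees the change.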

## In-Place vs Not In-Place

1. `+=` operates in-place: it stores the new values in the same locations the original values occupied, rather than creating a new array to hold them.
2. `+` does not operate in-place: it creates a new array for the result.

In [41]:
a = np.array([1, 2, 3, 4, 5])
# Basic slicing returns a view onto a's data, not a copy. The name avoids
# `slice`, which would shadow the Python builtin for the rest of the session.
first_three = a[:3]
first_three[0] = 100  # writing through the view also changes a[0]
first_three, a

(array([100,   2,   3]), array([100,   2,   3,   4,   5]))

## Pandas Series

A Pandas Series is similar to a NumPy array, with extra functionality, e.g. `describe()`.

**Similarities:** all you can do with NumPy array:
    - accessing elements;
    - looping;
    - convenient functions;
    - vectorized operations;
    - implemented in C;
    
In Python, `True + True == 2`, so summing a boolean Series counts its `True` values.

In [42]:
countries = ['Albania', 'Algeria', 'Andorra', 'Angola', 'Antigua and Barbuda',
             'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan',
             'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus',
             'Belgium', 'Belize', 'Benin', 'Bhutan', 'Bolivia']

life_expectancy_values = [74.7,  75. ,  83.4,  57.6,  74.6,  75.4,  72.3,  81.5,  80.2,
                          70.3,  72.1,  76.4,  68.1,  75.2,  69.8,  79.4,  70.8,  62.7,
                          67.3,  70.6]

gdp_values = [ 1681.61390973,   2155.48523109,  21495.80508273,    562.98768478,
              13495.1274663 ,   9388.68852258,   1424.19056199,  24765.54890176,
              27036.48733192,   1945.63754911,  21721.61840978,  13373.21993972,
                483.97086804,   9783.98417323,   2253.46411147,  25034.66692293,
               3680.91642923,    366.04496652,   1175.92638695,   1132.21387981]

# Life expectancy and gdp data in 2007 for 20 countries
life_expectancy = pd.Series(life_expectancy_values)
gdp = pd.Series(gdp_values)

In [43]:
# Accessing elements and slicing
print(life_expectancy[0])
print(gdp[3:6])

74.7
3      562.987685
4    13495.127466
5     9388.688523
dtype: float64


In [44]:
# Looping
for country_life_expectancy in life_expectancy:
    print('Examining life expectancy {}'.format(country_life_expectancy))

Examining life expectancy 74.7
Examining life expectancy 75.0
Examining life expectancy 83.4
Examining life expectancy 57.6
Examining life expectancy 74.6
Examining life expectancy 75.4
Examining life expectancy 72.3
Examining life expectancy 81.5
Examining life expectancy 80.2
Examining life expectancy 70.3
Examining life expectancy 72.1
Examining life expectancy 76.4
Examining life expectancy 68.1
Examining life expectancy 75.2
Examining life expectancy 69.8
Examining life expectancy 79.4
Examining life expectancy 70.8
Examining life expectancy 62.7
Examining life expectancy 67.3
Examining life expectancy 70.6


In [45]:
# Pandas functions
print(life_expectancy.mean())
print(life_expectancy.std())
print(gdp.max())
print(gdp.sum())

72.86999999999999
6.213999474869968
27036.48733192
182957.59832967006


In [46]:
# Vectorized operations and index arrays
a = pd.Series([1, 2, 3, 4])
b = pd.Series([1, 2, 1, 2])

print(a + b)
print(a * 2)
print(a >= 3)
print(a[a >= 3])

0    2
1    4
2    4
3    6
dtype: int64
0    2
1    4
2    6
3    8
dtype: int64
0    False
1    False
2     True
3     True
dtype: bool
2    3
3    4
dtype: int64


In [47]:
a = pd.Series([1, 2, 3, 4])
b = pd.Series([4, 5, 6, 7])

In [48]:
c = pd.Series([1, 2, 3, 4])
d = pd.Series([7, 6, 5, 4])

In [49]:
sum(d<d.mean()), sum(d>d.mean())

(2, 2)

In [50]:
gdp, life_expectancy

(0      1681.613910
 1      2155.485231
 2     21495.805083
 3       562.987685
 4     13495.127466
 5      9388.688523
 6      1424.190562
 7     24765.548902
 8     27036.487332
 9      1945.637549
 10    21721.618410
 11    13373.219940
 12      483.970868
 13     9783.984173
 14     2253.464111
 15    25034.666923
 16     3680.916429
 17      366.044967
 18     1175.926387
 19     1132.213880
 dtype: float64, 0     74.7
 1     75.0
 2     83.4
 3     57.6
 4     74.6
 5     75.4
 6     72.3
 7     81.5
 8     80.2
 9     70.3
 10    72.1
 11    76.4
 12    68.1
 13    75.2
 14    69.8
 15    79.4
 16    70.8
 17    62.7
 18    67.3
 19    70.6
 dtype: float64)

In [51]:
def variable_correlation(variable1, variable2):
    '''
    Count how often two variables lie on the same side of their means.

    Each argument is a Pandas Series. For every data point, the
    "direction" of a variable is whether its value is above or below
    that variable's mean.

    Returns a tuple (num_same_direction, num_different_direction).
    A data point counts as "same" only when both values are strictly
    above, or both strictly below, their respective means; every other
    data point (including ones where a value equals its mean) counts
    as "different".

    Examples:
        pd.Series([1, 2, 3, 4]) and pd.Series([4, 5, 6, 7]) -> (4, 0)
        (1 and 4 are both below their means, 2 and 5 are both below,
        3 and 6 are both above, 4 and 7 are both above.)

        pd.Series([1, 2, 3, 4]) and pd.Series([7, 6, 5, 4]) -> (0, 4)
        (1 is below its mean but 7 is above its mean, and so on.)
    '''
    mean1 = variable1.mean()
    mean2 = variable2.mean()

    # Boolean Series: True where both values sit on the same side.
    same_side = ((variable1 > mean1) & (variable2 > mean2)) | \
                ((variable1 < mean1) & (variable2 < mean2))

    # Summing booleans counts the True entries.
    same_count = same_side.sum()
    different_count = len(variable1) - same_count

    return (same_count, different_count)

In [52]:
# indicates that they are positively correlated
variable_correlation(life_expectancy, gdp)

(17, 3)

## Pandas Indexes

- Pandas series has index
- NumPy arrays are like souped-up Python lists
- A Pandas series is like a cross between a list and a dictionary:
    - list: elements stored in order and access them by their position;
    - dictionary: you can have a key and a value, and look up values by keys;
- The Pandas attribute `loc` lets you look up values by their index label: `life_expectancy.loc['Angola']`
- The Pandas attribute `iloc` lets you access elements by position: `life_expectancy.iloc[0]`
- Pandas `idxmax()` - returns the index of the first maximally-valued element. 
- You can find the documentation for the `idxmax()` function in Pandas [here](https://pandas.pydata.org/docs/reference/api/pandas.Series.idxmax.html).

In [53]:
life_expectancy = pd.Series([74.7, 75., 83.4, 57.6],
                               index=['Albania',
                                     'Algeria',
                                     'Andorra',
                                     'Angola'])
life_expectancy

Albania    74.7
Algeria    75.0
Andorra    83.4
Angola     57.6
dtype: float64

In [54]:
# Plain [] with an integer on a label-indexed Series falls back to position.
# NOTE(review): recent pandas versions deprecate this fallback; prefer
# .iloc[0] for positional access. Confirm against the installed version.
life_expectancy[0]

74.7

In [55]:
life_expectancy.loc['Angola']

57.6

In [56]:
life_expectancy.iloc[0]

74.7

In [57]:
countries = [
    'Afghanistan', 'Albania', 'Algeria', 'Angola',
    'Argentina', 'Armenia', 'Australia', 'Austria',
    'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh',
    'Barbados', 'Belarus', 'Belgium', 'Belize',
    'Benin', 'Bhutan', 'Bolivia', 'Bosnia and Herzegovina',
]


employment_values = [
    55.70000076,  51.40000153,  50.5       ,  75.69999695,
    58.40000153,  40.09999847,  61.5       ,  57.09999847,
    60.90000153,  66.59999847,  60.40000153,  68.09999847,
    66.90000153,  53.40000153,  48.59999847,  56.79999924,
    71.59999847,  58.40000153,  70.40000153,  41.20000076,
]

# Employment data in 2007 for 20 countries
employment = pd.Series(employment_values, index=countries)
employment

Afghanistan               55.700001
Albania                   51.400002
Algeria                   50.500000
Angola                    75.699997
Argentina                 58.400002
Armenia                   40.099998
Australia                 61.500000
Austria                   57.099998
Azerbaijan                60.900002
Bahamas                   66.599998
Bahrain                   60.400002
Bangladesh                68.099998
Barbados                  66.900002
Belarus                   53.400002
Belgium                   48.599998
Belize                    56.799999
Benin                     71.599998
Bhutan                    58.400002
Bolivia                   70.400002
Bosnia and Herzegovina    41.200001
dtype: float64

In [58]:
employment.max()

75.69999695

In [59]:
employment.idxmax()

'Angola'

In [60]:
def max_employment(employment):
    '''
    Find the country with the highest employment.

    employment is a Pandas Series whose values are employment figures
    and whose index holds the country names.

    Returns a tuple (country_name, employment_value) for the largest
    value in the series. idxmax() supplies the index label of the
    maximum, so no separate array of names is needed.

    Note: this definition replaces the two-argument NumPy version of
    max_employment defined earlier in this notebook.
    '''
    return employment.idxmax(), employment.max()

In [61]:
max_employment(employment)

('Angola', 75.69999695)

## Vectorized Operations and Series Indexes


In [62]:
# Addition when indexes are the same
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['a', 'b', 'c', 'd'])
print(s1 + s2)

a    11
b    22
c    33
d    44
dtype: int64


In [63]:
# Indexes have same elements in a different order
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['b', 'd', 'a', 'c'])
print(s1 + s2)

a    31
b    12
c    43
d    24
dtype: int64


In [64]:
# Indexes overlap, but do not have exactly the same elements
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
print(s1 + s2)

a     NaN
b     NaN
c    13.0
d    24.0
e     NaN
f     NaN
dtype: float64


In [65]:
# Indexes do not overlap
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['e', 'f', 'g', 'h'])
print(s1 + s2)

a   NaN
b   NaN
c   NaN
d   NaN
e   NaN
f   NaN
g   NaN
h   NaN
dtype: float64


## Filling Missing Values

- If the indexes don't match, the result contains NaN for every label that is missing from either series.

In [66]:
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([10, 20, 30, 40], index=['c', 'd', 'e', 'f'])
sum_result = s1 + s2

In [67]:
# dropna() returns a new Series without the NaN entries; sum_result itself
# is left unchanged here. Passing inplace=True would modify sum_result
# directly instead of returning a new object.
sum_result.dropna()

c    13.0
d    24.0
dtype: float64

In [68]:
# Alternative solution: treat missing values as 0 before the addition
s1.add(s2, fill_value=0)

a     1.0
b     2.0
c    13.0
d    24.0
e    30.0
f    40.0
dtype: float64

## Pandas Series apply()

- for computations that aren't built into Pandas
- takes a series and a function and returns a new series by applying the function to every element of the original series
- it is like 'map()' but it works on series instead of lists
- `s+3` the same as `s.apply(add3)`

In [69]:
# Example pandas apply() usage (although this could have been done
# without apply() using vectorized operations)
s = pd.Series([1, 2, 3, 4, 5])
def add_one(x):
    '''Return x increased by one.'''
    incremented = x + 1
    return incremented
print(s.apply(add_one))

0    2
1    3
2    4
3    5
4    6
dtype: int64


In [70]:
names = pd.Series([
    'Andre Agassi',
    'Barry Bonds',
    'Christopher Columbus',
    'Daniel Defoe',
    'Emilio Estevez',
    'Fred Flintstone',
    'Greta Garbo',
    'Humbert Humbert',
    'Ivan Ilych',
    'James Joyce',
    'Keira Knightley',
    'Lois Lane',
    'Mike Myers',
    'Nick Nolte',
    'Ozzy Osbourne',
    'Pablo Picasso',
    'Quirinus Quirrell',
    'Rachael Ray',
    'Susan Sarandon',
    'Tina Turner',
    'Ugueth Urbina',
    'Vince Vaughn',
    'Woodrow Wilson',
    'Yoji Yamada',
    'Zinedine Zidane'
])

In [71]:
names

0             Andre Agassi
1              Barry Bonds
2     Christopher Columbus
3             Daniel Defoe
4           Emilio Estevez
5          Fred Flintstone
6              Greta Garbo
7          Humbert Humbert
8               Ivan Ilych
9              James Joyce
10         Keira Knightley
11               Lois Lane
12              Mike Myers
13              Nick Nolte
14           Ozzy Osbourne
15           Pablo Picasso
16       Quirinus Quirrell
17             Rachael Ray
18          Susan Sarandon
19             Tina Turner
20           Ugueth Urbina
21            Vince Vaughn
22          Woodrow Wilson
23             Yoji Yamada
24         Zinedine Zidane
dtype: object

In [72]:
def reverse_name(name):
    '''
    Convert a name from "Firstname Lastname" to "Lastname, Firstname".

    The name is split on single spaces; only the first two pieces are
    used, and a name without a space raises IndexError.
    '''
    pieces = name.split(" ")
    first, last = pieces[0], pieces[1]
    return ', '.join((last, first))

In [73]:
def reverse_names(names):
    '''
    Return a new series in which every name of the input series has
    been converted from "Firstname Lastname" to "Lastname, Firstname".

    names is a Pandas Series of strings. The conversion of each element
    is delegated to reverse_name via Series.apply(), so no explicit
    loop is needed.
    '''
    reversed_series = names.apply(reverse_name)
    return reversed_series

In [74]:
reverse_names(names)

0             Agassi, Andre
1              Bonds, Barry
2     Columbus, Christopher
3             Defoe, Daniel
4           Estevez, Emilio
5          Flintstone, Fred
6              Garbo, Greta
7          Humbert, Humbert
8               Ilych, Ivan
9              Joyce, James
10         Knightley, Keira
11               Lane, Lois
12              Myers, Mike
13              Nolte, Nick
14           Osbourne, Ozzy
15           Picasso, Pablo
16       Quirrell, Quirinus
17             Ray, Rachael
18          Sarandon, Susan
19             Turner, Tina
20           Urbina, Ugueth
21            Vaughn, Vince
22          Wilson, Woodrow
23             Yamada, Yoji
24         Zidane, Zinedine
dtype: object

## Plotting in Pandas
If the variable `data` is a NumPy array or a Pandas Series, just like if it is a list, the code

```
import matplotlib.pyplot as plt
plt.hist(data)
```

will create a histogram of the data.

Pandas also has built-in plotting that uses matplotlib behind the scenes, so if `data` is a Series, you can create a histogram using `data.hist()`.

There's no difference between these two in this case, but sometimes the Pandas wrapper can be more convenient. For example, you can make a line plot of a series using `data.plot()`. The index of the Series will be used for the x-axis and the values for the y-axis.

In [75]:
a = np.array([1, 2, 3, 4])
b = np.array([4, 3, 2, 1])
np.dot(a,b)

20

In [76]:
a = np.array([1, 2, 3, 4])
b = np.array([[1,2,3,4],
            [1,2,3,4],
            [1,2,3,4],
            [1,2,3,4]])

In [77]:
a*b

array([[ 1,  4,  9, 16],
       [ 1,  4,  9, 16],
       [ 1,  4,  9, 16],
       [ 1,  4,  9, 16]])

In [78]:
np.dot(a,b)

array([10, 20, 30, 40])

In [79]:
np.matmul(a,b)

array([10, 20, 30, 40])

In [80]:
np.matmul(b,a)

array([30, 30, 30, 30])

In [81]:
np.dot(b,a)

array([30, 30, 30, 30])