## Pandas Series

In [2]:
import pandas as pd

In [3]:
# You can create a Series by passing in a list of values.
# When you do this, pandas automatically assigns an index starting at zero and sets the name of
# the Series to None.

# One of the easiest ways to create a Series is from an array-like object,
# such as a list.

students = ['Alice', 'Jack', 'Molly']

# Call the Series constructor in pandas and pass in the students
pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [4]:
# We don't have to use strings. If we pass in a list of whole numbers instead,
# we can see that pandas sets the dtype to int64. Underneath, pandas stores the Series values in a
# typed array using the NumPy library. This offers a significant speedup when processing data
# versus traditional Python lists.

# Let's create a little list of numbers
numbers = [1,2,3]
# And turn that into a Series
pd.Series(numbers)


0    1
1    2
2    3
dtype: int64

In [5]:
# There are some other typing details that exist for performance and are important to know.
# The most important is how NumPy, and thus pandas, handles missing data.

# In Python, we have the None type to indicate a lack of data. But what do we do if we want
# to have a typed list like we do in the Series object?

# Underneath, pandas does some type conversion. If we create a list of strings and one
# element is None, pandas inserts it as None and uses the dtype object for the underlying array.

students = ['Alex', 'Jack', None]
# And let's convert this to a Series
pd.Series(students)

0    Alex
1    Jack
2    None
dtype: object

In [6]:
# If we create a list of numbers, integers or floats, and put in a None, pandas automatically
# converts it to a special floating point value designated NaN, which stands for "Not a Number".
numbers = [1,2,None]
pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [7]:
# Notice a couple of things. First, NaN is a different value from None. Second, pandas set the dtype
# of this Series to floating point numbers instead of object or ints. That's
# maybe a bit of a surprise. Why not just leave this as an integer? Underneath, pandas represents NaN as a floating point number,
# and because integers can be typecast to floats, pandas went and converted our integers to floats.


In [8]:
# NaN is NOT equivalent to None: when we try the equality test, the result is False.

# Let's bring in numpy, which allows us to generate a NaN value
import numpy as np
np.nan == None

False

In [9]:
# You can't meaningfully test NaN for equality with itself: when you do, the answer is always False
np.nan == np.nan

False

In [10]:
# Instead, you need to use a special function to test for the presence of NaN,
# such as the NumPy library's isnan()

np.isnan(np.nan)

True

In [11]:
# So NaN has a similar meaning to None, but it's a numeric value and is treated differently for efficiency reasons.


In [12]:
# Let's talk more about how a pandas Series can be created. While a list might be a common
# way to create some play data, often you have labelled data that you want to manipulate.
# A Series can be created directly from dictionary data. If you do this, the index is
# automatically assigned from the keys of the dictionary that you provided, rather than
# from incrementing integers.

# An example

students_score = {'Alice': 'Physics',
                 'Jack': 'Chemistry',
                 'Molly': 'English'}
s = pd.Series(students_score)
s


Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [13]:
# We see that, since it was string data, pandas set the data type of the series to "object"
# We see that the index, the first column, is also a list of Strings

In [14]:
# Once the Series has been created, we can get the index object using the index attribute

s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [15]:
# The dtype object is not just for strings, but for arbitrary objects. Let's create a more complex
# type of data, say a list of tuples
students = [("Alice", "Brown"), ("Jack","White"), ("Molly", "Green")]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [16]:
# You can also separate your index creation from the data by passing in the index
# explicitly to the Series as a list
s = pd.Series(['Physics', 'Chemistry','English'], index=['Alice','Jack','Molly'])
s


Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [17]:
# When you pass both a dictionary and an index, pandas overrides the automatic index
# creation and uses all of, and only, the index values that you provided. So it will ignore
# every dictionary key which is not in your index, and it will add None or NaN values
# for any index value you provide which is not in your dictionary's keys.

# Example
students_score = {'Alice': 'Physics',
                 'Jack': 'Chemistry',
                 'Molly': 'English'}
# Build the Series with an explicit index: only 'Jack' matches a dictionary key,
# so 'Alice' and 'Molly' are dropped and 'Alex' and 'Sam' get no value
s = pd.Series(students_score, index=['Alex', 'Jack', 'Sam'])
s

Alex          NaN
Jack    Chemistry
Sam           NaN
dtype: object

## Querying the Series

In [2]:
# A pandas Series can be queried either by the index position or by the index label. If you don't give an
# index to the Series, the position and the label are effectively the same values. To
# query by numeric location, starting at zero, use the iloc attribute. To query by the index label,
# use the loc attribute.

# Let's start with an example. We'll use students enrolled in classes, coming from a dictionary
import pandas as pd
students_classes = {'Alice': 'Physics',
                  'Jack': 'Chemistry',
                  'Molly': 'English',
                  'Sam': 'History'}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [3]:
# So, for this Series, if we want to see the fourth entry we can use the
# iloc attribute with the parameter 3 (positions start at zero)
s.iloc[3]

'History'

In [5]:
# If we want to see what class Molly has, we can use the loc attribute with
# a parameter of 'Molly'
s.loc['Molly']

'English'

In [6]:
# iloc and loc are not methods, they are attributes. So you don't use parentheses to query them,
# but square brackets instead, which is called the indexing operator.
# In Python, this calls get or set for an item depending on the context of its use.


In [7]:
# pandas tries to make our code a bit more readable and provides a sort of smart syntax
# using the indexing operator directly on the Series itself. For instance, if you pass
# in an integer parameter, the operator will behave as if you want it to query via the iloc attribute.
# NOTE(review): recent pandas releases deprecate this positional fallback for integer keys
# on a labelled index -- confirm against the installed version; s.iloc[3] is the explicit form.
s[3]

'History'

In [8]:
# If you pass in a label, it will query as if you wanted to use the label-based loc attribute
s['Molly']

'English'

In [9]:
# pandas can't determine automatically whether you're intending to query by index
# position or index label. So you need to be careful when using the indexing operator
# on the Series itself. The safer option is to be more explicit and use the iloc or loc attribute directly.

# Example: a Series indexed by class codes, which are integers
class_code = {99: 'Physics',
             100: 'Chemistry',
             101: 'English',
             102: 'History'}
s = pd.Series(class_code)

In [10]:
# If we try to call s[0] we get a KeyError, because there's no item in the class codes with
# an index label of zero. Instead we have to call iloc explicitly if we want the first item.
s[0]

KeyError: 0

In [11]:
# So that didn't call s.iloc[0] underneath as one might expect; instead it
# generates an error

In [13]:
# Now we know how to get data out of the Series. A common task is to want
# to consider all of the values inside the Series and do some sort of operation on them.
# This could be trying to find a certain number, or summarizing the data, or transforming
# the data in some way.


In [14]:
# A typical approach would be to iterate over every item in the Series
# and invoke the operation we're interested in. For instance, we can create a Series of
# integers representing student grades and compute the average grade by hand.

grades = pd.Series([90, 80, 70, 60])

# Accumulate the running sum one grade at a time, then divide by the number of grades
total = 0
for grade in grades:
    total = total + grade
print(total / len(grades))

75.0


In [15]:
# Pandas and the underlying numpy libraries support a method of computation called vectorization.
# Vectorization works with most of the functions in the numpy library, including the sum function.

In [16]:
# Here's how we would really write the code, using the numpy sum function.
# First, we need to import the numpy module

import numpy as np

# np.sum adds up every value in the Series in a single vectorized call
total = np.sum(grades)
print(total/len(grades))

75.0


In [19]:
# Let's measure how fast numpy's sum function is

# First, create a big Series of random numbers. This is used a lot when demonstrating
# techniques with pandas
numbers = pd.Series(np.random.randint(1,1000,10000)) # 10000 random integers from 1 up to (but not including) 1000

# Peek at the first few values
numbers.head()

0    941
1    202
2    689
3    581
4    344
dtype: int32

In [20]:
# Verify that the length of the Series is correct using the len function
len(numbers)

10000

In [21]:
# The IPython interpreter has something called magic functions, which begin with a
# percentage sign. If you type this sign and then hit the Tab key, you can see a list
# of the available magic functions. You can write your own magic functions too.


In [22]:
# Here we'll use a cellular magic function. These start with two percentage signs and wrap the
# code in the current Jupyter cell. The function we're going to use is called timeit.
# This function will run our code a few times to determine, on average, how long it takes.

# Let's run timeit with our original iterative code. You can give the number of loops that
# you would like to run with -n. By default, timeit chooses the number of loops automatically.
# I'll ask timeit here to use 100 loops because we're recording this. Note that in order to use
# a cellular magic function, it has to be the first line in the cell.

In [23]:
%%timeit -n 100
# Iterative baseline: add up the numbers one at a time in a Python loop, then take the mean
total = 0
for number in numbers:
    total+=number
    
total/len(numbers)

1.35 ms ± 26.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [24]:
# Vectorization

In [25]:
%%timeit -n 100
# Vectorized version: let np.sum add up the whole Series in one call, then take the mean
total = np.sum(numbers)
total/len(numbers)

The slowest run took 4.02 times longer than the fastest. This could mean that an intermediate result is being cached.
126 µs ± 78.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [26]:
# A related feature in pandas and numpy is called broadcasting. With broadcasting, you can
# apply an operation to every value in the Series, changing the Series. For example, if we
# wanted to increase every random value by 2, we could do so quickly using the += operator
# directly on the Series object.

# First, look at the values before the change
numbers.head()

0    941
1    202
2    689
3    581
4    344
dtype: int32

In [27]:
# Increase everything in the Series by 2 with a single broadcast operation
numbers+=2
numbers.head()

0    943
1    204
2    691
3    583
4    346
dtype: int32

In [35]:
# The procedural way of doing this would be to iterate though all of the items
# in the series and increase the values directly. Pandas does support iterating through a series
# much like a dictionary, allowing you to unpack values easily.

# WE can use the iteritems() function whcih returns a label and value.
for label, value in numbers.iteritems():
#     numbers.at(label,value+2)
    numbers.set_value(label, value+2)
numbers.head()

AttributeError: 'Series' object has no attribute 'set_value'

In [36]:
# The .loc attribute lets you not only modify data in place, but also add new
# data as well. If the value you pass in as the index doesn't exist, then a new entry is added.
# And keep in mind, indices can have mixed types. While it's important to be aware of the typing
# going on underneath, pandas will automatically change the underlying NumPy types as appropriate.

In [37]:
# Here's an example: start with a Series that has the default integer index
s = pd.Series([1,2,3])

# Assigning through .loc with a label that doesn't exist yet adds a new entry
s.loc['History'] = 102

s

0            1
1            2
2            3
History    102
dtype: int64

In [39]:
# Build a Series of students and their classes; each student appears once in the index
students_classes = pd.Series({'Alice': 'Physics',
                            'Jack': 'Chemistry',
                            'Molly': 'English',
                            'Sam': 'History'})
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [40]:
# Build a second Series in which all three entries share the same index label, 'Kelly'
kelly_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index=['Kelly','Kelly','Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [41]:
# Append all of the data in the new Series to the first one using pd.concat().
# Series.append() was deprecated in pandas 1.4 and removed in pandas 2.0, whereas
# pd.concat() gives the same result on every version and also returns a new Series.
all_students_classes = pd.concat([students_classes, kelly_classes])

all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [42]:
# Some important considerations when appending Series. First, pandas will take
# the Series and try to infer the best data types to use. In this example, everything is a string.
# Second, appending doesn't actually change the underlying Series objects.
# It instead returns a new Series which is made up of the two appended together. This is a
# common pattern in pandas - by default returning a new object instead of modifying in place - and
# one you should come to expect. By printing the original Series we can see that it hasn't changed.
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [43]:
# Finally, we see that when we query the appended Series for Kelly, we don't get a single value
# but a Series itself, because the label 'Kelly' appears more than once in the index
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object