# Series

In [1]:
#Now we'll learn about dropping entries
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [16]:
#Lets create a Series (array of data and data labels, its index)

obj = Series([3,6,9,12])

#Show
obj

0     3
1     6
2     9
3    12
dtype: int64

In [17]:
#Lets show the values
obj.values

array([ 3,  6,  9, 12], dtype=int64)

In [18]:
#Lets show the index
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [21]:
#Now lets create a Series with an index

#WW2 casualties 
ww2_cas = Series([8700000,4300000,3000000,2100000,400000],index=['USSR','Germany','China','Japan','USA'])

#Show
ww2_cas

USSR       8700000
Germany    4300000
China      3000000
Japan      2100000
USA         400000
dtype: int64

In [22]:
#Now we can use index values to select Series values
ww2_cas['USA']

400000

In [26]:
#Can also check with array operations

#Check who had casualties greater than 4 million
ww2_cas[ww2_cas>4000000]

USSR       8700000
Germany    4300000
dtype: int64

In [27]:
#Can treat Series as ordered dictionary

#Check if USSR is in Series
'USSR' in ww2_cas

True

In [31]:
#Can convert Series into Python dictionary
ww2_dict = ww2_cas.to_dict()

#Show
ww2_dict

{'China': 3000000,
 'Germany': 4300000,
 'Japan': 2100000,
 'USA': 400000,
 'USSR': 8700000}

In [34]:
#Can convert back into a Series
WW2_Series = Series(ww2_dict)

In [35]:
#Show
WW2_Series

China      3000000
Germany    4300000
Japan      2100000
USA         400000
USSR       8700000
dtype: int64

In [36]:
#Passing a dictionary the index will have the dict keys in order
countries = ['China','Germany','Japan','USA','USSR','Argentina']


In [37]:
#Lets redefine a Series
obj2 = Series(ww2_dict,index=countries)

In [38]:
#Show
obj2

China        3000000
Germany      4300000
Japan        2100000
USA           400000
USSR         8700000
Argentina        NaN
dtype: float64

In [39]:
#We can use isnull and notnull to find missing data
pd.isnull(obj2)

#obj2.isnull() 

China        False
Germany      False
Japan        False
USA          False
USSR         False
Argentina     True
dtype: bool

In [40]:
#Same for the opposite
pd.notnull(obj2)

#obj2.notnull()

China         True
Germany       True
Japan         True
USA           True
USSR          True
Argentina    False
dtype: bool

In [41]:
#Lets see the ww2 Series again
WW2_Series

China      3000000
Germany    4300000
Japan      2100000
USA         400000
USSR       8700000
dtype: int64

In [42]:
#Lets check our Series with Argentine again
obj2

China        3000000
Germany      4300000
Japan        2100000
USA           400000
USSR         8700000
Argentina        NaN
dtype: float64

In [43]:
#Now we can add and pandas automatically aligns data by index
WW2_Series + obj2

Argentina         NaN
China         6000000
Germany       8600000
Japan         4200000
USA            800000
USSR         17400000
dtype: float64

In [45]:
#We can give Series names
obj2.name = "World War 2 Casualties"

In [46]:
#Show
obj2

China        3000000
Germany      4300000
Japan        2100000
USA           400000
USSR         8700000
Argentina        NaN
Name: World War 2 Casualties, dtype: float64

In [47]:
#We can also name index
obj2.index.name = 'Countries'

In [48]:
#Show
obj2

Countries
China        3000000
Germany      4300000
Japan        2100000
USA           400000
USSR         8700000
Argentina        NaN
Name: World War 2 Casualties, dtype: float64

In [7]:
list_s = [1,2,-3,6.2,"Data Values"]
print(list_s)

[1, 2, -3, 6.2, 'Data Values']


In [8]:
series1 = pd.Series(list_s)
print(series1) # press 'Tab' for completing variable name

0              1
1              2
2             -3
3            6.2
4    Data Values
dtype: object


In [9]:
type(series1)

pandas.core.series.Series

In [None]:
# What is object?
# what is series?
# Meaning of pandas.core.series.Series

In [11]:
empty_s = pd.Series([]) # press 'Shift + Tab' within parentheses of Series
print(empty_s)

Series([], dtype: float64)


  empty_s = pd.Series([])


In [20]:
# Creating series using single values
series_scaler = pd.Series(0.5, index = ['i'], dtype = float )
print(series_scaler)

i    0.5
dtype: float64


In [21]:
series_scaler = pd.Series(0.5, index = ['i','ii','iii'], dtype = float )
print(series_scaler)

i      0.5
ii     0.5
iii    0.5
dtype: float64


In [23]:
# Creeating series using dictionary
dict_series = pd.Series({'a':1,'b':2})
print(dict_series)

a    1
b    2
dtype: int64


In [24]:
series = pd.Series([1,2,3,4,5])
print(series)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [25]:
series[0] # series[index]

1

In [26]:
series[1:4] # last index is not counted

1    2
2    3
3    4
dtype: int64

In [28]:
max(series)

5

In [29]:
min(series)

1

In [30]:
series[series>3]

3    4
4    5
dtype: int64

# Drop Entry

In [2]:
#Create a new series to play with
ser1 = Series(np.arange(3),index=['a','b','c'])

#Show
ser1

a    0
b    1
c    2
dtype: int64

In [3]:
#Now let's drop an index
ser1.drop('b')

a    0
c    2
dtype: int64

In [4]:
#With a DataFrame we can drop values from either axis
dframe1 = DataFrame(np.arange(9).reshape((3,3)),index=['SF','LA','NY'],columns=['pop','size','year'])

#Show (remember just random values)
dframe1

Unnamed: 0,pop,size,year
SF,0,1,2
LA,3,4,5
NY,6,7,8


In [10]:
#Now dropping a row
dframe1.drop('LA')

Unnamed: 0,pop,size,year
SF,0,1,2
NY,6,7,8


In [13]:
#Or we could drop a column

#Need to specify that axis is 1, not 0
dframe1.drop('year',axis=1)

Unnamed: 0,pop,size
SF,0,1
LA,3,4
NY,6,7


In [1]:
#Next we'll learn about selecting entires in a DataFrame!

# Selecting Entries

In [9]:
#Lets try some Series indexing
ser1 = Series(np.arange(3),index=['A','B','C'])

#multiply all values by 2, to avoid confusion in future
ser1 = 2*ser1

#Show
ser1 

A    0
B    2
C    4
dtype: int32

In [11]:
#Can grab entry by index name
ser1['B']

2

In [13]:
#Or grab by index 
ser1[1]

2

In [15]:
#Can also grab by index range
ser1[0:3]

A    0
B    2
C    4
dtype: int32

In [16]:
#Or grab range by range of index values
ser1[['A','B','C']]

A    0
B    2
C    4
dtype: int32

In [17]:
#Or grab by logic
ser1[ser1>3]

C    4
dtype: int32

In [19]:
#Can also ser using these methods
ser1[ser1>3] = 10

#Show
ser1

A     0
B     2
C    10
dtype: int32

In [20]:
#Now let's see sleection in a DataFrame

dframe = DataFrame(np.arange(25).reshape((5,5)),index=['NYC','LA','SF','DC','Chi'],columns=['A','B','C','D','E'])

#Show
dframe

Unnamed: 0,A,B,C,D,E
NYC,0,1,2,3,4
LA,5,6,7,8,9
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [21]:
#Select by column name
dframe['B']

NYC     1
LA      6
SF     11
DC     16
Chi    21
Name: B, dtype: int32

In [23]:
#Select by multiple columns
dframe[['B','E']]

Unnamed: 0,B,E
NYC,1,4
LA,6,9
SF,11,14
DC,16,19
Chi,21,24


In [24]:
#Can also use boolean
dframe[dframe['C']>8]

Unnamed: 0,A,B,C,D,E
SF,10,11,12,13,14
DC,15,16,17,18,19
Chi,20,21,22,23,24


In [25]:
#Can also just shoe a boolean DataFrame
dframe> 10

Unnamed: 0,A,B,C,D,E
NYC,False,False,False,False,False
LA,False,False,False,False,False
SF,False,True,True,True,True
DC,True,True,True,True,True
Chi,True,True,True,True,True


In [26]:
#Can alos use ix as previously discussed to label-index
dframe.ix['LA']

A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32

In [28]:
#Another example
dframe.ix[1]

A    5
B    6
C    7
D    8
E    9
Name: LA, dtype: int32

# Data Alignment

In [2]:
#Lets start by making two Series

ser1 = Series([0,1,2],index=['A','B','C'])

#Show
ser1

A    0
B    1
C    2
dtype: int64

In [5]:
#Now second Series 2
ser2 = Series([3,4,5,6],index=['A','B','C','D'])

#Show 
ser2 

A    3
B    4
C    5
D    6
dtype: int64

In [6]:
#So what happens when we add these together
ser1 + ser2

A     3
B     5
C     7
D   NaN
dtype: float64

In [7]:
#Note the NaN values are added in automatically

In [8]:
# Now let's try it with DataFrames!
dframe1 = DataFrame(np.arange(4).reshape(2,2),columns=list('AB'),index=['NYC','LA'])

#Show
dframe1

Unnamed: 0,A,B
NYC,0,1
LA,2,3


In [10]:
#Second DataFrame
dframe2 = DataFrame(np.arange(9).reshape(3,3),columns=list('ADC'),index=['NYC','SF','LA'])

#Show
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


In [11]:
#What happens when we add them together?

dframe1 + dframe2


Unnamed: 0,A,B,C,D
LA,8.0,,,
NYC,0.0,,,
SF,,,,


In [13]:
#What if we want to replace the NaN values
# Then we can use .add()

dframe1.add(dframe2,fill_value=0)

Unnamed: 0,A,B,C,D
LA,8,3.0,8,7
NYC,0,1.0,2,1
SF,3,,5,4


In [14]:
#Now we can see that the values are filled, however there was no SF,B value so that is still NaN

In [18]:
#Lets learn about operations betwen a Series and a DataFrame

In [19]:
#Show
dframe2

Unnamed: 0,A,D,C
NYC,0,1,2
SF,3,4,5
LA,6,7,8


In [23]:
#Create a Series from DataFrame's 0 row
ser3 = dframe2.ix[0]

#Show
ser3

A    0
D    1
C    2
Name: NYC, dtype: int32

In [24]:
#Now we can use arithmetic operations
dframe2-ser3

Unnamed: 0,A,D,C
NYC,0,0,0
SF,3,3,3
LA,6,6,6
