Starting with Numpy

In [169]:
#load the library and check its version, just to make sure we aren't using an older version
import numpy as np
np.__version__
'1.12.1'

'1.12.1'

In [170]:
#create a list comprising numbers from 0 to 9
L = list(range(10))

In [171]:
#converting integers to string - this style of handling lists is known as list comprehension.
#List comprehension offers a versatile way to handle list manipulations tasks easily. We'll learn about them in future tutorials. Here's an example.  

[str(c) for c in L]
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

[type(item) for item in L]
[int, int, int, int, int, int, int, int, int, int]

[int, int, int, int, int, int, int, int, int, int]

Creating Arrays

In [172]:
#creating arrays
np.zeros(10, dtype='int')
array=([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


#creating a 3 row x 5 column matrix
np.ones((3,5), dtype=float)
array=([[ 1.,  1.,  1.,  1.,  1.],
      [ 1.,  1.,  1.,  1.,  1.],
      [ 1.,  1.,  1.,  1.,  1.]])


#creating a matrix with a predefined value
np.full((3,5),1.23)
array=([[ 1.23,  1.23,  1.23,  1.23,  1.23],
      [ 1.23,  1.23,  1.23,  1.23,  1.23],
      [ 1.23,  1.23,  1.23,  1.23,  1.23]])


#create an array with a set sequence
np.arange(0, 20, 2)
array=([0, 2, 4, 6, 8,10,12,14,16,18])


#create an array of even space between the given range of values
np.linspace(0, 1, 5)
array=([ 0., 0.25, 0.5 , 0.75, 1.])


#create a 3x3 array with mean 0 and standard deviation 1 in a given dimension
np.random.normal(0, 1, (3,3))
array=([[ 0.72432142, -0.90024075,  0.27363808],
      [ 0.88426129,  1.45096856, -1.03547109],
      [-0.42930994, -1.02284441, -1.59753603]])


#create an identity matrix
np.eye(3)
array=([[ 1.,  0.,  0.],
      [ 0.,  1.,  0.],
      [ 0.,  0.,  1.]])


#set a random seed
np.random.seed(0)


x1 = np.random.randint(10, size=6) #one dimension
x2 = np.random.randint(10, size=(3,4)) #two dimension
x3 = np.random.randint(10, size=(3,4,5)) #three dimension


print("x3 ndim:", x3.ndim)
print("x3 shape:", x3.shape)
print("x3 size: ", x3.size)
('x3 ndim:', 3)
('x3 shape:', (3, 4, 5))
('x3 size: ', 60)

<class 'TypeError'>: 'list' object is not callable

Array Indexing

In [173]:
x1 = np.array=([4, 3, 4, 4, 8, 4])
x1
array=([4, 3, 4, 4, 8, 4])

#assess value to index zero
x1[0]
4

#assess fifth value
x1[4]
8

#get the last value
x1[-1]
4

#get the second last value
x1[-2]
8

#in a multidimensional array, we need to specify row and column index
x2
array=([[3, 7, 5, 5],
      [0, 1, 5, 9],
      [3, 0, 5, 0]])


#1st row and 2nd column value
x2[2,3]
0

#3rd row and last value from the 3rd column
x2[2,-1]
0


#replace value at 0,0 index
x2[0,0] = 12
x2
array=([[12,  7,  5,  5],
      [ 0,  1,  5,  9],
      [ 3,  0,  5,  0]])

Array Slicing

In [174]:
x = np.arange(10)
x
array=([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


#from start to 4th position
x[:5]
array=([0, 1, 2, 3, 4])


#from 4th position to end
x[4:]
array=([4, 5, 6, 7, 8, 9])


#from 4th to 6th position
x[4:7]
array=([4, 5, 6])


#return elements at even place
x[ : : 2]
array=([0, 2, 4, 6, 8])


#return elements from first position step by two
x[1::2]
array=([1, 3, 5, 7, 9])


#reverse the array
x[::-1]
array=([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

Array Concatenation

In [175]:
#You can concatenate two or more arrays at once.
x = np.array=([1, 2, 3])
y = np.array=([3, 2, 1])
z = [21,21,21]
np.concatenate([x, y,z])
array=([ 1,  2,  3,  3,  2,  1, 21, 21, 21])


#You can also use this function to create 2-dimensional arrays.
grid = np.array=([[1,2,3],[4,5,6]])
np.concatenate([grid,grid])
array=([[1, 2, 3],
      [4, 5, 6],
      [1, 2, 3],
      [4, 5, 6]])


#Using its axis parameter, you can define row-wise or column-wise matrix
np.concatenate([grid,grid],axis=1)
array=([[1, 2, 3, 1, 2, 3],
      [4, 5, 6, 4, 5, 6]])

In [176]:
x = np.array=([3,4,5])
grid = np.array=([[1,2,3],[17,18,19]])
np.vstack([x,grid])
array=([[ 3,  4,  5],
      [ 1,  2,  3],
      [17, 18, 19]])


#Similarly, you can add an array using np.hstack
z = np.array=([[9],[9]])
np.hstack([grid,z])
array=([[ 1,  2,  3,  9],
      [17, 18, 19,  9]])

x = np.arange(10)
x
array=([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])


x1,x2,x3 = np.split(x,[3,6])
print (x1,x2,x3)
    [0 1 2] [3 4 5] [6 7 8 9]

grid = np.arange(16).reshape((4,4))
grid
upper,lower = np.vsplit(grid,[2])
print (upper, lower)
(array=([[0, 1, 2, 3],
       [4, 5, 6, 7]]), array([[ 8,  9, 10, 11],
       [12, 13, 14, 15]]))

Let's start with Pandas

In [177]:
#load library - pd is just an alias. I used pd because it's short and literally abbreviates pandas.
#You can use any name as an alias. 
import pandas as pd

In [178]:
#create a data frame - dictionary is used here where keys get converted to column names and values to row values.
data = pd.DataFrame({'Country': ['Russia','Colombia','Chile','Equador','Nigeria'],
                    'Rank':[121,40,100,130,11]})
data

<class 'TypeError'>: 'list' object is not callable

In [179]:
#We can do a quick analysis of any data set using:
data.describe()

<class 'TypeError'>: 'list' object is not callable

In [180]:
#Among other things, it shows the data set has 5 rows and 2 columns with their respective names.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
Country    5 non-null object
Rank       5 non-null int64
dtypes: int64(1), object(1)
memory usage: 152.0+ bytes


#Let's create another data frame.
data = pd.DataFrame({'group':['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],'ounces':[4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

<class 'SyntaxError'>: invalid syntax (<ipython-input-180-b2a8ed4e2d24>, line 3)

In [181]:
#Let's sort the data frame by ounces - inplace = True will make changes to the data
data.sort_values(by=['ounces'],ascending=True,inplace=False)

<class 'TypeError'>: 'list' object is not callable

In [182]:
data.sort_values(by=['group','ounces'],ascending=[True,False],inplace=False)

<class 'TypeError'>: 'list' object is not callable

In [183]:
#create another data with duplicated rows
data = pd.DataFrame({'k1':['one']*3 + ['two']*4, 'k2':[3,2,1,3,3,4,4]})
data

<class 'TypeError'>: 'list' object is not callable

In [184]:
#sort values 
data.sort_values(by='k2')

<class 'KeyError'>: 'k2'

In [185]:
#remove duplicates - ta da! 
data.drop_duplicates()

<class 'TypeError'>: 'list' object is not callable

In [186]:
data.drop_duplicates(subset='k1')

<class 'KeyError'>: Index(['k1'], dtype='object')

In [187]:
data = pd.DataFrame({'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami','corned beef', 'Bacon', 'pastrami', 'honey ham','nova lox'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

	

<class 'TypeError'>: 'list' object is not callable

In [188]:
meat_to_animal = {
'bacon': 'pig',
'pulled pork': 'pig',
'pastrami': 'cow',
'corned beef': 'cow',
'honey ham': 'pig',
'nova lox': 'salmon'
}

def meat_2_animal(series):
    if series['food'] == 'bacon':
        return 'pig'
    elif series['food'] == 'pulled pork':
        return 'pig'
    elif series['food'] == 'pastrami':
        return 'cow'
    elif series['food'] == 'corned beef':
        return 'cow'
    elif series['food'] == 'honey ham':
        return 'pig'
    else:
        return 'salmon'


#create a new variable
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

<class 'KeyError'>: 'food'

In [189]:
#another way of doing it is: convert the food values to the lower case and apply the function
lower = lambda x: x.lower()
data['food'] = data['food'].apply(lower)
data['animal2'] = data.apply(meat_2_animal, axis='columns')
data

<class 'KeyError'>: 'food'

In [190]:
data.assign(new_variable = data['ounces']*10)

<class 'TypeError'>: 'list' object is not callable

In [191]:
data.drop('animal2',axis='columns',inplace=True)
data

<class 'KeyError'>: "['animal2'] not found in axis"

In [192]:
#Series function from pandas are used to create arrays
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data
0       1.0
1    -999.0
2       2.0
3    -999.0
4   -1000.0
5       3.0
dtype: float64


#replace -999 with NaN values
data.replace(-999, np.nan,inplace=True)
data
0       1.0
1       NaN
2       2.0
3       NaN
4   -1000.0
5       3.0
dtype: float64


#We can also replace multiple values at once.
data = pd.Series([1., -999., 2., -999., -1000., 3.])
data.replace([-999,-1000],np.nan,inplace=True)
data
0    1.0
1    NaN
2    2.0
3    NaN
4    NaN
5    3.0
dtype: float64

<class 'SyntaxError'>: invalid syntax (<ipython-input-192-be8cd6cefadc>, line 4)

In [193]:
data = pd.DataFrame(np.arange(12).reshape((3, 4)),index=['Ohio', 'Colorado', 'New York'],columns=['one', 'two', 'three', 'four'])
data

<class 'TypeError'>: 'list' object is not callable

In [194]:
#Using rename function
data.rename(index = {'Ohio':'SanF'}, columns={'one':'one_p','two':'two_p'},inplace=True)
data

<class 'TypeError'>: 'list' object is not callable

In [195]:
#You can also use string functions
data.rename(index = str.upper, columns=str.title,inplace=True)
data

<class 'TypeError'>: descriptor 'upper' for 'str' objects doesn't apply to a 'int' object

In [196]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]

In [197]:
#Understand the output - '(' means the value is included in the bin, '[' means the value is excluded
bins = [18, 25, 35, 60, 100]
cats = pd.cut(ages, bins)
cats
[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (25, 35]]
Length: 12
Categories (4, object): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]


#To include the right bin value, we can do:
pd.cut(ages,bins,right=False)
[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, object): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]


#pandas library intrinsically assigns an encoding to categorical variables.
cats.labels
array=([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 1], dtype=int8)


#Let's check how many observations fall under each bin
pd.value_counts(cats)
(18, 25]     5
(35, 60]     3
(25, 35]     3
(60, 100]    1
dtype: int64

<class 'SyntaxError'>: closing parenthesis ']' does not match opening parenthesis '(' (<ipython-input-197-c19ef086020e>, line 5)

In [198]:
bin_names = ['Youth', 'YoungAdult', 'MiddleAge', 'Senior']
new_cats = pd.cut(ages, bins,labels=bin_names)

pd.value_counts(new_cats)

<class 'NameError'>: name 'bins' is not defined

In [199]:
#we can also calculate their cumulative sum
pd.value_counts(new_cats).cumsum()

<class 'NameError'>: name 'new_cats' is not defined

In [200]:
df = pd.DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                   'key2' : ['one', 'two', 'one', 'two', 'one'],
                   'data1' : np.random.randn(5),
                   'data2' : np.random.randn(5)})
df

<class 'TypeError'>: 'list' object is not callable

In [201]:
#calculate the mean of data1 column by key1
grouped = df['data1'].groupby(df['key1'])
grouped.mean()
key1
a    0.595757
b    1.019769
Name: data1, dtype: float64

<class 'SyntaxError'>: invalid syntax (<ipython-input-201-671f11c0e669>, line 5)

In [202]:
dates = pd.date_range('20130101',periods=6)
df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))
df

<class 'TypeError'>: 'list' object is not callable

In [203]:
#get first n rows from the data frame
df[:3]

<class 'TypeError'>: 'list' object is not callable

In [204]:
#slice based on date range
df['20130101':'20130104']

<class 'TypeError'>: 'list' object is not callable

In [205]:
#slicing based on column names
df.loc[:,['A','B']]

<class 'TypeError'>: 'list' object is not callable

In [206]:
#slicing based on both row index labels and column names
df.loc['20130102':'20130103',['A','B']]

<class 'TypeError'>: 'list' object is not callable

In [207]:
#slicing based on index of columns
df.iloc[3] #returns 4th row (index is 3rd)
A    0.918203
B   -0.158800
C   -0.964063
D   -1.990779
Name: 2013-01-04 00:00:00, dtype: float64


#returns a specific range of rows
df.iloc[2:4, 0:2]

<class 'SyntaxError'>: leading zeros in decimal integer literals are not permitted; use an 0o prefix for octal integers (<ipython-input-207-dfca5cd9f494>, line 7)

In [208]:
#returns specific rows and columns using lists containing columns or row indexes
df.iloc[[1,5],[0,2]] 

<class 'TypeError'>: 'list' object is not callable

In [209]:
df[df.A > 1]

<class 'TypeError'>: 'list' object is not callable

In [210]:
#we can copy the data set
df2 = df.copy()
df2['E']=['one', 'one','two','three','four','three']
df2

<class 'TypeError'>: 'list' object is not callable

In [211]:
#select rows based on column values
df2[df2['E'].isin(['two','four'])]

<class 'KeyError'>: 'E'

In [212]:
#select all rows except those with two and four
df2[~df2['E'].isin(['two','four'])]

<class 'KeyError'>: 'E'

In [213]:
#list all columns where A is greater than C
df.query('A > C')

<class 'TypeError'>: 'list' object is not callable

In [214]:
#using OR condition
df.query('A < B | C > A')

<class 'TypeError'>: 'list' object is not callable

In [215]:
#create a data frame
data = pd.DataFrame({'group': ['a', 'a', 'a', 'b','b', 'b', 'c', 'c','c'],
                 'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6]})
data

<class 'TypeError'>: 'list' object is not callable

In [216]:
#calculate means of each group
data.pivot_table(values='ounces',index='group',aggfunc=np.mean)
group
a    6.333333
b    7.166667
c    4.666667
Name: ounces, dtype: float64


#calculate count by each group
data.pivot_table(values='ounces',index='group',aggfunc='count')
group
a    3
b    3
c    3
Name: ounces, dtype: int64

<class 'SyntaxError'>: invalid syntax (<ipython-input-216-70429898c07a>, line 4)