In [1]:
import pandas as pd
import numpy as np

In [24]:
%run 11.ipynb

Lecture 1 Notes: Numpy arrays / operations

In [2]:
# create a numpy arrary from scratch

a = np.zeros(10, dtype=int)
b = np.ones((2,3), dtype=int) # 2x3 matrix of ones
c = np.full((2,3),3.14) # matrix size and specific number to fill it with
d = np.arange(0,5,2) # range from 0 to 5, step size is 2
e = np.linspace(0,2,5) # start from 0 to 2, have 5 numbers equally spaced between 0 and 2
f = np.eye(3) # 3x3 identity matrix
g = np.empty((2,3)) # 2 by 3 matrix - values will be whatever is at that memory location
g.size # total elements in a numpy array - g
g.shape # dimension of a numpy array - g
g.ndim

# Random integers in the interval [0, 10) (3 X 3 array)
np.random.randint(0, 10, (3,3))

vect = np.random.uniform(10, 90, 15).astype(int)

### Array slicing  x[start:stop:step] -> for multi-dimensional sepearte with a comma x[:2, :3]

g[:1,:2]

### concatenate
add = np.concatenate([b,c])

### splitting
x = [1, 2, 3, 99, 99, 3, 2, 1]
x1, x2, x3 = np.split (x, [3,5])
a

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Lecture 1 practice

In [60]:
# Question 1
p = np.random.random((3,4))
o = np.full((3,4), 1)
np.concatenate([p,o])

# Question 2
first = np.arange(2,11)
second = np.random.randint(25,100, (9))
third = first + second
third.mean()
third.sum()
third.min()
third.max()

99

Lecture 2 Notes: Pandas Dataframes, Indexing, Filtering, Selecting

In [85]:
population_dict = {'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860}
population = pd.Series ( population_dict)

population['Texas':'Florida']

# data.loc[data.density > 100, ['population', 'area']] rows where density is > 100, and select columns population and area

my_list  = [2, 3, '2', 5,'me']
index = ['i', 'j', 'k', 2, 3]
data = pd.Series(my_list , index)


df = pd.DataFrame(np.random.rand (3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])
df.index
df.columns
df.values



array([[0.58421716, 0.17567651],
       [0.81689102, 0.65718551],
       [0.2108819 , 0.92063803]])

Lecture 3 Notes: Data Profiling / Cleaning, 
Handling Missing Values

In [96]:
df.head(2)
df.tail(2)
df.info()
df.sample(2)
df.describe()
df.dropna(subset=['foo'], axis=0, inplace=True)
df.foo.fillna(1, inplace=True)
df.foo.replace(np.nan, 0, inplace=True)

<class 'pandas.core.frame.DataFrame'>
Index: 3 entries, a to c
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   foo     3 non-null      float64
 1   bar     3 non-null      float64
dtypes: float64(2)
memory usage: 180.0+ bytes


Lecture 4 Notes: Data preparation: Duplicates, filtering, binning, transformation

In [114]:
# Duplicates / replacing
data = pd.DataFrame({'k1': ['one', 'two'] * 3 + ['two'],
                     'k2': [1, 1, 2, 3, 3, 4, 4]})
data.duplicated()
data.drop_duplicates(['k1','k2'], inplace=True)
data.replace([1, 4], ['h', 'hello'], inplace=True)

# Binning
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18,25,35,60,100]
cats = pd.cut(ages,bins, right=False)
cats


[[18, 25), [18, 25), [25, 35), [25, 35), [18, 25), ..., [25, 35), [60, 100), [35, 60), [35, 60), [25, 35)]
Length: 12
Categories (4, interval[int64, left]): [[18, 25) < [25, 35) < [35, 60) < [60, 100)]

Lecture 5 Notes: Aggregation and Transformation

In [13]:
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data': range(6)}, columns=['key', 'data'])

# groupby and aggregation
df.groupby('key').aggregate(['min', np.median, 'max', 'sum'])

# filtering
rng = np.random.RandomState(0)
df = pd.DataFrame({'key': ['A', 'B', 'C', 'A', 'B', 'C'],
                   'data1': range(6),
                   'data2': rng.randint(0, 10, 6)},
                   columns = ['key', 'data1', 'data2'])

def filter_func(x):
    return x['data2'].std() > 4

df.groupby('key').filter(filter_func)

  df.groupby('key').aggregate(['min', np.median, 'max', 'sum'])


Unnamed: 0,key,data1,data2
1,B,1,0
2,C,2,3
4,B,4,7
5,C,5,9


Lecture 6 and 7 Notes:Data concatenation and string functions , Manipulating Time

In [35]:
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c','d', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])

pd.concat([s1, s2, s3], axis=1)


monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam', 'Eric Idle', 'Terry Jones', 'Michael Palin'])
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

Lec 8

In [42]:
index = [('California', 2000), ('California', 2010), ('New York', 2000), ('New York', 2010), ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956, 18976457,19378102, 20851820, 25145561]
pop = pd.Series(populations, index=index)
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
pop.unstack()

Unnamed: 0,2000,2010
California,33871648,37253956
New York,18976457,19378102
Texas,20851820,25145561
