In [1]:
import pandas as pd
import numpy as np

# 1. Data Structures

## 1.1 Series 

pandas.Series is one-dimensional **ndarray** with **axis labels**. So it acts like an **ndarray** and a **dict**.

### 1.1.1 Create Series 

In [2]:
# Construct function
pd.Series(data=None, index=None, dtype=None, name=None, copy=False)
# Parameter data: array-like, Iterable, dict, scalar
# Parameter index: array-like or Index(1d)

# create Series from ndarray
s = pd.Series(np.random.randn(5))
print(s)

# create Series from array
s = pd.Series([2, 3, 1, 4, 5], index = ['a', 'b', 'c', 'd', 'e'])
print(s)

# create Series from dict
s = pd.Series({'a': 1.0, 'b': 2.0, 'c': 3.0}) # key as the index
print(s)
s = pd.Series({'a': 1.0, 'b': 2.0, 'c': 3.0}, index=['x', 'y', 'a', 'a']) # Non-unique index values are allowed.
# If an index is passed, the values in data corresponding to the labels in the index will be pulled out.
# Return NaN if no key corresponds to the index label.
print(s)

# create Series from scalar
s = pd.Series(5, index=['a', 'b', 'c'])
print(s)


0    0.764189
1    0.500157
2   -1.102028
3    0.083789
4   -0.059464
dtype: float64
a    2
b    3
c    1
d    4
e    5
dtype: int64
a    1.0
b    2.0
c    3.0
dtype: float64
x    NaN
y    NaN
a    1.0
a    1.0
dtype: float64
a    5
b    5
c    5
dtype: int64


### 1.1.2 Series is ndarray-like 

Series acts very similarly to a ndarray and is a valid argument to most Numpy functions. And operations such as slicing will also slice the index.

In [3]:
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'], name='mySeries')
print(s[:3])
print(s['b':'d']) # different from number slicing, index slicing includes both start point and end point.
print(s[s > s.median()])

a    1.478949
b   -2.369137
c    0.344309
Name: mySeries, dtype: float64
b   -2.369137
c    0.344309
d    1.584611
Name: mySeries, dtype: float64
a    1.478949
d    1.584611
Name: mySeries, dtype: float64


Different from ndarray, operations between Series automatically align the data based on label. Thus, you can write computations without giving consideration to whether the Series involved have the same labels. If a label is not found in one Series or the other, the result will be marked as missing NaN.

In [4]:
s[1:] + s[:-1]

a         NaN
b   -4.738274
c    0.688618
d    3.169222
e         NaN
Name: mySeries, dtype: float64

While Series is ndarray-like, if you need an actual ndarray, use Series.to_numpy()

In [5]:
s.to_numpy()

array([ 1.47894855, -2.36913715,  0.3443091 ,  1.58461078, -0.73458289])

### 1.1.3 Series is dict-like 

In [6]:
print('a' in s)
print(s['a'])
print(s.get('a'))
print('f' in s)
# print(s['f']) # error
print(s.get('f'))
print(s.get('f', np.nan))

True
1.4789485491624785
1.4789485491624785
False
None
nan


## 1.2 DataFrame

pandas.DataFrame is a two-dimensional, size-multable, potentially heterogeneous tabular data. It contains labeled axes (rows and columns). Can be thought of as a dict-like container for Series objects. The primary pandas data structure.

### 1.2.1 Create DataFrame 

In [7]:
# Construct function
pd.DataFrame(data=None, index=None, columns=None, dtype=None, copy=False)
# Parameter data: ndarry(structured or homogeneous), Iterable, dict, or DataFrame

# create DataFrame from dict of Series
df = pd.DataFrame({'col1': pd.Series([1.0, 2.0, 3.0], index=['a', 'b', 'c']),
                  'col2': pd.Series([4.0, 5.0, 6.0], index=['a', 'b', 'd'])})
print(df)

# create DataFrame from dict of ndarrays/lists
df = pd.DataFrame({'col1':[1, 2, 3], 'col2':[4, 5, 6]}, index=['a', 'b', 'c'])
print(df)

# Optional
# create DataFrame from a list of dicts.
df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 4, 'b': 5, 'c': 6}])
print(df)

df = pd.DataFrame([{'a': 1, 'b': 2}, {'a': 4, 'b': 5, 'c': 6}], columns=['col1', 'col2'])
print(df)

# To summarize, keys in the dict data are for columns

   col1  col2
a   1.0   4.0
b   2.0   5.0
c   3.0   NaN
d   NaN   6.0
   col1  col2
a     1     4
b     2     5
c     3     6
   a  b    c
0  1  2  NaN
1  4  5  6.0
   col1  col2
0   NaN   NaN
1   NaN   NaN


### 1.2.2 Read and write DataFrame

 ### 1. ***pd.read_csv()***, read csv file & text file, return DataFrame

**Parameters:**  
**filepath_or_buffer**: filepath, str, url, or any object with a read() method (such as an open file or StringIO)

**sep / delimiter**: sep, default ','; delimiter, default None

**header**: default 'infer'. header=0: infer from first line. header=None: column names are passed. header='infer': if column names are passed, like header=None, else like header = 0. Note that this parameter ignores commented lines and empty lines if skip_blank_lines=True(default), so header=0 denotes the first line of data rather than the first line of the file.
**names**: array-like, default None. Column names.
**index_col**: int, str, sequence of int/str. Column(s) to use as row labels.

**usecols**: list-like or callable, default None. Return a subset of columns.
**skip_rows**: list-like or integer, default None. Line numbers to skip(0-indexed) or number of lines to skip (int) at the start of the file.
**nrows**: int, defatul None. Number of rows of file to read.

**true_values**: list, default None. Values to consider as True.
**false_values**: list, default None. Values to consider as False.
**na_values**: list and others, defaule None. Additional strings to recognize as NA/NaN.
**keep_default_na**: boolean, defatult True. Whether or not to include the default NaN values when parsing the data.

**parse_dates**: boolean or list of ints or names or list of lists or dict, default False.  
- If True: try parsing the index.  
- If [1, 2, 3]:try parsing columns 1,2,3 each as a separate data column. 
- If [[1, 3]]: combine columns 1 and 3 and parse as a single data column.
**date_format**: str or dict of column -> format, default None. If used in conjuction with parse_dates, will parse dates according to this format. For anything more complex, please read in as object and then apply to_datetime() as-needed.

**dtype**: indicate the date type for the whole DataFrame or individual columns.
**converters**: deal with columns with mixed types.

In [8]:
from io import BytesIO
data = b"word,length\n" b"Tr\xc3\xa4umen,7\n" b"Gr\xc3\xbc\xc3\x9fe,5"
df = pd.read_csv(BytesIO(data), encoding='utf-8') # By test, adding encoding='utf-8' or not won't affect the result
df

Unnamed: 0,word,length
0,Träumen,7
1,Grüße,5


### 2. to_csv(), Series/DataFrame instance method, store objects to csv/text file
**Parameters:**  
**path_or_buf**: a string path to the file or a file object.  
**sep**: Field delimiter, default ','.

In [9]:
from io import StringIO
with StringIO() as f:
    df.to_csv(f)
    f.seek(0)
    print(f.read())

,word,length
0,Träumen,7
1,Grüße,5



### 1.2.3 Explore DataFrame data

In [10]:
s = pd.Series(np.random.randn(5), index=["a", "b", "c", "d", "e"])
df = pd.DataFrame(np.random.randn(8, 3), index=list('abcdefgh'), columns=["A", "B", "C"])

In [11]:
df.head() # display 5 head rows
df.head(10) # display 10 head rows
df.tail() # display 5 tail rows
df.tail(10) # display 10 tail rows

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683
h,-0.348789,0.132711,0.023626


In [12]:
df.shape

(8, 3)

In [13]:
df.index
df.columns

Index(['A', 'B', 'C'], dtype='object')

In [14]:
s.describe()
df.describe()

Unnamed: 0,A,B,C
count,8.0,8.0,8.0
mean,0.065974,-0.048158,-0.25783
std,0.956543,0.998542,0.508653
min,-1.45356,-1.586865,-1.189951
25%,-0.473522,-0.724136,-0.468458
50%,0.129061,0.104472,-0.267207
75%,0.597303,0.446155,0.07719
max,1.567858,1.310514,0.423992


# 2. Indexing and selecting data 

In [15]:
df

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683
h,-0.348789,0.132711,0.023626


## 2.1 []

In [16]:
df['A']

a    0.255054
b    1.567858
c    0.518663
d    0.833223
e   -1.453560
f   -0.847722
g    0.003067
h   -0.348789
Name: A, dtype: float64

In [17]:
# df['a'] # KeyError

In [18]:
df.A

a    0.255054
b    1.567858
c    0.518663
d    0.833223
e   -1.453560
f   -0.847722
g    0.003067
h   -0.348789
Name: A, dtype: float64

In [19]:
df['A']['a']

0.25505444403606237

In [37]:
df['A'][0]

0.25505444403606237

## 2.2 .loc 

**Label based**, but may laso be used with a boolean array. ***.loc*** will raise KeyError when the items are not found. Allowed inputs are:  
- A single label, e.g. 5 or 'a' (Note that 5 is interpreted as a label of the index. This use is not an integer position along the index).  
- A list or array of labels ['a', 'b', 'c'].  
- A slice object with labels ['a':'f'] (Note that contrary to usual Python slices, both the start and the stop are included, when present in the index.)   
- A boolean array (any NA values will be treated as False).  
- A callable function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above).  
- For multi-axes selection, axes left out of the specification are assumed to be **:**, e.g. p.loc['a'] is equivalent to p.loc['a', :].

In [20]:
# df.loc[3] # KeyError
df.loc['a']

A    0.255054
B   -0.621269
C    0.237882
Name: a, dtype: float64

In [21]:
# df.loc['A'] # KeyError
df.loc[:, 'A'] # same with df['A']

a    0.255054
b    1.567858
c    0.518663
d    0.833223
e   -1.453560
f   -0.847722
g    0.003067
h   -0.348789
Name: A, dtype: float64

In [22]:
df.loc['a', 'B':'C']

B   -0.621269
C    0.237882
Name: a, dtype: float64

In [23]:
df.loc['a':'b'] # same with df.loc['a':'b', :]

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992


In [24]:
df.loc['a':'z']

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683
h,-0.348789,0.132711,0.023626


In [25]:
# Selection by boolean array
print(df.loc[:, 'A'] < 0)
df.loc[df.loc[:, 'A'] < 0] # same with df.log[df.loc[:, 'A'] < 0, :]

a    False
b    False
c    False
d    False
e     True
f     True
g    False
h     True
Name: A, dtype: bool


Unnamed: 0,A,B,C
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
h,-0.348789,0.132711,0.023626


In [26]:
mask = pd.array([True, False, True, False, True, False, True, pd.NA])
print(mask)
df.loc[pd.array(mask)]

<BooleanArray>
[True, False, True, False, True, False, True, <NA>]
Length: 8, dtype: boolean


Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
c,0.518663,-1.032737,-0.425028
e,-1.45356,1.310514,-0.203731
g,0.003067,0.076233,-0.330683


In [27]:
# Selection by callable
df.loc[lambda df: df['A'] < 0] # callable returns a boolean array

Unnamed: 0,A,B,C
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
h,-0.348789,0.132711,0.023626


In [30]:
df.loc[:, lambda df: ['A', 'C']] # callable returns a list of labels

Unnamed: 0,A,C
a,0.255054,0.237882
b,1.567858,0.423992
c,0.518663,-0.425028
d,0.833223,-0.598748
e,-1.45356,-0.203731
f,-0.847722,-1.189951
g,0.003067,-0.330683
h,-0.348789,0.023626


In [34]:
df.loc[lambda df:list('abcde'), :] # callable returns a list of labels

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731


In [36]:
# return an element
df.loc['a']['C']

0.23788194779252822

## 2.3 .iloc 

**Integer position based** (from 0 to length-1 of the axis), but may also be used with a boolean array. ***.iloc*** will raise IndexError if a requested indexer is out-of-bounds, except slice indexers which allow out-of-bounds indexing (this conforms with Python/NumPy slice semantics). Allowed inputs are (similar to ***.loc***):  
- An integer e.g. 5.
- A list or array of integers [4, 3, 0]
- A slice object with ints 1:7.
- A boolean array (any NA values will be treated as False).
- A callable function with one argument (the calling Series or DataFrame) and that returns valid output for indexing (one of the above).
- For multi-axes selection, axes left out of the specification are assumed to be :, e.g. p.loc['a'] is equivalent to p.loc['a', :].

In [39]:
df

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683
h,-0.348789,0.132711,0.023626


In [38]:
df.iloc[2]

A    0.518663
B   -1.032737
C   -0.425028
Name: c, dtype: float64

In [42]:
df.iloc[[5, 3, 1]]

Unnamed: 0,A,B,C
f,-0.847722,-1.586865,-1.189951
d,0.833223,0.224238,-0.598748
b,1.567858,1.111907,0.423992


In [45]:
df.iloc[3:5, 1:]

Unnamed: 0,B,C
d,0.224238,-0.598748
e,1.310514,-0.203731


In [50]:
df.iloc[:, [True, False, True]]

Unnamed: 0,A,C
a,0.255054,0.237882
b,1.567858,0.423992
c,0.518663,-0.425028
d,0.833223,-0.598748
e,-1.45356,-0.203731
f,-0.847722,-1.189951
g,0.003067,-0.330683
h,-0.348789,0.023626


In [58]:
print(df.B > 0)
# df.iloc[df.B > 0] #ValueError: iLocation based boolean indexing cannot use an indexable as a mask
mask = pd.array([True, False, True, False, True, False, pd.NA, pd.NA])
print(mask)
df.iloc[mask]

a    False
b     True
c    False
d     True
e     True
f    False
g     True
h     True
Name: B, dtype: bool
<BooleanArray>
[True, False, True, False, True, False, <NA>, <NA>]
Length: 8, dtype: boolean


Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
c,0.518663,-1.032737,-0.425028
e,-1.45356,1.310514,-0.203731


In [59]:
# Selection by callable
df.iloc[lambda df: [4, 5, 6]]

Unnamed: 0,A,B,C
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683


In [60]:
df.iloc[lambda df: [4, 5, 6], lambda df: [2, 1]]

Unnamed: 0,C,B
e,-0.203731,1.310514
f,-1.189951,-1.586865
g,-0.330683,0.076233


## 2.4 Selecting random samples 

Use ***sample()*** method to select random rows or columns from a Series or DataFrame. 
  
***DataFrame.sample(n=1, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)***    
**Parameters:**
- **n**: int, optional. Number of items from axis to return. Default = 1.
- **frac**: float, optional. Fraction of axis items to return.
- **replace**: bool, default False. With or without replacement. Allow or disallow sampling of the same row more than once.
- **weights**: str or ndarray-like, optional. Default 'None' results in equal probability weighting. If called on DataFrame and axis=0, can accept the name of a column as the weights array. Otherwise, accept ndarray-like. If passed a Series as weights, will align with target object on index. Index values in weights not found in sampled object will be ignored and index values in sampled object not in weights will be assigned weights of zero. Unless weights are a Series, weights must be same length as axis being sampled. If weights do not sum to 1, they will be normalized to sum to 1. 
- **random_state**: int (as a seed) or a Numpy RandomState object. Random number generator.
- **axis**: 0 or 'index', 1 or 'columns'. Axis to sample. Default is index for DataFrame.

In [61]:
df

Unnamed: 0,A,B,C
a,0.255054,-0.621269,0.237882
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028
d,0.833223,0.224238,-0.598748
e,-1.45356,1.310514,-0.203731
f,-0.847722,-1.586865,-1.189951
g,0.003067,0.076233,-0.330683
h,-0.348789,0.132711,0.023626


In [63]:
df.sample()

Unnamed: 0,A,B,C
g,0.003067,0.076233,-0.330683


In [64]:
df.sample(3, random_state=4)

Unnamed: 0,A,B,C
e,-1.45356,1.310514,-0.203731
h,-0.348789,0.132711,0.023626
d,0.833223,0.224238,-0.598748


In [66]:
df.sample(3, random_state=4)

Unnamed: 0,A,B,C
e,-1.45356,1.310514,-0.203731
h,-0.348789,0.132711,0.023626
d,0.833223,0.224238,-0.598748


In [67]:
df.sample(frac=0.5)

Unnamed: 0,A,B,C
d,0.833223,0.224238,-0.598748
h,-0.348789,0.132711,0.023626
b,1.567858,1.111907,0.423992
c,0.518663,-1.032737,-0.425028


In [73]:
df.sample(n=3, axis=1, replace=True, weights=[1,0,0])

Unnamed: 0,A,A.1,A.2
a,0.255054,0.255054,0.255054
b,1.567858,1.567858,1.567858
c,0.518663,0.518663,0.518663
d,0.833223,0.833223,0.833223
e,-1.45356,-1.45356,-1.45356
f,-0.847722,-0.847722,-0.847722
g,0.003067,0.003067,0.003067
h,-0.348789,-0.348789,-0.348789


# 3. Merge, join, concatenate

We use ***pandas.concat*** to concatenate pandas objects along a paticular axis and allow optional set logic along the other axes. Can also add a layer of hierarchical indexing on the concatenation axis, which may be useful if the labels are the same (or overlapping) on the passed axis number.  
***pandas.concat(objs, *, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=None)***  
**Parameters:**
- **objs**: a sequence or mapping of Series or DataFrame objects.
- **axis**: {0/'index', 1/'columns'}, default 0
- **join**: {'inner', 'outer'}, default 'outer'
- **ignore_index**: bool, default False. If True, do not use the index values along the concatenation axis. The resulting axis will be labeled 0,...,n-1.
- **keys**: sequence, default None. Construct hierarchical index using the passed keys as the outermost level.  
  
**Returns:**  
When concatenating all Series along the index (axis=0), a Series is returned. When objs contains at least one DataFrame, a DataFrame is returned. When concatenating along the columns (axis=1), a DataFrame is returned.

In [78]:
df1 = pd.DataFrame({
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'C': ['C0', 'C1', 'C2', 'C3']}, index=[0, 1, 2, 3])
df2 = pd.DataFrame({
    'A': ['A4', 'A5', 'A6'],
    'B': ['B4', 'B5', 'B6'],
    'C': ['C4', 'C5', 'C6']}, index=[4, 5, 6])
df3 = pd.DataFrame({
    'B': ['B2', 'B3', 'B6'],
    'D': ['D2', 'D3', 'D6'],
    'F': ['F2', 'F3', 'F6']}, index=[2, 3, 6])

In [79]:
df1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3


In [80]:
df2

Unnamed: 0,A,B,C
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6


In [83]:
pd.concat([df1, df2])

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4
5,A5,B5,C5
6,A6,B6,C6


In [84]:
pd.concat([df1, df2], axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
0,A0,B0,C0,,,
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,A3,B3,C3,,,
4,,,,A4,B4,C4
5,,,,A5,B5,C5
6,,,,A6,B6,C6


In [85]:
df3

Unnamed: 0,B,D,F
2,B2,D2,F2
3,B3,D3,F3
6,B6,D6,F6


In [89]:
pd.concat([df1, df3], join='inner', axis=1)

Unnamed: 0,A,B,C,B.1,D,F
2,A2,B2,C2,B2,D2,F2
3,A3,B3,C3,B3,D3,F3


In [101]:
result = pd.concat([df1, df3], axis=1, join='inner', keys=['x', 'y'])
result

Unnamed: 0_level_0,x,x,x,y,y,y
Unnamed: 0_level_1,A,B,C,B,D,F
2,A2,B2,C2,B2,D2,F2
3,A3,B3,C3,B3,D3,F3


In [102]:
result = pd.concat({'x': df1, 'y': df3}, axis=1, join='inner') # the same with above cell
result

Unnamed: 0_level_0,x,x,x,y,y,y
Unnamed: 0_level_1,A,B,C,B,D,F
2,A2,B2,C2,B2,D2,F2
3,A3,B3,C3,B3,D3,F3


In [94]:
result.x.loc[2, 'B']

'B2'

In [104]:
df1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3


In [108]:
# Append rows to a DataFrame
s2 = pd.Series(['A4', 'B4', 'C4'], index=['A', 'B', 'C'])
pd.concat([df1, s2.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2
3,A3,B3,C3
4,A4,B4,C4


# 4. Group by: split-apply-combine