# Table of Contents
 <p><div class="lev1 toc-item"><a href="#1-preparing-data" data-toc-modified-id="1-preparing-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>1 preparing data</a></div><div class="lev2 toc-item"><a href="#1.1-reading-multiple-data-files" data-toc-modified-id="1.1-reading-multiple-data-files-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>1.1 reading multiple data files</a></div><div class="lev2 toc-item"><a href="#1.2-reindex-dataframes" data-toc-modified-id="1.2-reindex-dataframes-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>1.2 reindex dataframes</a></div><div class="lev2 toc-item"><a href="#1.3-arithmetic-between-series-&amp;-dataframes" data-toc-modified-id="1.3-arithmetic-between-series-&amp;-dataframes-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>1.3 arithmetic between series &amp; dataframes</a></div><div class="lev1 toc-item"><a href="#2-Concatenating-data" data-toc-modified-id="2-Concatenating-data-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>2 Concatenating data</a></div><div class="lev2 toc-item"><a href="#2.1-appending--&amp;concatenating-series" data-toc-modified-id="2.1-appending--&amp;concatenating-series-21"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>2.1 appending  &amp;concatenating series</a></div><div class="lev2 toc-item"><a href="#2.2-appending-&amp;-concatenating-dataframes" data-toc-modified-id="2.2-appending-&amp;-concatenating-dataframes-22"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>2.2 appending &amp; concatenating dataframes</a></div><div class="lev2 toc-item"><a href="#2.3-concatenation,-keys,-multiIndexes" data-toc-modified-id="2.3-concatenation,-keys,-multiIndexes-23"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>2.3 concatenation, keys, multiIndexes</a></div><div class="lev2 toc-item"><a href="#2.4-outer-&amp;-inner-joins" data-toc-modified-id="2.4-outer-&amp;-inner-joins-24"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>2.4 outer &amp; inner joins</a></div><div class="lev1 
toc-item"><a href="#3-joining-and-merging-dataframes" data-toc-modified-id="3-joining-and-merging-dataframes-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>3 joining and merging dataframes</a></div>

# 1 preparing data

## 1.1 reading multiple data files

In [None]:
# The pd.read_* family: one reader function per file format.
# NOTE(review): `pd` (presumably pandas) and `filepath` are placeholders that
# are not defined in this cell -- confirm they exist before running.
pd.read_csv(filepath)    # CSV file
pd.read_excel(filepath)  # Excel workbook
pd.read_html(filepath)   # HTML tables
pd.read_json(filepath)   # JSON

In [None]:
# Option 1: load each file separately
df0 = pd.read_csv(filepath0)
df1 = pd.read_csv(filepath1)

# Option 2: explicit loop that collects one DataFrame per file
files = [filepath0, filepath1]

dataframes = []

for f in files:
    dataframes.append(pd.read_csv(f))

pd.concat(dataframes)

# Option 3: the same loop written as a list comprehension
files = [filepath0, filepath1]

dataframes = [pd.read_csv(f) for f in files]  # list of DataFrames

pd.concat(dataframes)  # stack them into a single DataFrame

# Option 4: use glob to pick up every file matching a wildcard pattern
from glob import glob

# glob() returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the row order of the concatenated result reproducible across runs.
filenames = sorted(glob('sales*.csv'))
dataframes = [pd.read_csv(f) for f in filenames]

# NOTE: pd.concat raises ValueError if the pattern matched no files.
pd.concat(dataframes)

## 1.2 reindex dataframes

In [120]:
# First of two example frames: mean temperature per month, keyed by month name
w_mean = pd.DataFrame(
    {'Mean TemperatureF': [61.956044, 32.133333, 68.934783, 43.434783]},
    index=pd.Index(['Apr', 'Jan', 'Jul', 'Oct'], name='Month'),
)

w_mean.index    # the month labels are in alphabetical order

Index(['Apr', 'Jan', 'Jul', 'Oct'], dtype='object', name='Month')

In [121]:
# Second example frame: max temperature per month, keyed by month name
w_max = pd.DataFrame(
    {'Max TemperatureF': [68, 89, 91, 84]},
    index=pd.Index(['Jan', 'Apr', 'Jul', 'Oct'], name='Month'),
)

w_max.index    # the month labels are in chronological order

Index(['Jan', 'Apr', 'Jul', 'Oct'], dtype='object', name='Month')

In [None]:
# .reindex() rearranges the rows into whatever label order you pass in
ordered = ['Jan', 'Apr', 'Jul', 'Oct']

w_mean2 = w_mean.reindex(ordered)    # rows now in chronological order

# Reindexing by the explicit list or by w_max's index yields the same
# chronological frame, so this comparison is element-wise True.
w_mean2 == w_mean.reindex(w_max.index)

w_mean2.sort_index()    # .sort_index() puts the labels back in alphabetical order

In [123]:
# Reindexing with a label the frame does not have ('Dec') yields a NaN row
w_mean3 = w_mean.reindex(index=['Jan', 'Apr', 'Dec'])
w_mean3

# Ways to deal with the NaN row afterwards:
w_mean3.dropna(how='any')    # drop rows that contain NaN
w_mean3.ffill()              # or fill forward from the previous row ...
w_mean3.bfill()              # ... or backward from the next row

Unnamed: 0_level_0,Mean TemperatureF
Month,Unnamed: 1_level_1
Jan,32.133333
Apr,61.956044
Dec,


## 1.3 arithmetic between series & dataframes

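* `.divide()` offers more flexibility than the `/` operator   
  Its `axis` argument is easy to misread: it names the labels the divisor is aligned on. For example, `df.divide(series, axis='rows')` matches the Series against the row index, so every column is divided by that Series. (The valid spellings are `'rows'` / `'index'` or `'columns'`; `axis='row'` is not accepted.)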

![](http://upload-images.jianshu.io/upload_images/1526845-40446af0ce993464.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

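* `.add()` offers more flexibility than the `+` operator, e.g. `fill_value=` supplies a value for labels that are missing on one side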

![](http://upload-images.jianshu.io/upload_images/1526845-a7959182a6161f80.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)

# 2 Concatenating data

## 2.1 appending  &concatenating series

In [None]:
# NOTE: Series.append() was deprecated in pandas 1.4 and removed in 2.0,
# so pd.concat() is used for every case below.

# stack two Series vertically (rows of s2 follow rows of s1)
pd.concat([s1, s2])

# pd.concat() takes any number of objects and can stack vertically or horizontally
pd.concat([s1, s2, s3])

# By default the index labels of the original Series are kept, so the result
# may contain the same label more than once (one label -> several values).
# To discard the old labels and get a fresh 0..n-1 index instead:
pd.concat([s1, s2]).reset_index(drop=True)

pd.concat([s1, s2], ignore_index=True)

## 2.2 appending & concatenating dataframes

In [None]:
# NOTE: DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat() is the replacement for stacking rows.
pd.concat([df1, df2])  # when the columns differ, missing values are filled with NaN

# pd.concat() can stack in either direction, filling missing values with NaN.
# Rows are aligned on column names (axis=0) or on index labels (axis=1).
pd.concat([df1, df2], axis=0)  # stack vertically: rows of df2 go below df1
pd.concat([df1, df2], axis=1)  # stack horizontally: columns of df2 go beside df1

## 2.3 concatenation, keys, multiIndexes

In [None]:
# keys=[...] tags each source frame in the concatenated result

# stacked along the rows, the keys form the outer level of a multi-level index
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis='index')

# stacked along the columns, the keys form the outer level of multi-level columns
rain1314 = pd.concat([rain2013, rain2014], keys=[2013, 2014], axis=1)

# a dict of {key: source frame} pairs supplies keys and frames in one argument
rain_dict = {2013: rain2013, 2014: rain2014}
rain1314 = pd.concat(rain_dict, axis=1)


In [None]:
# Slicing a DataFrame whose rows carry a multi-level index

# Option 1: built-in slice() objects, one per index level
df.loc[(slice(None), slice(indxB1, indxB2)), :]  # every outer label,
                                                 # inner labels indxB1 through indxB2

# Option 2: pd.IndexSlice allows the usual colon syntax inside .loc
idx = pd.IndexSlice
df.loc[idx[:, indxB], idx[colA]]  # every outer label,
                                  # one inner label (indxB), column colA

## 2.4 outer & inner joins

In [None]:
# join='inner' keeps only the index labels present in both frames
pd.concat([df1, df2], axis='columns', join='inner')

# join='outer' (the pd.concat() default) keeps the union of index labels
pd.concat([df1, df2], axis='columns', join='outer')

# 3 joining and merging dataframes

In [None]:
# Both .join() and pd.merge() combine frames by columns (stack horizontally)

# --- DataFrame.join(): joins on the index ---
df1.join(df2, how='left')    # 'left' is the default
df1.join(df2, how='right')
df1.join(df2, how='outer')
df1.join(df2, how='inner')


# --- pd.merge(): joins on columns ---
# With no `on`, every column name the two frames share is used as a key;
# 'inner' is the default join type.
pd.merge(df1, df2, how='inner')

# merge on a single key column
pd.merge(bronze, gold, on='NOC')

# merge on several key columns: a row matches only when the values
# in all of the key columns agree
pd.merge(bronze, gold, on=['NOC', 'Country'])

# suffixes mark which source frame an overlapping non-key column came from
pd.merge(bronze, gold, on=['NOC', 'Country'], suffixes=('_bronze', '_gold'))

# key columns named differently in each frame but holding the same content
pd.merge(cities, counties, left_on='City', right_on='CITY NAME')

# everything combined, with the join type spelled out
pd.merge(
    bronze,
    gold,
    how='inner',
    on=['NOC', 'Country'],
    suffixes=('_bronze', '_gold'),
)


![](http://upload-images.jianshu.io/upload_images/1526845-b752d7372a7dcb0f.png?imageMogr2/auto-orient/strip%7CimageView2/2/w/1240)