* 0 pandas basics
* 1 data ingestion & inspection
    * 1.1 building dataframes from scratch
    * 1.2 importing & exporting data
    * 1.3 plotting with pandas
* 2 exploratory data analysis
    * 2.1 visual exploratory data analysis (numerical vals)
    * 2.2 statistical exploratory data analysis (for numerical columns)
    * 2.3 separating populations (for categorical columns)
* 3 using pandas to model time series
    * 3.1 indexing time series
    * 3.2 resampling time series data
    * 3.3 manipulating time series data
    * 3.4 time series visualization


# 0 pandas basics

In [None]:
# don't run the cells

# indexes and columns
type(df)

df.shape # access the shape attribute

df.columns

type(df.columns)

df.index

type(df.index)

In [None]:
# slicing
df.loc[rows, cols] # slicing by label index  

df.iloc[rows, cols] # slicing by position index

In [None]:
# looking at the data frame 
df.head()  

df.tail()

df.info()

# 1 data ingestion & inspection  

data import & export in various formats

## 1.1 building dataframes from scratch

In [None]:
# dataframes from csv files
pd.read_csv(filepath, index_col = 0)

In [None]:
# dataframe from dict (1): keys become column names, values become column data
# NOTE: never name the variable `dict` -- that shadows the builtin and makes
# any later `dict(...)` call in the same session fail
data = {'colA': [lst_of_vals],
        'colB': [lst_of_vals],
        'colC': [lst_of_vals]}

pd.DataFrame(data)

In [None]:
# dataframes from dict (2): build the dict by zipping column names with column values
colA = [lst_of_vals]
colB = [lst_of_vals]
colC = [lst_of_vals]

names = ['nameA', 'nameB', 'nameC']
cols = [colA, colB, colC]  # a list of lists, one inner list per column

zipped = list(zip(names, cols))  # a list of (name, column) tuples

data = dict(zipped)  # convert the list of tuples to a dict

pd.DataFrame(data)  # pass the dict built above (not the builtin `dict` type)

In [None]:
# broadcasting(1)
df['someCol'] = 0 # assign 0 to the entire column

In [None]:
# broadcasting(2)
data = {'nameA':[lst_of_vals],
        'nameB':a_single_val}

pd.DataFrame(data)

In [None]:
# set index and columns 
df.columns = [lst_of_col_names_you_want]
df.index = [lst_of_inx_names_you_want]

## 1.2 importing & exporting data

In [None]:
# datasets from csv

col_names = [lst_of_col_names_you_want]

# keep the result: the inspection / indexing steps below operate on `df`
df = pd.read_csv(filepath,
                 header=None,  # tell pandas there's no header line in the source file
                 names=col_names,  # column names to use (the keyword is `names`, not `name`)
                 na_values={'colA': ['-1']},  # treat '-1' in column 'colA' as NaN
                 parse_dates=[[0, 1, 2]],  # concatenate columns 0, 1, 2 and parse the result as a single date
                 # parse_dates=True,  # alternative (use INSTEAD of the line above): try to parse the index as dates
                 )

# inspecting dataframe
df.info()

# using dates as index
df.index = df['dateCol']


In [None]:
# exporting data 

df.to_csv('output.csv') # to .csv

df.to_csv('output.tsv', sep = '\t') # to .tsv

df.to_excel('output.xlsx')

In [None]:
# dropping columns / trimming off redundant columns 
cols_to_keep = [colA, colB]

df = df[cols_to_keep]

## 1.3 plotting with pandas

In [None]:
import pandas as pd

import matplotlib.pyplot as plt

# plotting numpy arrays (matplotlib)
plt.plot(some_array)
plt.show()

In [None]:
# plotting pandas Series
plt.plot(some_series) # using matplotlib
plt.show()

some_series.plot() # using pandas, plot the Series directly
plt.show()

In [None]:
# plotting pandas DataFrames
plt.plot(df)  # using matplotlib, plot all columns at once 
plt.show()


df.plot()  # using pandas, plot all Series at once
plt.show()

# fixing scales
df.plot()
plt.yscale('log')
plt.show()

In [None]:
# customizing plots
some_series.plot(color='b',
                 style='.-',
                 legend=True)

# plt.axis takes ONE sequence (xmin, xmax, ymin, ymax), not four positional arguments
plt.axis((start_of_x, end_of_x, start_of_y, end_of_y))

plt.show()


In [None]:
# saving plots
plt.savefig('picname.png')

plt.savefig('picname.jpg')

plt.savefig('picname.pdf')

# 2 exploratory data analysis

statistical & graphical methods

## 2.1 visual exploratory data analysis (numerical vals)

In [None]:
df = pd.read_csv(filepath, index_col=0)

df.shape

df.head()

# choose exactly one plot kind per call
df.plot(x=col_as_x, y=col_as_y,
        kind='scatter')  # other options: kind='box', kind='hist'

plt.xlabel(label_on_x_axis)
plt.ylabel(label_on_y_axis)

plt.show()

In [None]:
# when kind = 'hist', there are some more options
# (extra keywords are forwarded to matplotlib's hist; `density` replaces the
#  removed `normed` keyword)

df1.plot(x=col_as_x, y=col_as_y,
         kind='hist',
         bins=30,  # number of intervals (bins)
         range=(4, 8),  # extrema of bins (min, max)
         density=True,  # normalize so the histogram integrates to 1
         )

df2.plot(x=col_as_x, y=col_as_y,
         kind='hist',
         bins=30,  # number of intervals (bins)
         range=(4, 8),  # extrema of bins (min, max)
         density=True,  # normalize so the histogram integrates to 1
         cumulative=True,  # Cumulative Distribution Function (CDF)
         )

In [None]:
# different DataFrame plot idioms
df.plot(kind='hist')

df.plot.hist()  # accessor form; the accessor is `.plot` (there is no `.plt`)

df.hist()  # draws one histogram subplot per column

## 2.2 statistical exploratory data analysis (for numerical columns)

In [None]:
# summarizing with describe()
df.describe() # get count, mean,std, min, 25%, 50%, 75%, max

In [None]:
# count
df[colA].count() # apply to Series, get an int 
df[colB].count()

df[[colA, colB]].count() # apply to DataFrame, get a Series

In [None]:
# averages 
df[colA].mean() # apply to Series

df.mean() # apply to entire DataFrame

In [None]:
# standard deviations 
df.std()

In [None]:
# median & 0.5 quantile
df.median()  # column-wise median

q = 0.5
df.quantile(q)  # the 0.5 quantile gives the same values as the median

In [None]:
# inter-quartile range(IQR)
q = [0.25, 0.75]

df.quantile(q) # get a DataFrame

In [None]:
# ranges
df.min()

df.max()

In [None]:
# box plots
df.plot(kind = 'box') # plotting all columns 

plt.show()

## 2.3 separating populations (for categorical columns)

In [None]:
# describe categorical column
df[cate_col].describe() 
    # count: non-null entries
    # unique: number of distinct values
    # top: most frequent category
    # freq: number of occurrences of the top category

In [None]:
# unique factors
df[cate_col].unique()

In [None]:
# filtering by species
indices = df[cate_col] == 'typeA'

typeA = df.loc[indices, :]  # extracting new DataFrame

typeA[cate_col].unique() # expect only one value

typeB = df.loc[df[cate_col] == 'typeB', :]
typeC = df.loc[df[cate_col] == 'typeC', :]

In [None]:
# plot numerical columns separated by categorical values
typeA.plot(kind = 'hist', bins = 50, range = (0, 8), alpha = 0.3)

typeB.plot(kind = 'hist', bins = 50, range = (0, 8), alpha = 0.3)

typeC.plot(kind = 'hist', bins = 50, range = (0, 8), alpha = 0.3)

    # this produces one plot per category

# 3 using pandas to model time series

time indexes, resampling

## 3.1 indexing time series

In [None]:
# use pandas to read datetime objects
df = pd.read_csv(filepath, 
            parse_dates = True, # try to parse the index (set by index_col) into datetime objects
            index_col = 'date' # name of the column in the source file to use as the index
           )

In [None]:
# Partial datetime string selection
df.loc['2015-02-19 11:00:00', 'colA'] # selecting single cell

# selecting the whole day
sales.loc['2015-2-5']

sales.loc['2015-2'] # Whole month
sales.loc['2015'] # Whole year

In [None]:
# Slicing using dates/times
sales.loc['2015-2-16':'2015-2-20']

In [None]:
# Convert strings to datetime
evening_2_11 = pd.to_datetime(['2015-2-11 20:00', 
                               '2015-2-11 21:00', 
                               '2015-2-11 22:00', 
                               '2015-2-11 23:00'])

sales.reindex(evening_2_11) # conform sales to the new DatetimeIndex; labels missing from sales become NaN rows

In [None]:
# Filling missing values

# reindex with [dateA, dateB, dateC, dateD]:
# this re-selects / re-arranges rows to match the new index; it does not relabel existing rows.
# If the source has no dateB or dateC row,
# the new df will have NaN values in the dateB and dateC rows

sales.reindex(evening_2_11, method='ffill') # forward fill: take the value from the last earlier row

sales.reindex(evening_2_11, method='bfill') # backward fill: take the value from the next later row

## 3.2 resampling time series data

In [None]:
sales = pd.read_csv('sales-feb-2015.csv',
                    parse_dates=True,
                    index_col= 'Date')

In [4]:
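# resampling: change the time scale at which the data is viewed
    # down-sampling: step back and view the data in larger time units
    # up-sampling: step closer and view the data in finer time units
    
# resampling is used together with aggregation functions
    # mean(), sum(), median(), count(), etc.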

In [None]:
#  Aggregating 
daily_mean = sales.resample('D').mean()

sales.resample('D').sum()

sales.resample('W').count()

In [None]:
# Method chaining
sales.resample('D').sum().max()

resampling frequencies  
![](http://note.youdao.com/yws/public/resource/5875cbf527e0463ea3b90059c233b97c/xmlnote/WEBRESOURCE51e6e58394e4d043093e6fea984ccf3a/30104)

In [None]:
# multiplying frequencies
sales.loc[:,'Units'].resample('2W').sum() # one group per two-week period

In [None]:
# upsampling and filling 
two_days.resample('4H').ffill() # one bin every 4 hours; newly created rows are forward-filled

## 3.3 manipulating time series data

In [None]:
sales = pd.read_csv(filepath, 
                    parse_dates = ['Date'] # specify the column containing date str
                   )

In [None]:
# string methods

sales[colA].str.upper() # turn vals in colA into upper case

sales[colB].str.contains(some_word) # get a Series of boolean

sales[colB].str.contains(some_word).sum() # the sum of 'True'

In [None]:
# Datetime methods

sales['Date'].dt.hour  # extract the hour from each datetime value

central = sales['Date'].dt.tz_localize('US/Central')  # attach a timezone to naive datetimes
central.dt.tz_convert('US/Eastern')  # convert to another timezone

# method chaining: same result as the two steps above (column is 'Date', as used throughout this cell)
sales['Date'].dt.tz_localize('US/Central').dt.tz_convert('US/Eastern')

In [None]:
# interpolate missing data

    # this data set contains world population every 10 years
population = pd.read_csv('world_population.csv',
                         parse_dates=True,
                         index_col='Date')

    # rather than ffill or bfill,
    # fill the missing values by linear interpolation.
    # first() must be CALLED: it turns the Resampler into a DataFrame
    # (with NaN in the newly created yearly rows), which interpolate() then fills
population.resample('A').first().interpolate('linear')

## 3.4 time series visualization

* line types 
* plot types 
* subplots

In [None]:
sp500.loc['2012-4', 'Close'].plot(style='k.-',
                                  title='S&P500')

* line types options of pd.Series.plot()

![](http://note.youdao.com/yws/public/resource/5875cbf527e0463ea3b90059c233b97c/xmlnote/WEBRESOURCEb1287b52558aa1bc40d6ee62091a794d/30108)

In [None]:
# kind option, kind = 'area'
sp500['Close'].plot(kind='area', title='S&P 500')

In [None]:
# plot multiple columns 
sp500.loc['2012', ['Close','Volume']].plot() # two cols in one plot

sp500.loc['2012', ['Close','Volume']].plot(subplots=True) # two cols in separate plots
                                           