In [None]:
# check which pandas version is installed
import pandas
pandas.__version__

In [None]:
# alias np (NumPy is used from the missing-values section onward)
import numpy as np
# alias pd
import pandas as pd


### Pandas data objects: Series, DataFrame, and Index.
NumPy identifies rows and columns by integer indices; pandas identifies them by labels.

#### Series

In [None]:
# A Series is a one-dimensional array whose entries carry an index
s1 = pd.Series(data=[5, 3, 1])
s1

In [None]:
# A Series pairs an index (printed first) with an array of values (shown as the cell result)
print(s1.index)
s1.values

In [None]:
s1[1] # with the default integer index this reads just like NumPy array indexing

In [None]:
# The same values, now labelled with an explicit string index
s2 = pd.Series(
    data=[5, 3, 1],
    index=["first", "second", "third"],
)
s2

In [None]:
s2['second'] # label-based access, like looking up a key in a dictionary

In [None]:
# slicing by labels; note that the end label ("third") is included in the result
s2["first":"third"]

In [None]:
# Build a Series from a dictionary: the keys become the index, the values the data
s3 = pd.Series(data={"first": 1, 2: "second"})
s3

#### DataFrame

In [None]:
# A Series behaves like a one-dimensional array with flexible indices;
# a DataFrame is the two-dimensional analogue, with flexible row indices and column names.

# Two Series that share the same row labels
s4 = pd.Series({'person1': 7, 'person2': 1, 'person3': 5, 'person4': 3})
s5 = pd.Series({'person1': 9, 'person2': 4, 'person3': 11, 'person4': 8})
# A DataFrame built as a dictionary of Series that have the same index
df1 = pd.DataFrame(data={'Age': s4, 'Weight': s5})
df1

In [None]:
# row labels and column names of the DataFrame
print(df1.index)
print(df1.columns)
df1['Age']  # selecting a single column by its name

### Data Selection: loc and iloc

In [None]:
# Series: walk through it as (label, value) pairs
[(label, value) for label, value in s2.items()]

In [None]:
# A Series whose explicit integer labels (2, 3, 4) differ from the positions (0, 1, 2)
s6 = pd.Series(data=list('abc'), index=[2, 3, 4])
s6

In [None]:
s6[2] # explicit index of the series: the label 2, not the third position

In [None]:
# slicing with plain [] uses the implicit (positional) index,
# so this starts at the third element rather than at the label 2
s6[2:]


In [None]:
s6.loc[2:] # slicing with loc uses the explicit index (starts at the label 2)

In [None]:
# loc looks up a single element by its explicit label
s6.loc[2]

In [None]:
# iloc uses the implicit (positional) index: position 2 is the third element.
# Only the last expression of a cell is displayed, so the value is printed
# explicitly; that way both the element and its type are shown.
print(s6.iloc[2])
type(s6.iloc[2])

In [None]:
# A positional slice with iloc; it is printed explicitly because only the
# last expression of a cell is displayed, and the type follows as the result.
print(s6.iloc[2:])
type(s6.iloc[2:])

In [None]:
df1.Age # attribute-style column access, same as df1["Age"]

In [None]:
print(df1.values)  # the underlying two-dimensional array of values
# keys is a method and has to be called; without the parentheses the cell
# would only display the bound method. For a DataFrame it returns the column labels.
df1.keys()

In [None]:
df1.T # Transpose: rows and columns are swapped

In [None]:
# Passing a single index to the underlying array returns a row
df1.values[1]

In [None]:
# Passing a single label to a DataFrame returns a column
df1["Age"]

In [None]:
# implicit (positional) index with iloc: rows before position 2, columns before position 1
df1.iloc[:2,:1 ]

In [None]:
# explicit row labels and column names with loc
df1.loc[:'person2', :'Weight'] # unlike the implicit (positional) slice, the end label is included

In [None]:
# Indexing with [] selects columns (e.g., df1['Age']), whereas slicing with [] selects rows
df1["person1":'person3']

In [None]:
df1[0:3] # row slicing with implicit (positional) row numbers

In [None]:
# A Boolean mask selects rows: keep the rows where Age > 1
df1[df1.Age > 1]

### Missing Values

In [None]:
# Python has the None object.
# NumPy (np) is imported together with the other imports at the top of the notebook.
R1 = np.array([4, None, 5])
R1 # An array has only one data type. Here dtype=object is the common type

In [None]:
# NaN (Not a Number) is a special floating-point value defined by the IEEE 754 standard
R2 = np.array([4, np.nan, 5])
print(R2)
R2.dtype

In [None]:
# Arithmetic that involves NaN yields NaN, and so does a plain sum over the array
print(5 + np.nan)
print(5*np.nan)
print(R2.sum())
np.nansum(R2)  # NaN-aware sum: the NaN entries are ignored

In [None]:
# Pandas: both np.nan and None are treated as missing, and the integers
# are upcast to a floating-point type so that the NA can be represented
s7 = pd.Series(data=[5, np.nan, 7, None])
s7

In [None]:
# Boolean mask that is True where a value is missing
s7.isnull() 

In [None]:
# keep only the non-missing entries by masking with notnull()
s7[s7.notnull()]

In [None]:
# dropping NAs: dropna() returns the Series without its missing entries
s7.dropna()

In [None]:
# Missing values inside a DataFrame: one NaN in the second row, one in the third
df2 = pd.DataFrame(
    data=[
        [5, 7, 2],
        [9, np.nan, 1],
        [3, 8, np.nan],
    ]
)
df2

In [None]:
# Only whole rows or whole columns can be dropped, not single values
df2.dropna() # default is row-wise

In [None]:
df2.dropna(axis =1) # column-wise
# how='any' is the default; how='all' drops a column only when all of its elements are null

In [None]:
# Filling NAs: replace every missing value with 0
df2.fillna(0)

In [None]:
# DataFrame.ffill()/bfill() replace fillna(method=...), which recent pandas deprecates.
# Forward fill: each NaN takes the last valid value above it in the same column.
print(df2.ffill())
# Backward fill along axis=1: each NaN takes the next valid value to its right in the same row.
print(df2.bfill(axis=1))
# The fills return new frames; df2 itself is displayed unchanged.
df2

### Aggregation

In [None]:
# Seeded generator so that the random columns, and every aggregate computed
# from them below, come out the same on each Restart & Run All.
rng = np.random.default_rng(42)
# Two categorical key columns (A, B) and two columns of standard-normal draws (C, D)
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C' : rng.standard_normal(8),
                   'D' : rng.standard_normal(8)})
    

In [None]:
# display the DataFrame df
df

In [None]:
# Split the rows of df into groups by the value in column 'A'
grouped = df.groupby('A')

# Displaying the result shows the GroupBy object itself, not a table
grouped

In [None]:
# aggregate each group with sum
grouped.sum() # same as df.groupby('A').sum()

In [None]:
# pull out the rows that belong to a single group ('foo')
df.groupby('A').get_group('foo')

In [None]:
# Pass the aggregation by name: handing NumPy callables such as np.sum
# to agg is deprecated in recent pandas and emits a FutureWarning.
grouped.agg("sum")

In [None]:
# number of rows in each group
grouped.size()

In [None]:
# summary statistics for each group
grouped.describe()
# (in IPython, pandas.DataFrame.describe? shows the documentation of describe)

In [None]:
# Multiple functions: a list of aggregation names gives one output column per function.
# String names are used instead of np.sum/np.mean/np.std, because passing
# NumPy callables to agg is deprecated in recent pandas.
grouped['C'].agg(['sum', 'mean', 'std'])

### Reading Data Files into Pandas
Reference: (http://nbviewer.jupyter.org/github/justmarkham/pandas-videos/blob/master/pandas.ipynb)

In [None]:
# read tab-separated data into a pandas DataFrame; the data is Chipotle orders
orders = pd.read_table('http://bit.ly/chiporders')

In [None]:
orders.head()  # head is a method, so it is called with parentheses

In [None]:
# IPython-only help syntax: a trailing ? shows the documentation of pd.read_table
pd.read_table?

In [None]:
# Let us read data on movie users. The file is separated by a pipe ('|'),
# but no sep is passed here, so read_table falls back to its default separator (tab)
users = pd.read_table('http://bit.ly/movieusers')

In [None]:
# preview the first rows of the result
users.head()

In [None]:
# Pass the pipe separator explicitly; the first row of the file is still read as column names
users = pd.read_table('http://bit.ly/movieusers', sep='|')
users.head()

In [None]:
# header=None: do not take the column names from the first row of the file
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None)
users.head()

In [None]:
# supply the column names explicitly through names=
user_cols = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_table('http://bit.ly/movieusers', sep='|', header=None, names=user_cols)
users.head()

In [None]:
# bracket notation selects a column by its name
users['occupation']

In [None]:
# dot notation selects the same column as an attribute
users.occupation

In [None]:
users.shape # shape is an attribute, not a method, so it takes no parentheses

In [None]:
# data type of each column
users.dtypes

In [None]:
# summary statistics (by default only for the numeric columns)
users.describe()

In [None]:
# include='all' summarizes the columns of every data type
users.describe(include='all')

In [None]:
# UFO reports data, separated by commas
ufo = pd.read_table('http://bit.ly/uforeports', sep=',')

# read_csv uses a comma as its default separator; this loads the same file
# again and overwrites the ufo frame assigned above
ufo = pd.read_csv('http://bit.ly/uforeports')

In [None]:
# Dot notation cannot be used to define a new column:
# ufo.Location = ufo.City + ', ' + ufo.State will not work

# You have to use bracket notation to define the new Series name
ufo['Location'] = ufo.City + ', ' + ufo.State
ufo.shape

In [None]:
# how many rows there are for each State value
ufo.State.value_counts()

In [None]:
# distinct values of the "Shape Reported" column (bracket notation is needed because the name contains a space)
ufo["Shape Reported"].unique()

In [None]:
# how many rows there are for each reported shape
ufo["Shape Reported"].value_counts()