# Series
ðŸŽ¯A one-dimensional array-like object containing a sequnce of values and associated array of data labels, called its index

In [362]:
import pandas as pd

In [363]:
obj = pd.Series([4,7,-5,3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [364]:
# Get the array representation and index object of the Series
obj.values

array([ 4,  7, -5,  3])

In [365]:
obj.index

RangeIndex(start=0, stop=4, step=1)

In [366]:
# Customize labels - index parameter
obj2 = pd.Series([4,7,-5,3], index = ['d','b','a','c'])
obj2

d    4
b    7
a   -5
c    3
dtype: int64

In [367]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [368]:
# Use labels in the indexing to get the single value or set of values
print(obj2['a'])

-5


In [369]:
print(obj2['d'])

4


In [370]:
print(obj2[['c','a','d']])

c    3
a   -5
d    4
dtype: int64


In [371]:
# Filtering with a boolean array
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

In [372]:
# Scalar multiplication
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

In [373]:
import numpy as np
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [374]:
# Mapping of indexing values to data values
'b' in obj2

True

In [375]:
'e' in obj2

False

In [376]:
# Convert Python dicttionary to Series
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [377]:
obj3 = pd.Series(sdata)

In [378]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [379]:
# Overide the dict keys in the order we want then to appear
states = ['California','Ohio','Oregon','Texas']
obj4 = pd.Series(sdata, index = states)

In [380]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [381]:
# Missing or NA values detect
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [382]:
# Or
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [383]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [384]:
# Or
obj4.notnull()

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [385]:
""" A useful Series feature for many applications is that it automatically
aligns by index label in arithmetic operations."""

obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [386]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [387]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [388]:
# Both the Series object itself and its index have a name attribute

obj4.name = 'population'

In [389]:
obj4.index.name = 'state'

In [390]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

In [391]:
obj4.name

'population'

In [392]:
obj4.index.name

'state'

# DataFrame
ðŸŽ¯ðŸŽ¯A DataFrame represents a rectangular table of data and contains an ordered colection of columns, each of which can be a diiferent value type. It has both row and column index

In [393]:
# Construct a DataFrame from a dict of equal-length lists or NumPy arrays (common)
data = {'state': ['Ohio','Ohio','Ohio','Nevada','Nevada','Nevada'],
        'year': [2000,2001,2002,2001,2002,2003],
        'pop': [1.5, 1.7,3.6,2.4,2.9,3.2]
       }

frame = pd.DataFrame(data)

In [394]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [395]:
# For learge DataFrames, the head() method selects only the first five rows
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [396]:
# We can pass how many rows we want as parameters
frame.head(3)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6


In [397]:
# For learge DataFrames, the tail() method selects only the last five rows
frame.tail()

Unnamed: 0,state,year,pop
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [398]:
# We can pass how many rows we want as parameters
frame.tail(3)

Unnamed: 0,state,year,pop
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [399]:
# Manage sequence of columns according to custom order - using columns parameter
pd.DataFrame(data, columns = ['year','state','pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [400]:
# If we pass a column that isn't contained in the dict, it will appear with missing values
frame2 = pd.DataFrame(data, columns = ['year','state','pop','debt'], index = ['one','two','three','four','five','six'])

In [401]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [402]:
# Columns in the DataFrame
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [403]:
# Shape of the DataFrame
frame2.shape

(6, 4)

In [404]:
# Information about the DataFrame
frame2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, one to six
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    6 non-null      int64  
 1   state   6 non-null      object 
 2   pop     6 non-null      float64
 3   debt    0 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 240.0+ bytes


In [405]:
# Statistical summary
frame2.describe()

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [406]:
# Retrieved a column as a series - dict-like notation
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [407]:
# Accessing like object atribute
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [408]:
# Rows can also be retrieved by position or name with the special loc attribute
frame2.loc['one']

year     2000
state    Ohio
pop       1.5
debt      NaN
Name: one, dtype: object

In [409]:
# Columns can be modified by assignment
frame2['debt'] = 16.5

In [410]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [411]:
""" When assigning lists or assarys to a column, the values length must match the length of the DataFrame."""
frame2['debt'] = np.arange(6.)

In [412]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [413]:
# Assign valus to specific rows - using Series
values = pd.Series([-1.2,-1.5,-1.7], index = ['two','four','five'])

In [414]:
# If you assign a Series, its labels will be realigned exactly to the DataFrame's index. Inserting missing values in any holes
frame2['debt'] = values

In [415]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [416]:
# Creatin a new column
frame2['eastern'] = frame2.state == 'Ohio'

In [417]:
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


In [418]:
# Delete method can be used to remove column
del frame2['eastern']

In [419]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [420]:
# DataFrame from dict of dicts
pop = {'Nevada':{2001:2.4, 2002:2.9}, 'Ohio':{2000: 1.5, 2001: 1.7, 2002: 3.6}}

In [421]:
# If the nested dict is passed to the DataFrame, pandas will interpret the outer dict kes as the columns and the inner keys as the row indices
frame3 = pd.DataFrame(pop)

In [422]:
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [423]:
# DataFrame can be transposed like NumPy array
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [424]:
pd.DataFrame(pop, index = [2000, 2001, 2002, 2003])

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [425]:
frame3.index

Index([2001, 2002, 2000], dtype='int64')

In [426]:
frame3.values
frame3

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [427]:
pdata = {'Ohio': frame3['Ohio'][:-1],
        'Nevada': frame3['Nevada'][:2]}

In [428]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [429]:
# Set the DataFrame name, column name, and index name
frame3.name = 'population'
frame3.index.name = 'yaer'
frame3.columns.name = 'state'

In [430]:
frame3.name

'population'

In [431]:
frame3.index.name

'yaer'

In [432]:
frame3.columns.name

'state'

In [433]:
frame3

state,Nevada,Ohio
yaer,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [434]:
# As with Series, the values attribute returns the data conatined in the DataFrame as a two-dimensional ndarray
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [435]:
# If the DataFrame's columns are different dtypes, the dtype of the values array will be chosen to accommodate all of the the columns
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

# Index Objects
ðŸŽ¯Pandas's Index objects are responsible for holding the labels and other metadeta (like the axis name or names)

In [436]:
obj = pd.Series(range(3), index = ['a','b','c'])

In [437]:
index = obj.index

In [438]:
index

Index(['a', 'b', 'c'], dtype='object')

In [439]:
index[1:]

Index(['b', 'c'], dtype='object')

In [440]:
# Index objects are immutable and thus can not be modified by the user
index[1] = 'd'

TypeError: Index does not support mutable operations

In [None]:
# Immutability makes it safer to share Index objects among data structure
labels = pd.Index(np.arange(3))

In [None]:
labels

In [None]:
obj2 = pd.Series([1.5, -2.5, 0], index = labels)

In [None]:
obj2

In [None]:
obj2.index is labels

In [None]:
idx = pd.Index(['a','b','c'])

In [None]:
s1 = pd.Series([1,2,3], index = idx)

In [None]:
s2 = pd.Series([4,5,6], index = idx)

In [None]:
# Both s1 and s share the same index object
s1.index is s2.index

In [None]:
# An Index also behaves like a fixed-size set
frame3

In [None]:
frame3.columns

In [None]:
# Check if a column exists in the columns
'Nevada' in frame3.columns

In [None]:
# Check if an index exists in the row labels/indices
2003 in frame3.index

In [None]:
# Unlike Python sets, a pandas Index can contain duplicate labels
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])

In [None]:
dup_labels

In [None]:
# Selection with duplicate labels will select all the occurrence of that label
s = pd.Series([1,2,3,4], index = dup_labels)

In [None]:
s.loc['foo']

# Essential Functionality


### Reindexing
ðŸŽ¯An important method on pandas objects is reindex, which means to create a new object with the data confirmed to a new index

In [None]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index = ['d', 'b','a', 'c'])

In [None]:
obj

In [None]:
# Calling reindex on this Series rearranges the data according to the new index, introducing missing values if any values were not already present
obj2 = obj.reindex(['a','b','c','d','e'])

In [None]:
obj2

In [None]:
# We can use the method parameter to fill the missing values automatically
obj3 = pd.Series(['blue','purple','yellow'], index = [0,2,4])

In [None]:
obj3

In [None]:
# method parameter can only be used when the index is ordered - meaning the index must have a clear order so Pandas knows what 'forward/backward' means
obj3.reindex(range(6), method = 'ffill')

In [None]:
obj3.reindex(range(5), method = 'bfill')

In [None]:
# With DataFrame, reindex can alter the (row) index, columns, or both. When passed only a sequence, it reindexes the rows in the result
frame = pd.DataFrame(np.arange(9).reshape(3,3), index = ['a','c','d'], columns = ['Ohio','Texas','California'])

In [None]:
frame

In [None]:
frame2 = frame.reindex(['a','b','c','d'])

In [None]:
frame2

In [None]:
# The columns can be reindexed with the columns keyword
states = ['Texas', 'Utah', 'California']

In [None]:
frame.reindex(columns = states)

### Dropping Entries from an Axis
ðŸŽ¯The drop method is used to drop entries. It will return a new object with theindicated value or values deleted from an axis

In [None]:
obj = pd.Series(np.arange(5.), index = ['a','b','c','d','e'])

In [None]:
obj

In [None]:
new_obj = obj.drop('c')

In [None]:
new_obj

In [None]:
obj.drop(['d','c'])

In [None]:
# With DataFrame, index values can be deleted from either axis
data = pd.DataFrame(np.arange(16).reshape(4,4), index = ['Ohio','Colorado','Utah','New York'], columns = ['one','two','three','four'])

In [None]:
data

In [None]:
# Calling drop with a sequence of labels will drop values from the row labels (axis 0)
data.drop(['Colorado','Ohio'])

In [None]:
data.drop(['Colorado','Ohio'], axis = 0)

In [None]:
# We acn drop values from columns by passing axis = 1 or axis = 'columns'
data.drop('two', axis = 1)

In [None]:
data.drop(['two','four'], axis = 'columns')

In [None]:
# Many functions, like drop, which modify the size or shape of a Series or DataFrame, can manipulate an object in-place without returning a new object
obj.drop('e', inplace = True)

In [None]:
obj

# Indexing, Selection, and Filtering

In [None]:
# Series ndexing works analogously to NumPy array indexing
obj = pd.Series(np.arange(4.), index = ['a','b','c','d'])

In [None]:
obj

In [None]:
print(obj['b'])

In [None]:
obj[['a','c','d']]

In [None]:
obj[obj > 2]

In [None]:
# Slicing with labels behaves differently than normal Python slicing in that the end-point is inclusive
obj['b':'c']

In [None]:
# Setting using these methods modifies the corresponding section of the series
obj['b':'c'] = 5

In [None]:
obj

In [None]:
# Indexing into a DataFrame is for retrieving one or more columns either with a single value or sequence
data = pd.DataFrame(np.arange(16).reshape(4,4), index = ['Ohio','Colorado','Utah','New York'], columns = ['one','two','three','four'])

In [None]:
data

In [None]:
data['one']

In [None]:
data[['one','three','four']]

In [None]:
# The row selection with slicing
data[:2]

In [None]:
# Indexing with a boolean data
data[data['three'] >5]

In [None]:
# Indexing with a boolean DataFrame
data < 5

In [None]:
data[data < 5] = 0

In [None]:
data

### Selection with loc and iloc
ðŸŽ¯For DataFrame label-indexing on the rows, introduced special indexing operators loc and iloc. they enable to select a subset of the rows and columns from a DataFrame

In [None]:
data

In [None]:
data.loc[['Ohio']]

In [None]:
data.loc[['Colorado','New York']]

In [None]:
data.loc[['New York'], ['one']]

In [None]:
data.loc[['Ohio','Colorado','Utah'],['one','three','four']]

In [None]:
data.loc[['Ohio'],['one','three','four']]


In [None]:
data.iloc[0]

In [None]:
data.iloc[[0]]

In [None]:
data.iloc[[0,1,2],[1]]

In [None]:
data.iloc[[1,2,3],[0,1,2]]

In [None]:
data.iloc[:,2]

### Integer Indexs

In [None]:
ser = pd.Series(np.arange(3.))

In [None]:
ser[-1]

In [None]:
ser2 = pd.Series(np.arange(3.), index = ['a','b','c'])

In [None]:
print(ser2[-1])

In [None]:
ser[:1]

In [None]:
ser.loc[:1]

In [None]:
ser.iloc[:1]

### Arithmetic and Data Alignment

In [None]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a','b','c','d'])

In [None]:
s1

In [None]:
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1], index = ['a','c','e','f','g'])

In [None]:
s2

In [None]:
""" The internal data alignment introduces missing values in the label locations that do not overlap. Missing values will
then propagate in further arithmetic computations. """
# Adding these together yields:
s1 + s2

In [None]:
# In this case of DataFrame, alignment is performed on both the rows and the columns
df2 = pd.DataFrame(np.arange(9.).reshape((3,3)), columns = list('bcd'), index = ['Ohio','Texas','Colorado'])

In [None]:
df3 = pd.DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'), index = ['Utah','Ohio','Texas','Oregon'])

In [None]:
df2

In [None]:
df3

In [None]:
# Since the 'c' and 'e' columns are not found in both DataFrame objects, they appear as all missing in the result
df2 + df3

In [None]:
# If we add DataFrame objects with no column or row labels in common, the result will contain all nulls
df1 = pd.DataFrame({'A':[1,2]})

In [None]:
df2 = pd.DataFrame({'B':[3,4]})

In [None]:
df1

In [None]:
df2

In [None]:
df1 - df2

### Arithmetic Methods with Fill Values

In [None]:
df1 = pd.DataFrame(np.arange(12.).reshape((3,4)), columns = list('abcd'))

In [None]:
df2 = pd.DataFrame(np.arange(20.).reshape((4,5)), columns = list('abcde'))

In [None]:
df1

In [None]:
df2.loc[1,'b'] = np.nan

In [None]:
df2

In [None]:
# Adding these together results in NA values in the locations that do not overlap
df1 + df2

In [None]:
# Using the add() method on df1, we passed df2 and an argument to fill_value
df1.add(df2, fill_value = 0)

In [None]:
df1.sub(df2, fill_value = 0)

In [None]:
df1.mul(df2, fill_value = 0)

In [None]:
df1.pow(df2, fill_value = 0)

In [None]:
# Relatedly, when reindexing a Series or DataFrame, you can also specify a different fill value
df1.reindex(columns = df2.columns, fill_value = 0)

### Operations Between DataFrame and Series

In [None]:
""" As with NumPy arrays of different dimensions, arithmetic between DataFrame and Series is also deined - broadcasting """
arr = np.arange(12.).reshape((3,4))

In [None]:
arr

In [None]:
arr[0]

In [None]:
arr.shape

In [None]:
arr[0].shape

In [None]:
arr - arr[0]

In [None]:
# We can do the same with a DataFrame and a Series
frame = pd.DataFrame(np.arange(12.).reshape((4,3)), columns = list('bde'), index = ['Utah','Ohio','Texas','Oregon'])

In [None]:
series = pd.Series(frame.iloc[0])

In [None]:
frame

In [None]:
series

In [None]:
series.shape

In [None]:
# By default DataFrame and Series matches the index of the Series on the DataFrame's columns, broadcasting down the rows
frame - series

In [None]:
# If an index value is not found in either the DataFrame's columns or the Series's index, the objects will be reindex to form the union
series2 = pd.Series(range(3), index = ['b','e','f'])

In [None]:
series2

In [None]:
frame + series2

In [None]:
# If we want to instead broadcast over the columns, matching on the rows, we have to use one of the arithmetic methods
series3 = frame['d']

In [None]:
series3

In [None]:
frame.sub(series3, axis = 'index')

### Function Application and Mapping

In [None]:
# Numpy ufuncs (element-wise array methods) also work with pandas
frame = pd.DataFrame(np.random.randn(4,3), columns = list('bde'), index = ['Utah','Ohio','texas','Oregon'])

In [None]:
frame

In [None]:
np.abs(frame)

In [None]:
# Another frequent operation is applying a function on one-dimensional arrays to each column or row, DataFrame's apply method does exactly this
f = lambda x : x.max() - x.min()

In [None]:
frame.apply(f)

In [None]:
frame

In [None]:
# If we pass axis = 'columns' to apply, the function will be invoked once per row instead
frame.apply(f, axis = 'columns')

In [None]:
def f(x):
    return pd.Series([x.max(), x.min()], index = ['max', 'min'])
    

In [None]:
frame.apply(f)

In [None]:
format = lambda x : '%.2f' % x

In [None]:
frame['e'].map(format)

### Sorting and Ranking

In [None]:
# Sorting a dataset by some criterion is another important built-in operation. To sort lexicographically by row or column index, use the sort_index method
obj = pd.Series(range(4), index = list('dabc'))

In [None]:
obj

In [None]:
# Returns a new, sorted object
obj.sort_index()

In [None]:
obj

In [None]:
# With a DataFrame, we can sort by index on either axis
frame = pd.DataFrame(np.arange(8).reshape((2,4)), index = ['three','one'], columns = list('dabc'))

In [None]:
# By default axis = 0 (vertically)
frame.sort_index()

In [None]:
# Along columns
frame.sort_index(axis = 'columns')

In [None]:
# Descending order
frame.sort_index(axis = 1, ascending = False)

In [None]:
# To sort a Series by its valus, use its sort_values method
obj = pd.Series([4,7,-3,2])

In [None]:
obj.sort_values()

In [None]:
# Any missing values are sorted to the end of the Series by default
obj = pd.Series([4, np.nan, 7, np.nan])

In [None]:
obj.sort_values()

In [None]:
# When sorting a DataFrame, we can use the data in one or more columns as the sort keys. To do so, pass one or more column names to the 'by' option of sort_values
frame = pd.DataFrame({'b':[4,7,-3,2],'a':[0,1,0,1]})

In [None]:
frame

In [None]:
frame.sort_values(by = 'b')

In [None]:
frame.sort_values(by = ['a','b'])

In [None]:
# Ranking assigns ranks from one through the number of valid data points in an array.By defualt rank breaks ties by assigning each group the mean rank
obj = pd.Series([7,-5,7,4,2,0,4])

In [None]:
obj.rank()

In [None]:
# Ranks can aslo be assigned according to the order in which they're observed in the data
obj.rank(method = 'first')

In [None]:
# Rank in descendding order
obj.rank(ascending = False, method = 'max') # Assing tie values the maximum rank in the group

In [None]:
# DataFrame can compute ranks over the rows or thee columns
frame = pd.DataFrame({'b':[4.3,7,-3,2],'a':[0,1,0,1],'c':[-2,5,8,-2.4]})

In [None]:
frame

In [None]:
frame.rank(axis = 'columns')

In [None]:
frame.rank()

In [None]:
obj.rank(method = 'max')

In [None]:
obj.rank(method = 'min')

In [None]:
obj.rank(method = 'dense')

### Axis Indexes with Duplicate Labels

In [None]:
obj = pd.Series(range(5), index = list('aabbc'))

In [None]:
obj

In [None]:
# The indexs's is_unique property can tell whether its labels are unique or not
obj.index.is_unique

In [None]:
# Indexing a label with multiple entris returns a Series, while single entries return scalar value
obj['a']

In [None]:
obj['c']

In [None]:
df = pd.DataFrame(np.random.randn(4,3), index = list('aabb'))

In [None]:
df

In [None]:
df.loc['a']

In [None]:
df.loc['b']

## Summarizing and Computing Descriptive Statistics
ðŸ”¢Pandas objects are equiped with a set of common mathematical and statistical  methods. Most of these fall into the category of reductions or summary statistics, methods that extract a single value (like sum or mean) from a Series or a Series of values from rows or columns of a DataFrame

In [None]:
df = pd.DataFrame([[1.4, np.nan],[7.1, -4.5],[np.nan, np.nan], [0.75, -1.3]], index = list('abcd'), columns = ['one', 'two'])

In [None]:
df

In [None]:
df.shape

In [None]:
# Calling DataFrame's sum method returns a Series conatining columns sums
df.sum()

In [None]:
# Along the columns
df.sum(axis = 'columns')

In [None]:
df.mean()

In [None]:
df.mean(axis = 1)

In [None]:
# Disable skipna option
df.mean(skipna = False)

In [None]:
df.mean(axis = 1, skipna = False)

In [None]:
# Some methods, like idxmax and idxmin, return indirect statistics like the index value where the maximum or minimum values are attainde
df.idxmax()

In [None]:
df.idxmin()

In [None]:
# Cumulative sum
df.cumsum()

In [None]:
df.cumsum(axis = 1)

In [None]:
""" Another type of method is neither a reduction nor an accumulation. Descire is one such example, producing multiple summary statistics in one shot. """
df.describe()

In [None]:
# On non-numeric data, describe produces alternative summary statistics
obj = pd.Series(['a','a','b','c'] * 4)

In [None]:
obj

In [None]:
obj.describe()

![{688C85A1-021E-4EBF-96AC-3D8877B03EE2}.png](attachment:ca2ada2e-d3f5-4a11-a589-245fbc5f2d4b.png)

In [None]:
df

In [None]:
# Count number of non-NaN values
df.count()

In [None]:
df.min()

In [None]:
df.max()

In [None]:
# Returns index position
obj.argmax()

In [None]:
obj.argmin()

In [None]:
# Compute sample quantile ranging from 0 to 1
df.quantile()

In [None]:
# Arithmetic median (50% quantile) of values
df.median()

In [None]:
# Product of all values
df.prod()

In [None]:
# Sample variance of values
df.var()

In [None]:
# Sample standard deviation of values
df.std()

In [None]:
# Sample skewness (third moment) of values
df.skew()

In [None]:
# Sample kurtosis (fourth moment) of values
df.kurt()

In [None]:
df.cummax()

In [None]:
df.cummin()

In [None]:
df.cumprod()

In [None]:
df.diff()

### Unique Values, Value Counts, and Membership

In [None]:
obj = pd.Series(['c','a','d','a','a','b','b','c','c'])

In [None]:
uniques = obj.unique()

In [None]:
uniques

In [None]:
counts = obj.value_counts()

In [None]:
counts

In [None]:
mask = obj.isin(['b','c'])

In [None]:
mask

In [None]:
# isin perfoms a vectorized set membership check and can be useful in filtering a dataset down to a subset of values in a Series or column in a DataFrame
obj

In [None]:
obj.isin(['b','c'])

In [None]:
mask = obj.isin(['a','d'])

In [None]:
obj[mask]

In [None]:
# Related to isin is the Index.get_indexer() method, which gives you an index array from an array of possible non-distinct values into another array of distinct values
to_match = pd.Series(['c','a','b','b','c','a'])

In [None]:
to_match

In [None]:
unique_values = pd.Series(['c','b','a'])

In [None]:
# It tells where each element of to_match appears ininside the index(unique_values) 
pd.Index(unique_values).get_indexer(to_match)

In [None]:
# In some cases, you may want to compute a histogram on multiple related columns in a DataFrame
data = pd.DataFrame({'Qu1':[1,3,4,3,4],'Qu2':[2,3,1,2,3],'Qu3':[1,5,2,4,4]})

In [None]:
data

In [None]:
result = data.apply(pd.value_counts).fillna(0)

In [None]:
result

# Data Loading, Storage, and File Formats

## Reading and Writing Data in Text Format
ðŸ“‹Pandas features a number of functions for reading tabular data as a DataFrame object.  
    ðŸ”¹read_csv: load delimited data from a file, URL, or file-like object; use comma as defult  
    ðŸ”¹read_table: load delimited data from a file, URL, or file-like object; use tab('\t') as default  
    ðŸ”¹read_fwf: read data in fixed-width column format (i.e., no delimiters)    
    ðŸ”¹read_clipboard: version of read_table that reads data from the     clipboard; useful for converting tables from web pages   
    ðŸ”¹read_excel  
    ðŸ”¹read_hdf  
    ðŸ”¹read_html  
    ðŸ”¹read_json  
    ðŸ”¹read_msgpack  
    ðŸ”¹raed_pickle  
    ðŸ”¹read_sas  
    ðŸ”¹raed_stata  
    ðŸ”¹raed_sql  
    ðŸ”¹read_feather  

In [None]:
%%writefile test.txt
Hello from Jupyter!


In [None]:
!type test.txt

In [None]:
%%writefile example1.csv
a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [None]:
!type example1.csv

In [None]:
# Read the file (csv)
df = pd.read_csv('example1.csv')

In [None]:
df

In [None]:
# We could also have used read_table and specified the delimiter
pd.read_table('example1.csv', sep = ',')

In [None]:
# A file will not always have header row 
%%writefile ex2.csv
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [None]:
!type ex2.csv

In [None]:
# If we do not mention the header, it will consider the first row as a header row
df1 = pd.read_csv('ex2.csv')

In [None]:
df1

In [None]:
# Instead we can specify the header by ourselves; otherwise pandas will automatically assign default header or  column names with help of header argument
df2 = pd.read_csv('ex2.csv', header = None)

In [None]:
df2

In [None]:
# Set custom column names with help of 'names' argument
df3 = pd.read_csv('ex2.csv', names = ['a','b','c','d','message'])

In [None]:
df3

In [None]:
# If we want a column to be the index of the DataFrame, we can specify that using the 'index_col' argument
df4 = pd.read_csv('ex2.csv', names = ['a','b','c','d','message'], index_col = 'message')

In [None]:
df4

In [None]:
df4.index.name

In [None]:
%%writefile ex3.csv
one,a,1,2
one,b,3,4
one,c,5,6
one,d,7,8
two,a,9,10
two,b,11,12
two,c,13,14
two,d,15,16

In [None]:
!type ex3.csv

In [None]:
# we can also set multiple columns as hierarchical index by passing a list of column
df5 = pd.read_csv('ex3.csv', names = ['key1','key2','value1','value2'])

In [None]:
df5

In [None]:
df6 = pd.read_csv('ex3.csv', names = ['key1','key2','value1','value2'], index_col = ['key1','key2'])

In [None]:
df6

In [None]:
%%writefile ex4.txt
                     A          B          C
 aaa  -0.264438  -1.026059  -0.619500
 ccc  -0.234565  -0.323453  -0.232454
 ddd  -0.564454  -0.982468  1.1089887

In [None]:
!type ex4.txt

In [None]:
# In some cases, a table might not have a fixed delimeter, using whitespace or some other pattern to separate fields
# In that cases, we can pass a regular expression as a delimiter for read_table
# The regular expression can be expressed by r\s+
df7 = pd.read_table('ex4.txt', sep = r'\s+')

In [None]:
df7

In [None]:
# Since the column name is less than the column, the first one would be treated as index
df7.loc[['aaa','ccc']]

In [None]:
%%writefile ex6.csv
# hey!
a,b,c,d,message
# just wanted to make things more difficult for you
# who reads CSV files with computer, anyway?
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [None]:
!type ex6.csv

In [None]:
# If we want we can skip any rows with help of the 'skiprows' argument
result = pd.read_csv('ex6.csv', skiprows = [0,2,3])

In [None]:
result

In [None]:
# Missing data is usually either not present (empty string) or marked by some sentinel value (NA,NULL)
%%writefile ex7.csv
something, a,b,c,d,message
one,1,2,3,4,NA
two,5,,7,8,world
three,9,10,11,12,foo

In [441]:
!type ex7.csv

something, a,b,c,d,message
one,1,2,3,4,NA
two,5,,7,8,world
three,9,10,11,12,foo


In [442]:
result = pd.read_csv('ex7.csv')

In [444]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2.0,3,4,
1,two,5,,7,8,world
2,three,9,10.0,11,12,foo


In [446]:
# We can handle this null value with help of the isnull() function
pd.isnull(result)

Unnamed: 0,something,a,b,c,d,message
0,False,False,False,False,False,True
1,False,False,True,False,False,False
2,False,False,False,False,False,False


In [450]:
# We can also handle missing data during read the file
# The 'na_values' argument let us add our own customize null values indicator
handle_missing_values = pd.read_csv('ex7.csv', na_values = ['NULL'])

In [452]:
handle_missing_values

Unnamed: 0,something,a,b,c,d,message
0,one,1,2.0,3,4,
1,two,5,,7,8,world
2,three,9,10.0,11,12,foo


In [462]:
sentinels = {'message':['foo', 'NA'], 'something':['two']}
result = pd.read_csv('ex7.csv', na_values = sentinels)

In [464]:
result

Unnamed: 0,something,a,b,c,d,message
0,one,1,2.0,3,4,
1,,5,,7,8,world
2,three,9,10.0,11,12,


### Reading Text Files in Pieces

In [466]:
# Before we look at a large file, we make the pandas display settings more compact
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10

In [468]:
# Load the csv file
data = pd.read_csv('large_dataset.csv')

In [474]:
data.shape

(10000, 100)

In [481]:
data.describe()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,...,col_96,col_97,col_98,col_99,col_100
count,10000.0,10000.0,10000.0,10000.0,10000.0,...,10000.0,10000.0,10000.0,10000.0,10000.0
mean,-0.003066,-0.019374,0.007803,-0.002584,-0.006957,...,0.000896,0.005818,-0.004236,0.012603,0.005118
std,0.996832,0.997986,0.999322,0.994109,1.006572,...,1.013223,0.986087,0.990394,1.001291,0.987723
min,-4.324064,-3.855302,-4.116778,-3.872791,-4.312963,...,-3.916507,-3.998375,-3.392818,-3.806094,-3.929995
25%,-0.673834,-0.691867,-0.672927,-0.668325,-0.686692,...,-0.677009,-0.651463,-0.670393,-0.660044,-0.651851
50%,-0.009166,-0.023667,0.00462,-0.008017,0.005648,...,-8.2e-05,0.006971,-0.002568,0.0126,0.005527
75%,0.667,0.665936,0.677233,0.670014,0.677124,...,0.69086,0.676797,0.665221,0.688661,0.683757
max,3.262381,3.790923,3.705894,3.91357,4.085309,...,3.792899,3.946971,4.274556,3.710545,3.652882


In [483]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 100 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   col_1    10000 non-null  float64
 1   col_2    10000 non-null  float64
 2   col_3    10000 non-null  float64
 3   col_4    10000 non-null  float64
 4   col_5    10000 non-null  float64
 5   col_6    10000 non-null  float64
 6   col_7    10000 non-null  float64
 7   col_8    10000 non-null  float64
 8   col_9    10000 non-null  float64
 9   col_10   10000 non-null  float64
 10  col_11   10000 non-null  float64
 11  col_12   10000 non-null  float64
 12  col_13   10000 non-null  float64
 13  col_14   10000 non-null  float64
 14  col_15   10000 non-null  float64
 15  col_16   10000 non-null  float64
 16  col_17   10000 non-null  float64
 17  col_18   10000 non-null  float64
 18  col_19   10000 non-null  float64
 19  col_20   10000 non-null  float64
 20  col_21   10000 non-null  float64
 21  col_22   100

In [485]:
# We can specify how many rows we want during reading the file
pd.read_csv('large_dataset.csv', nrows = 7)

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,...,col_96,col_97,col_98,col_99,col_100
0,-0.26398,-0.846611,1.453268,1.251567,2.39859,...,-0.366923,1.339828,0.012093,-0.49165,0.432837
1,1.258475,1.64203,-0.401637,-0.189602,-1.045877,...,0.292131,-0.163091,0.240092,0.954998,1.271104
2,-0.411287,-0.966434,-0.972459,0.17016,-0.658988,...,-0.494592,0.398968,1.262306,-0.268114,-0.328491
3,1.930533,-0.268566,-0.142144,1.215926,0.589795,...,0.782485,-0.063909,0.062339,0.209707,-1.061527
4,-0.693826,-0.224357,-0.907838,-0.548491,0.524646,...,-1.191671,0.045964,0.013986,0.236547,2.780695
5,0.505134,1.579838,0.524227,-2.020217,0.129764,...,-1.216672,0.963707,-1.444545,-0.7416,1.547811
6,-0.293944,-0.782067,-2.813539,0.256526,-1.034974,...,-1.051752,0.488756,1.074602,0.235672,1.524139


In [488]:
# using head method
data.head()

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,...,col_96,col_97,col_98,col_99,col_100
0,-0.26398,-0.846611,1.453268,1.251567,2.39859,...,-0.366923,1.339828,0.012093,-0.49165,0.432837
1,1.258475,1.64203,-0.401637,-0.189602,-1.045877,...,0.292131,-0.163091,0.240092,0.954998,1.271104
2,-0.411287,-0.966434,-0.972459,0.17016,-0.658988,...,-0.494592,0.398968,1.262306,-0.268114,-0.328491
3,1.930533,-0.268566,-0.142144,1.215926,0.589795,...,0.782485,-0.063909,0.062339,0.209707,-1.061527
4,-0.693826,-0.224357,-0.907838,-0.548491,0.524646,...,-1.191671,0.045964,0.013986,0.236547,2.780695


In [538]:
# If the file size is larger, we can use chunksize as number of rows to read data in pieces
chunker = pd.read_csv('large_dataset.csv', chunksize = 100)

In [539]:
chunker

<pandas.io.parsers.readers.TextFileReader at 0x1ff0f12a850>

In [540]:
for piece in chunker:
    print(type(piece))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
<class 'pand

In [551]:
chunker = pd.read_csv('large_dataset.csv', chunksize = 500)
series = pd.Series([])
for piece in chunker:
    series = series.add(piece['col_3'].value_counts(), fill_value = 0)

In [552]:
series

col_3
-4.116778    1.0
-3.659848    1.0
-3.602267    1.0
-3.474493    1.0
-3.418491    1.0
            ... 
 3.145004    1.0
 3.161427    1.0
 3.243044    1.0
 3.478796    1.0
 3.705894    1.0
Length: 10000, dtype: object