# Pandas

In [3]:
import pandas as pd
series = pd.date_range(11042019,14042019)
series

DatetimeIndex(['1970-01-01 00:00:00.011042019'], dtype='datetime64[ns]', freq='D')

# Series

In [6]:
import pandas as pd
obj = pd.Series([4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

# Values

In [3]:
 obj.values

array([ 4,  7, -5,  3], dtype=int64)

# Index

In [4]:
obj.index

RangeIndex(start=0, stop=4, step=1)

# 
To create a Series with an index identifying each data point
with a label

In [9]:
 obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
 obj2   

d    4
b    7
a   -5
c    3
dtype: int64

In [10]:
obj2.index

Index(['d', 'b', 'a', 'c'], dtype='object')

In [11]:
obj2['a']

-5

In [12]:
obj2['c']

3

In [13]:
obj2[['c','a','d']]

c    3
a   -5
d    4
dtype: int64

# All Python Operators can apply here

In [14]:
obj2[obj2 == 3]

c    3
dtype: int64

In [15]:
obj2*2

d     8
b    14
a   -10
c     6
dtype: int64

In [16]:
obj2**2

d    16
b    49
a    25
c     9
dtype: int64

In [18]:
import numpy as np
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

In [19]:
'b' in obj2

True

In [20]:
'f' in obj2

False

In [21]:
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
obj3 = pd.Series(sdata)
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

# Note :
 You can override this by passing the dict keys in the order you
want them to appear in the resulting Series

In [28]:
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

# “missing” or “NA” Data
The " isnull " and " notnull " functions in pandas should be used to detect missing data

In [29]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [30]:
 pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

# 
A useful Series feature for many applications is that it automatically aligns by index
label in arithmetic operations

In [31]:
obj3 + obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

# 
Both the Series object itself and its index have a name attribute, which integrates with
other key areas of pandas functionality

In [32]:
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

# 
A Series’s index can be altered in-place by assignment

In [33]:
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [34]:
obj.index = ['Bob', 'Steve', 'Jeff', 'Ryan']
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

# DataFrame

# 
There are many ways to construct a DataFrame, though one of the most common is
from a dict of equal-length lists or NumPy arrays

In [3]:
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
 'year': [2000, 2001, 2002, 2001, 2002, 2003],
 'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
frame = pd.DataFrame(data)
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


# 
For large DataFrames, the head method selects only the first five rows

In [37]:
frame.head()

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


# 
If you specify a sequence of columns, the DataFrame’s columns will be arranged in
that order

In [4]:
import pandas as pd
pd.DataFrame(data, columns=['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


# 
If you pass a column that isn’t contained in the dict, it will appear with missing values
in the result

In [5]:
frame2 = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
       index=['one', 'two', 'three', 'four',
       'five', 'six'])
frame2 

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [6]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#  Column Retrive
A column in a DataFrame can be retrieved as a Series either by dict-like notation or
by attribute

# 
frame2[column] works for any column name, but frame2.column
only works when the column name is a valid Python variable
name

In [7]:
frame2['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [8]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

#  Row Retrive
Rows can also be retrieved by position or name with the special " loc " attribute 

In [9]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

#  Column Assigned 
Columns can be modified by assignment. For example, the empty 'debt' column
could be assigned a scalar value or an array of values

In [12]:
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [17]:
import numpy as np
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


#  Inserting missing values
When you are assigning lists or arrays to a column, the value’s length must match the
length of the DataFrame. If you assign a Series, its labels will be realigned exactly to
the DataFrame’s index, inserting missing values in any holes

In [18]:
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


#  Creat new column
Assigning a column that doesn’t exist will create a new column.
Add a new column of boolean values where the state
column equals 'Ohio'

#  Note :
New columns cannot be created with the frame2.eastern syntax.

In [19]:
frame2['eastern'] = frame2.state == 'Ohio'
frame2

Unnamed: 0,year,state,pop,debt,eastern
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,-1.2,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,-1.5,False
five,2002,Nevada,2.9,-1.7,False
six,2003,Nevada,3.2,,False


#  Del
The del method can then be used to remove this column

In [20]:
del frame2['eastern']
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

#  Note for above :
The column returned from indexing a DataFrame is a view on the
underlying data, not a copy. Thus, any in-place modifications to the
Series will be reflected in the DataFrame. The column can be
explicitly copied with the Series’s copy method

# 
Another common form of data is a nested dict of dicts.
If the nested dict is passed to the DataFrame, pandas will interpret the outer dict keys
as the columns and the inner keys as the row indices

In [21]:
pop = {'Nevada': {2001: 2.4, 2002: 2.9},
       'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6}}
frame3 = pd.DataFrame(pop)
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


#  Transpose
You can transpose the DataFrame (swap rows and columns) with similar syntax to a
NumPy array

In [22]:
frame3.T

Unnamed: 0,2000,2001,2002
Nevada,,2.4,2.9
Ohio,1.5,1.7,3.6


# 
The keys in the inner dicts are combined and sorted to form the index in the result.
This isn’t true if an explicit index is specified

In [23]:
pd.DataFrame(pop, index=[2001, 2002, 2003])

AttributeError: 'list' object has no attribute 'astype'

# 
Dicts of Series are treated in much the same way

In [25]:
frame3

Unnamed: 0,Nevada,Ohio
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


In [24]:
pdata = {'Ohio': frame3['Ohio'][:-1],
       'Nevada': frame3['Nevada'][:2]}
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2000,1.5,
2001,1.7,2.4


#  Column and Index name
If a DataFrame’s index and columns have their name attributes set, these will also be
displayed

In [26]:
frame3.index.name = 'year'; frame3.columns.name = 'state'
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2000,,1.5
2001,2.4,1.7
2002,2.9,3.6


# 
As with Series, the values attribute returns the data contained in the DataFrame as a
two-dimensional ndarray

In [27]:
frame3.values

array([[nan, 1.5],
       [2.4, 1.7],
       [2.9, 3.6]])

# 
If the DataFrame’s columns are different dtypes, the dtype of the values array will be
chosen to accommodate all of the columns

In [28]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [29]:
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, -1.2],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, -1.5],
       [2002, 'Nevada', 2.9, -1.7],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

#  Index Objects

# 
pandas’s Index objects are responsible for holding the axis labels and other metadata
(like the axis name or names). Any array or other sequence of labels you use when
constructing a Series or DataFrame is internally converted to an Index

In [30]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [31]:
 index[1:]

Index(['b', 'c'], dtype='object')

# 
Index objects are immutable and thus can’t be modified by the user

In [32]:
index[1] = 'd' # TypeError

TypeError: Index does not support mutable operations

In [33]:
labels = pd.Index(np.arange(3))
labels

Int64Index([0, 1, 2], dtype='int64')

In [34]:
obj2 = pd.Series([1.5, -2.5, 0], index=labels)
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [35]:
obj2.index is labels

True

# 
Unlike Python sets, a pandas Index can contain duplicate labels

In [36]:
dup_labels = pd.Index(['foo', 'foo', 'bar', 'bar'])
dup_labels


Index(['foo', 'foo', 'bar', 'bar'], dtype='object')

# 
Selections with duplicate labels will select all occurrences of that label.
Each Index has a number of methods and properties for set logic, which answer other
common questions about the data it contains. Some useful ones are summarized Selections with duplicate labels will select all occurrences of that label.
Each Index has a number of methods and properties for set logic, which answer other
common questions about the data it contains. Some useful ones are summarized 

# 
 Method                     Description
append           Concatenate with additional Index objects, producing a new Index
difference       Compute set difference as an Index
intersection     Compute set intersection
union            Compute set union
isin             Compute boolean array indicating whether each value is contained in the passed collection
delete           Compute new Index with element at index i deleted
drop             Compute new Index by deleting passed values
insert           Compute new Index by inserting element at index i
is_monotonic     Returns True if each element is greater than or equal to the previous element
is_unique        Returns True if the Index has no duplicate values
unique           Compute the array of unique values in the Index

#   Essential Functionalities

#  1. Reindexing
An important method on pandas objects is reindex, which means to create a new
object with the data conformed to a new index. Consider an example

In [37]:
obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [38]:
obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

#  ffill   Method
For ordered data like time series, it may be desirable to do some interpolation or fill‐
ing of values when reindexing. The method option allows us to do this, using a
method such as ffill, which forward-fills the values

In [39]:
obj3 = pd.Series(['blue', 'purple', 'yellow'], index=[0, 2, 4])
obj3

0      blue
2    purple
4    yellow
dtype: object

In [40]:
 obj3.reindex(range(6), method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

#  Reindex Row
With DataFrame, reindex can alter either the (row) index, columns, or both. When
passed only a sequence, it reindexes the rows in the result

In [41]:
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
       index=['a', 'c', 'd'],
       columns=['Ohio', 'Texas', 'California'])
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [42]:
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


#  Reindex Column

In [43]:
states = ['Texas', 'Utah', 'California']
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


#  Loc

In [44]:
frame.loc[['a', 'b', 'c', 'd'], states]

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  """Entry point for launching an IPython kernel.


Unnamed: 0,Texas,Utah,California
a,1.0,,2.0
b,,,
c,4.0,,5.0
d,7.0,,8.0


#  Reindex function arguments
 
 Argument                     Description
index                    New sequence to use as index. Can be Index instance or any other sequence-like Python data structure.                            An Index will be used exactly as is without any copying.
method                   Interpolation (fill) method; 'ffill' fills forward, while 'bfill' fills backward.
fill_value               Substitute value to use when introducing missing data by reindexing.
limit                    When forward- or backfilling, maximum size gap (in number of elements) to fill.
tolerance                When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact                                matches.
level                    Match simple Index on level of MultiIndex; otherwise select subset of.
copy                     If True, always copy underlying data even if new index is equivalent to old index; if False, do not                              copy the data when the indexes are equivalent.

# 
  Argument                              Description
index               New sequence to use as index. Can be Index instance or any other sequence-like Python data structure. An
                    Index will be used exactly as is without any copying.
method              Interpolation (fill) method; 'ffill' fills forward, while 'bfill' fills backward.
fill_value          Substitute value to use when introducing missing data by reindexing.
limit               When forward- or backfilling, maximum size gap (in number of elements) to fill.
tolerance           When forward- or backfilling, maximum size gap (in absolute numeric distance) to fill for inexact matches.
level               Match simple Index on level of MultiIndex; otherwise select subset of.
copy                If True, always copy underlying data even if new index is equivalent to old index; if False, do not copy
                    the data when the indexes are equivalent

#  Dropping Entries from an Axis

# 
Dropping one or more entries from an axis is easy if you already have an index array
or list without those entries. As that can require a bit of munging and set logic, the
drop method will return a new object with the indicated value or values deleted from
an axis.


In [1]:
import pandas as pd
import numpy as np
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [2]:
new_obj = obj.drop('c')
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [3]:
obj.drop(['d', 'c'])

a    0.0
b    1.0
e    4.0
dtype: float64

# 
With DataFrame, index values can be deleted from either axis. To illustrate this, we
first create an example DataFrame

In [4]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=['Ohio', 'Colorado', 'Utah', 'New York'],
        columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#  Row drop
Calling drop with a sequence of labels will drop values from the row labels (axis 0)

In [5]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


#  Column drop
You can drop values from the columns by passing axis=1 or axis='columns'

In [6]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [7]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


# 
Many functions, like drop, which modify the size or shape of a Series or DataFrame,
can manipulate an object in-place without returning a new object

#  Note : ***
Be careful with the inplace, as it destroys any data that is dropped.

In [None]:
obj.drop('c', inplace=True)

#  Indexing, Selection, and Filtering

# 
Series indexing (obj[...]) works analogously to NumPy array indexing, except you
can use the Series’s index values instead of only integers.

In [8]:
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64

In [9]:
obj['b']

1.0

In [10]:
obj[1]

1.0

In [11]:
obj[2:4]

c    2.0
d    3.0
dtype: float64

In [12]:
obj[['b', 'a', 'd']]

b    1.0
a    0.0
d    3.0
dtype: float64

In [13]:
obj[[1, 3]]

b    1.0
d    3.0
dtype: float64

In [14]:
obj[obj < 2]

a    0.0
b    1.0
dtype: float64

#  Note ( important ) :
Slicing with labels behaves differently than normal Python slicing in that the end‐
point is inclusive

In [15]:
obj['b':'c']

b    1.0
c    2.0
dtype: float64

#  
Setting using these methods modifies the corresponding section of the Series

In [18]:
obj['b':'c'] = 5
obj

a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64

# 
Indexing into a DataFrame is for retrieving one or more columns either with a single
value or sequence

In [19]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
        index=['Ohio', 'Colorado', 'Utah', 'New York'],
        columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [20]:
data['two']

Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32

In [21]:
data[['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


# 
Indexing like this has a few special cases. First, slicing or selecting data with a boolean
array

In [22]:
data[:2]

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7


In [26]:
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


# 
Another use case is in indexing with a boolean DataFrame, such as one produced by a
scalar comparison

In [27]:
data < 5

Unnamed: 0,one,two,three,four
Ohio,True,True,True,True
Colorado,True,False,False,False
Utah,False,False,False,False
New York,False,False,False,False


#  Note : ( for excel or table )
we can take data less then and greater then   ???????????????????????????????

In [41]:
data[data < 5] = 0
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


#  Selection with loc and iloc
For DataFrame label-indexing on the rows, I introduce the special indexing operators
loc and iloc. They enable you to select a subset of the rows and columns from a
DataFrame with NumPy-like notation using either axis labels (loc) or integers
(iloc).

#  Select a single Row and  " multiple "  Columns by " Label " using " loc "

In [42]:
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [43]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

#   Similar selections with integers using " iloc "
2 is row index no. and [3 , 0 , 1] are column index no.

In [44]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

# 
Row with labeles 

In [45]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

# 
Rows with Columns lables

In [46]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


#  Both indexing functions work with slices in addition to single labels or lists of labels

In [47]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [48]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


#   Indexing options with DataFrame

In [None]:
  Type                             Notes
df[val]                      Select single column or sequence of columns from the DataFrame; special case
                             conveniences: boolean array (filter rows), slice (slice rows), or boolean DataFrame
                             (set values based on some criterion)
df.loc[val]                  Selects single row or subset of rows from the DataFrame by label
df.loc[:, val]               Selects single column or subset of columns by label
df.loc[val1, val2]           Select both rows and columns by label
df.iloc[where]               Selects single row or subset of rows from the DataFrame by integer position
df.iloc[:, where]            Selects single column or subset of columns by integer position
df.iloc[where_i, where_j]    Select both rows and columns by integer position
df.at[label_i, label_j]      Select a single scalar value by row and column label
df.iat[i, j]                 Select a single scalar value by row and column position (integers)
reindex method               Select either rows or columns by labels
get_value, set_value methods - Select single value by row and column label

#  Integer Indexes

In [5]:
import pandas as pd
import numpy as np
ser = pd.Series(np.arange(3.), index=['a', 'b', 'c'])
ser

a    0.0
b    1.0
c    2.0
dtype: float64

In [6]:
ser[-1]

2.0

In [7]:
ser[:1]

a    0.0
dtype: float64

# Arithmetic and Data Alignment

# 
An important pandas feature for some applications is the behavior of arithmetic
between objects with different indexes. When you are adding together objects, if any
index pairs are not the same, the respective index in the result will be the union of the
index pairs. For users with database experience, this is similar to an automatic outer
join on the index labels.

In [9]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([-2.1, 3.6, -1.5, 4, 3.1],
         index=['a', 'c', 'e', 'f', 'g'])
s1

a    7.3
c   -2.5
d    3.4
e    1.5
dtype: float64

In [10]:
s2

a   -2.1
c    3.6
e   -1.5
f    4.0
g    3.1
dtype: float64

In [11]:
s1 + s2

a    5.2
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

# 
In the case of DataFrame, alignment is performed on both the rows and the columns

In [12]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
        index=['Ohio', 'Texas', 'Colorado'])
df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
        index=['Utah', 'Ohio', 'Texas', 'Oregon'])
df1

Unnamed: 0,b,c,d
Ohio,0.0,1.0,2.0
Texas,3.0,4.0,5.0
Colorado,6.0,7.0,8.0


In [13]:
df2

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [14]:
df1 + df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,3.0,,6.0,
Oregon,,,,
Texas,9.0,,12.0,
Utah,,,,


In [15]:
df1 - df2

Unnamed: 0,b,c,d,e
Colorado,,,,
Ohio,-3.0,,-2.0,
Oregon,,,,
Texas,-3.0,,-2.0,
Utah,,,,


# 
If you add DataFrame objects with no column or row labels in common, the result
will contain all nullsIf you add DataFrame objects with no column or row labels in common, the result
will contain all nulls

In [16]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})
df1

Unnamed: 0,A
0,1
1,2


In [17]:
df2

Unnamed: 0,B
0,3
1,4


In [18]:
df1 + df2

Unnamed: 0,A,B
0,,
1,,


#  Arithmetic methods with fill values

# 
In arithmetic operations between differently indexed objects, you might want to fill
with a special value, like 0, when an axis label is found in one object but not the other

In [19]:
df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)),
        columns=list('abcd'))
df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)),
        columns=list('abcde'))
df2.loc[1, 'b'] = np.nan
df1

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [21]:
df2     # why NaN without loc ???

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [22]:
df2

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,4.0
1,5.0,,7.0,8.0,9.0
2,10.0,11.0,12.0,13.0,14.0
3,15.0,16.0,17.0,18.0,19.0


In [23]:
df1 + df2

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,
1,9.0,,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


#  Note important  : fill_value retrive remaing column's elements
Using the add method on df1, pass df2 and an argument to fill_value

In [24]:
df1.add(df2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,2.0,4.0,6.0,4.0
1,9.0,5.0,13.0,15.0,9.0
2,18.0,20.0,22.0,24.0,14.0
3,15.0,16.0,17.0,18.0,19.0


#  Operations 

In [25]:
1 / df1

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


# 
same result with different command see below list of commands

In [26]:
df1.rdiv(1) 

Unnamed: 0,a,b,c,d
0,inf,1.0,0.5,0.333333
1,0.25,0.2,0.166667,0.142857
2,0.125,0.111111,0.1,0.090909


# 
when reindexing a Series or DataFrame, you can also specify a different fill
value:

In [27]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,0.0,1.0,2.0,3.0,0
1,4.0,5.0,6.0,7.0,0
2,8.0,9.0,10.0,11.0,0


#   Flexible arithmetic methods

# 
 Method                      Description
add, radd               Methods for addition (+)
sub, rsub               Methods for subtraction (-)
div, rdiv               Methods for division (/)
floordiv, rfloordiv     Methods for oor division (//)
mul, rmul               Methods for multiplication (*)
pow, rpow               Methods for exponentiation (**)

# Operations between DataFrame and Series 

# 
As with NumPy arrays of different dimensions, arithmetic between DataFrame and
Series is also defined. Example of NumPy

In [28]:
arr = np.arange(12.).reshape((3, 4))
arr

array([[ 0.,  1.,  2.,  3.],
       [ 4.,  5.,  6.,  7.],
       [ 8.,  9., 10., 11.]])

In [29]:
arr[0]

array([0., 1., 2., 3.])

# 
When we subtract arr[0] from arr, the subtraction is performed once for each row

In [30]:
arr - arr[0]

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

# 
 Operations between a DataFrame and a Series are similar

In [31]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
         columns=list('bde'),
         index=['Utah', 'Ohio', 'Texas', 'Oregon'])
series = frame.iloc[0]
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [32]:
series

b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64

# 
By default, arithmetic between DataFrame and Series matches the index of the Series
on the DataFrame’s columns, broadcasting down the rows

In [33]:
frame - series

Unnamed: 0,b,d,e
Utah,0.0,0.0,0.0
Ohio,3.0,3.0,3.0
Texas,6.0,6.0,6.0
Oregon,9.0,9.0,9.0


# 
If an index value is not found in either the DataFrame’s columns or the Series’s index,
the objects will be reindexed to form the union

In [35]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
series2

b    0
e    1
f    2
dtype: int64

In [36]:
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


# 
If you want to instead broadcast over the columns, matching on the rows, you have to
use one of the arithmetic methods.

In [37]:
frame

Unnamed: 0,b,d,e
Utah,0.0,1.0,2.0
Ohio,3.0,4.0,5.0
Texas,6.0,7.0,8.0
Oregon,9.0,10.0,11.0


In [39]:
series3 = frame['d']
series3

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64

# 
The axis number that you pass is the axis to match on. In this case we mean to match
on the DataFrame’s row index (axis='index' or axis=0) and broadcast across

In [40]:
frame.sub(series3, axis='index')

Unnamed: 0,b,d,e
Utah,-1.0,0.0,1.0
Ohio,-1.0,0.0,1.0
Texas,-1.0,0.0,1.0
Oregon,-1.0,0.0,1.0


#  Function Application and Mapping

# 
NumPy ufuncs (element-wise array methods) also work with pandas objects

In [3]:
import pandas as pd
import numpy as np
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
        index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame

Unnamed: 0,b,d,e
Utah,0.739907,0.019258,-2.480665
Ohio,0.337787,0.231308,-0.906217
Texas,-0.599596,-0.340378,-0.840267
Oregon,0.566871,-0.756557,0.993565


# 
   Absolute ----- ( abs ) means all positve 

In [4]:
np.abs(frame)

Unnamed: 0,b,d,e
Utah,0.739907,0.019258,2.480665
Ohio,0.337787,0.231308,0.906217
Texas,0.599596,0.340378,0.840267
Oregon,0.566871,0.756557,0.993565


# 
Another frequent operation is applying a function on one-dimensional arrays to each
column or row. DataFrame’s apply method does exactly this.
Here the function f, which computes the difference between the maximum and mini‐
mum of a Series, is invoked once on each column in frame. The result is a Series hav‐
ing the columns of frame as its index.

In [5]:
f = lambda x: x.max() - x.min()
frame.apply(f)

b    1.339503
d    0.987865
e    3.474231
dtype: float64

# 
If you pass axis='columns' to apply, the function will be invoked once per row
instead

In [6]:
frame.apply(f, axis='columns')

Utah      3.220573
Ohio      1.244003
Texas     0.499889
Oregon    1.750122
dtype: float64

# 
The function passed to apply need not return a scalar value; it can also return a Series
with multiple values

In [7]:
def f(x):
        return pd.Series([x.min(), x.max()], index=['min', 'max'])
frame.apply(f)

Unnamed: 0,b,d,e
min,-0.599596,-0.756557,-2.480665
max,0.739907,0.231308,0.993565


# 
Element-wise Python functions can be used, too. Suppose you wanted to compute a
formatted string from each floating-point value in frame. You can do this with apply
map

In [8]:
import pandas as pd
import numpy as np
format = lambda x: '%.2f' % x
frame.applymap(format)

Unnamed: 0,b,d,e
Utah,0.74,0.02,-2.48
Ohio,0.34,0.23,-0.91
Texas,-0.6,-0.34,-0.84
Oregon,0.57,-0.76,0.99


# 
The reason for the name applymap is that Series has a map method for applying an
element-wise function

In [9]:
frame['e'].map(format)

Utah      -2.48
Ohio      -0.91
Texas     -0.84
Oregon     0.99
Name: e, dtype: object

#  Sorting and Ranking

# 
Sorting a dataset by some criterion is another important built-in operation. To sort
lexicographically by row or column index, use the sort_index method, which returns
a new, sorted object

In [12]:
obj = pd.Series(range(4), index=['d', 'a', 'c', 'b'])
obj.sort_index()

a    1
b    3
c    2
d    0
dtype: int64

# 
With a DataFrame, you can sort by index on either axis

In [13]:
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
        index=['three', 'one'],
        columns=['d', 'a', 'b', 'c'])
frame.sort_index()

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [14]:
frame.sort_index(axis=1)

Unnamed: 0,a,b,c,d
three,1,2,3,0
one,5,6,7,4


# 
The data is sorted in ascending order by default, but can be sorted in descending
order, too

In [15]:
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


# 
To sort a Series by its values, use its sort_values method ( lowest to highest )

In [16]:
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

2   -3
3    2
0    4
1    7
dtype: int64

# 
Any missing values are sorted to the end of the Series by default

In [17]:
obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

# 
When sorting a DataFrame, you can use the data in one or more columns as the sort
keys. To do so, pass one or more column names to the by option of sort_values

In [3]:
import pandas as pd
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
frame

Unnamed: 0,b,a
0,4,0
1,7,1
2,-3,0
3,2,1


# 
 Sort single colom

In [19]:
 frame.sort_values(by='b')

Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


# 
To sort by multiple columns, pass a list of names

In [4]:
frame.sort_values(by=['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1
