# Chained Assignment

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
import pandas as pd
import numpy as np

# Show the result of every expression statement in a cell, not only the last one.
# The multi-output cells in this notebook rely on this setting.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

# Handle to the running IPython shell; used further down to inspect loaded extensions.
from IPython import get_ipython
ipython = get_ipython()

# Draw matplotlib figures inline and ask the inline backend for SVG output.
%matplotlib inline
%config InlineBackend.figure_format = 'svg'

# Raise the pandas display limits to 100 rows / 100 columns.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)

# Make the project root (the parent of the current working directory)
# importable, so the shared code in the utils package can be imported.
import os
import sys

PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Append only when missing, so re-running the cell does not add duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Load the autoreload extension only once; re-running this cell skips the load.
if 'autoreload' not in ipython.extension_manager.loaded:
    %load_ext autoreload

# Mode 2: re-import edited modules automatically before running code.
%autoreload 2

# Create a dataframe

In [3]:
# 4x4 frame of single letters under two-level column labels:
# top level 'one'/'two', second level 'first'/'second'.
df = pd.DataFrame(
    [list(word) for word in ('abcd', 'efgh', 'ijkl', 'mnop')],
    columns=pd.MultiIndex.from_product([['one', 'two'], ['first', 'second']]),
)
df

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


## Chained indexing may return a copy or a view

In [4]:
# Chained indexing: df['one'] runs first and returns an intermediate frame,
# then ['second'] selects a column from that intermediate result.
df1=df['one']['second']
df1

0    b
1    f
2    j
3    n
Name: second, dtype: object

In [5]:
# Inspect pandas' private bookkeeping for df1 (underscore attributes, not public API).
df1._is_view
df1._is_copy
# Compare the numpy base buffers behind df and df1.
df.values.base
df1.values.base
# True in the recorded run: both report the same base array object.
df.values.base is df1.values.base

True

<weakref at 0x7eff10e31b30; to 'DataFrame' at 0x7eff580bf190>

array([['a', 'e', 'i', 'm'],
       ['b', 'f', 'j', 'n'],
       ['c', 'g', 'k', 'o'],
       ['d', 'h', 'l', 'p']], dtype=object)

array([['a', 'e', 'i', 'm'],
       ['b', 'f', 'j', 'n'],
       ['c', 'g', 'k', 'o'],
       ['d', 'h', 'l', 'p']], dtype=object)

True

### As long as you don't alter a chained index, there is no problem

In [73]:
# Read-only chained indexing: nothing is assigned, and the recorded output shows no warning.
df['one']['second'][3]

'n'

### Alter it, BOOM!

In [74]:
# Write through df1 (the result of an earlier chained lookup on df).
# In the recorded run pandas emits a "set on a copy of a slice" warning and the
# change does NOT appear in df, even though df1._is_view still reports True.
df1[0]=5
df1._is_view

df1
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1[0]=5


True

0    5
1    f
2    j
3    n
Name: second, dtype: object

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


In [75]:
# Chained assignment: this change does not stick either (df is unchanged in the recorded output).
df['one']['second'] = 3
# The statement above expands to:
#   df.__getitem__('one').__setitem__('second', 3)
# so the write lands on the intermediate object returned by __getitem__, not on df.
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['one']['second'] = 3


Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,b,c,d
1,e,f,g,h
2,i,j,k,l
3,m,n,o,p


## Fix the chained index

In [57]:
# This one works: a single .loc call with a (level-0, level-1) column tuple
# assigns directly on df, so the change sticks (see the recorded output).
df.loc[:,('one','second')]=9
df

Unnamed: 0_level_0,one,one,two,two
Unnamed: 0_level_1,first,second,first,second
0,a,9,c,d
1,e,9,g,h
2,i,9,k,l
3,m,9,o,p


# Copy or view?

In [76]:
# Small integer frame plus two derived selections whose view/copy status is examined next:
#   df2 - positional slice via .iloc (rows 0:2, all columns)
#   df3 - boolean-mask selection via .loc (rows where a == 1)
df = pd.DataFrame([[1,2,3,4],[5,6,7,8]], index = ['row1','row2'], 
        columns = ['a','b','c','d'])
df2 = df.iloc[0:2, :]
df3 = df.loc[df['a'] == 1, :]

df
df2
df3

Unnamed: 0,a,b,c,d
row1,1,2,3,4
row2,5,6,7,8


Unnamed: 0,a,b,c,d
row1,1,2,3,4
row2,5,6,7,8


Unnamed: 0,a,b,c,d
row1,1,2,3,4


In [26]:
# Each line prints "<name> <_is_view>, <_is_copy>" using pandas' private flags.

# df is neither copy nor view
print(f'df {df._is_view}, {df._is_copy}')

# df2 is a view AND a copy
print(f'df2 {df2._is_view}, {df2._is_copy}')

# df3 is not a view, but a copy
print(f'df3 {df3._is_view}, {df3._is_copy}')

# id() is useless here: df, df2 and df3 are three distinct Python objects,
# so their ids differ whether or not they share data
print(f'{id(df)}, {id(df2)}, {id(df3)}')

df False, None
df2 True, <weakref at 0x7f6b25160ea0; to 'DataFrame' at 0x7f6b246a9ac0>
df3 False, <weakref at 0x7f6b25160ea0; to 'DataFrame' at 0x7f6b246a9ac0>
140098149194432, 140098149192848, 140098151010752


In [59]:
# Changes stick: writing through the .iloc slice shows up in both df2 and df
# in the recorded output.
df2.iloc[0,0]=99
df2
df

Unnamed: 0,a,b,c,d
row1,99,2,3,4
row2,5,6,7,8


Unnamed: 0,a,b,c,d
row1,99,2,3,4
row2,5,6,7,8


In [77]:
# Assign a string into the integer column: this changes the column's dtype, not just one value.
# In the recorded output df2 shows 'a' while df is unchanged, i.e. the write no longer reaches df.
# NOTE(review): presumably pandas must re-allocate the column for the new dtype, which ends the sharing - confirm.
df2.iloc[0,0]='a'
df2
df

Unnamed: 0,a,b,c,d
row1,a,2,3,4
row2,5,6,7,8


Unnamed: 0,a,b,c,d
row1,1,2,3,4
row2,5,6,7,8


## ARRRGGGGHHH! This is awful (and it's been that way for years!).  <br><mark>The only way around it: force it to be a copy, then you always know what it is.</mark>

In [61]:
# An explicit .copy() gives a frame that reports neither flag:
# _is_view is False and _is_copy is None in the recorded output.
df4=df.copy()
print(f'df4 {df4._is_view}, {df4._is_copy}')
df4

df4 False, None


Unnamed: 0,a,b,c,d
row1,99,2,3,4
row2,5,6,7,8


# Chained indexing in 2 lines

In [78]:
def get_data():
    """Return a new 5-row demo DataFrame with columns A, B and C.

    A holds 0..4, B holds 10..14 and C holds 100..104. A fresh frame is
    built on every call, so callers can modify the result freely.
    """
    column_starts = (('A', 0), ('B', 10), ('C', 100))
    columns = {label: range(start, start + 5) for label, start in column_starts}
    return pd.DataFrame(columns)
# Fresh demo frame for the experiments in this section.
X = get_data()
X

# Reading through a chained index is harmless as long as nothing is assigned:
# X[X['B'] > 12]['C']
# Assigning through the chained index triggers the warning:
# X[X['B'] > 12]['C'] = 999
# A single .loc call with the row mask and the column label works:
# X.loc[X['B'] > 12, 'C'] = 999

Unnamed: 0,A,B,C
0,0,10,100
1,1,11,101
2,2,12,102
3,3,13,103
4,4,14,104


In [79]:
# Create a new DataFrame from a boolean-mask filter of X, then assign into it.
# pandas warns here (see the recorded output): temp was derived from a selection
# of X, so the assignment is flagged as a write to a possible copy.
# NOTE(review): taking an explicit copy, X.loc[X['C'] > 101].copy(), should state the intent and avoid the warning - confirm.
temp = X.loc[X['C'] > 101]
temp.loc[2, 'C'] = 999


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


## From stackoverflow
https://stackoverflow.com/questions/26879073/checking-whether-data-frame-is-copy-or-view-in-pandas


In [12]:
# Make two data frames that are views of the same data.
df = pd.DataFrame([[1,2,3,4],[5,6,7,8]], index = ['row1','row2'],
       columns = ['a','b','c','d'])
df2 = df.iloc[0:2,:]
display(df)

# Demonstrate they are views: a write through df is then read back through df2.
df.iloc[0,0] = 99
display(df)

print(f'df2.iloc[0,0] is {df2.iloc[0,0]}')

# Now try to compare the id of the values attribute.
# id() is not a reliable test either way: each .values access may hand back a
# temporary array object, so equal or different ids say nothing about shared data.
print(f'id(df.values)  is {id(df.values)}')
print(f'id(df2.values) is {id(df2.values)}')
print()
# pandas' private view/copy flags for both frames.
print(f'df2._is_view is {df2._is_view}')
print(f'df2._is_copy is {df2._is_copy}')
print(f'df._is_view is {df._is_view}')
# Report df's own flag (this line previously printed df2._is_copy under the df label).
print(f'df._is_copy is {df._is_copy}')
print()
# And we can of course compare df and df2: object identity, then the numpy base buffers.
print(f'df is df2 is {df is df2}')
print(f'df.values.base is df2.values.base is {df.values.base is df2.values.base}')



Unnamed: 0,a,b,c,d
row1,1,2,3,4
row2,5,6,7,8


Unnamed: 0,a,b,c,d
row1,99,2,3,4
row2,5,6,7,8


df2.iloc[0,0] is 99
id(df.values)  is 140620607509648
id(df2.values) is 140620607509648

df2._is_view is True


AttributeError: 'DataFrame' object has no attribute 'is_copy'