# Data Frame Notes

In [1]:
import numpy as np
import pandas as pd

In [2]:
data = {'states':['Himachal Pradesh', 'Uttarakhand', 'Uttar Pradesh', 'Haryana', 'Maharashtra', 'Rajasthan'],
       'year':[2000, 2001, 2002, 2001, 2002, 2003],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [3]:
frame = pd.DataFrame(data)

In [4]:
# The resulting DataFrame will have its index assigned automatically as with series, and the columns are placed in sorted order

frame

Unnamed: 0,states,year,pop
0,Himachal Pradesh,2000,1.5
1,Uttarakhand,2001,1.7
2,Uttar Pradesh,2002,3.6
3,Haryana,2001,2.4
4,Maharashtra,2002,2.9
5,Rajasthan,2003,3.2


In [5]:
# head method

frame.head()

Unnamed: 0,states,year,pop
0,Himachal Pradesh,2000,1.5
1,Uttarakhand,2001,1.7
2,Uttar Pradesh,2002,3.6
3,Haryana,2001,2.4
4,Maharashtra,2002,2.9


In [6]:
frame.head(2)

Unnamed: 0,states,year,pop
0,Himachal Pradesh,2000,1.5
1,Uttarakhand,2001,1.7


In [7]:
# tail method

frame.tail()

Unnamed: 0,states,year,pop
1,Uttarakhand,2001,1.7
2,Uttar Pradesh,2002,3.6
3,Haryana,2001,2.4
4,Maharashtra,2002,2.9
5,Rajasthan,2003,3.2


In [8]:
frame.tail(3)

Unnamed: 0,states,year,pop
3,Haryana,2001,2.4
4,Maharashtra,2002,2.9
5,Rajasthan,2003,3.2


In [9]:
# If you specify a sequence of columns, the DataFrame's will be arranged in that order

pd.DataFrame(data, columns=['year', 'states', 'pop'])

Unnamed: 0,year,states,pop
0,2000,Himachal Pradesh,1.5
1,2001,Uttarakhand,1.7
2,2002,Uttar Pradesh,3.6
3,2001,Haryana,2.4
4,2002,Maharashtra,2.9
5,2003,Rajasthan,3.2


In [10]:
pd.DataFrame(data, columns=['states', 'year', 'pop'])

Unnamed: 0,states,year,pop
0,Himachal Pradesh,2000,1.5
1,Uttarakhand,2001,1.7
2,Uttar Pradesh,2002,3.6
3,Haryana,2001,2.4
4,Maharashtra,2002,2.9
5,Rajasthan,2003,3.2


In [11]:
pd.DataFrame(data, columns=['states', 'pop', 'year'])

Unnamed: 0,states,pop,year
0,Himachal Pradesh,1.5,2000
1,Uttarakhand,1.7,2001
2,Uttar Pradesh,3.6,2002
3,Haryana,2.4,2001
4,Maharashtra,2.9,2002
5,Rajasthan,3.2,2003


In [12]:
pd.DataFrame(data, columns=['pop', 'year', 'states'])

Unnamed: 0,pop,year,states
0,1.5,2000,Himachal Pradesh
1,1.7,2001,Uttarakhand
2,3.6,2002,Uttar Pradesh
3,2.4,2001,Haryana
4,2.9,2002,Maharashtra
5,3.2,2003,Rajasthan


In [13]:
pd.DataFrame(data, columns=['pop', 'states', 'year'])

Unnamed: 0,pop,states,year
0,1.5,Himachal Pradesh,2000
1,1.7,Uttarakhand,2001
2,3.6,Uttar Pradesh,2002
3,2.4,Haryana,2001
4,2.9,Maharashtra,2002
5,3.2,Rajasthan,2003


In [14]:
# If you pass a column that isn't contained in the dict, it will appear with missing values in the result

frame2 = pd.DataFrame(data, columns=['year','states','pop','debt'], index=['one','two','three','four','five','six'])

In [15]:
frame2

Unnamed: 0,year,states,pop,debt
one,2000,Himachal Pradesh,1.5,
two,2001,Uttarakhand,1.7,
three,2002,Uttar Pradesh,3.6,
four,2001,Haryana,2.4,
five,2002,Maharashtra,2.9,
six,2003,Rajasthan,3.2,


In [16]:
frame2.columns

Index(['year', 'states', 'pop', 'debt'], dtype='object')

In [17]:
# A column in a DataFrame can be reterived as a series either by dict like notation or by attribute

frame2['states']

one      Himachal Pradesh
two           Uttarakhand
three       Uttar Pradesh
four              Haryana
five          Maharashtra
six             Rajasthan
Name: states, dtype: object

In [18]:
frame2.year

one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64

In [19]:
frame2.pop

<bound method DataFrame.pop of        year            states  pop debt
one    2000  Himachal Pradesh  1.5  NaN
two    2001       Uttarakhand  1.7  NaN
three  2002     Uttar Pradesh  3.6  NaN
four   2001           Haryana  2.4  NaN
five   2002       Maharashtra  2.9  NaN
six    2003         Rajasthan  3.2  NaN>

In [20]:
# rows can be also be reterived by position or name with the special loc attribute

frame2.loc['three']

year               2002
states    Uttar Pradesh
pop                 3.6
debt                NaN
Name: three, dtype: object

In [21]:
# columns can be modified by assignment. For exapmle, the empty 'debt' column could be assigned a scalar value or any array of values

frame2['debt'] = 16.5

In [22]:
frame2

Unnamed: 0,year,states,pop,debt
one,2000,Himachal Pradesh,1.5,16.5
two,2001,Uttarakhand,1.7,16.5
three,2002,Uttar Pradesh,3.6,16.5
four,2001,Haryana,2.4,16.5
five,2002,Maharashtra,2.9,16.5
six,2003,Rajasthan,3.2,16.5


In [23]:
frame2['debt'] = np.arange(6.)

In [24]:
frame2

Unnamed: 0,year,states,pop,debt
one,2000,Himachal Pradesh,1.5,0.0
two,2001,Uttarakhand,1.7,1.0
three,2002,Uttar Pradesh,3.6,2.0
four,2001,Haryana,2.4,3.0
five,2002,Maharashtra,2.9,4.0
six,2003,Rajasthan,3.2,5.0


In [25]:
# when you are assigining lists or arrays to a column, the value's length must match the length of the DataFrame.
# If you assign a series, its labels will be religned exactly to the DataFrame's index, inserting missing values in any holes

val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])

In [26]:
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [27]:
frame2['debt'] = val

In [28]:
frame2

Unnamed: 0,year,states,pop,debt
one,2000,Himachal Pradesh,1.5,
two,2001,Uttarakhand,1.7,-1.2
three,2002,Uttar Pradesh,3.6,
four,2001,Haryana,2.4,-1.5
five,2002,Maharashtra,2.9,-1.7
six,2003,Rajasthan,3.2,


In [29]:
data2 = {'states':['Himachal Pradesh', 'Himachal Pradesh', 'Himachal Pradesh', 'Haryana', 'Maharashtra', 'Rajasthan'],
       'year':[2000, 2001, 2002, 2001, 2002, 2003],
       'pop':[1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

In [30]:
frame3 = pd.DataFrame(data2, index=['one','two','three','four','five','six'])

In [31]:
frame3

Unnamed: 0,states,year,pop
one,Himachal Pradesh,2000,1.5
two,Himachal Pradesh,2001,1.7
three,Himachal Pradesh,2002,3.6
four,Haryana,2001,2.4
five,Maharashtra,2002,2.9
six,Rajasthan,2003,3.2


In [32]:
# Assigning a column that doesn't exist will create a new column. The del keyword will delete columns as with a dict
# As an example of del, i first add a new column of boolean values where the state column equal to 'Himachal Pradesh'

frame3['eastern'] = frame3.states=='Himachal Pradesh'

In [33]:
frame3

Unnamed: 0,states,year,pop,eastern
one,Himachal Pradesh,2000,1.5,True
two,Himachal Pradesh,2001,1.7,True
three,Himachal Pradesh,2002,3.6,True
four,Haryana,2001,2.4,False
five,Maharashtra,2002,2.9,False
six,Rajasthan,2003,3.2,False


In [34]:
# NOTE: New column cannot be created with frame3.eastern syntax
# The del method can than be used to remove this column

del frame3['eastern']

In [35]:
frame3

Unnamed: 0,states,year,pop
one,Himachal Pradesh,2000,1.5
two,Himachal Pradesh,2001,1.7
three,Himachal Pradesh,2002,3.6
four,Haryana,2001,2.4
five,Maharashtra,2002,2.9
six,Rajasthan,2003,3.2


In [36]:
frame3.columns

Index(['states', 'year', 'pop'], dtype='object')

In [37]:
# The common returned from indexing a DataFrame is a view on the underlying data not a copy
# NOTE: Thus, in any modifications to series will be reflected in DataFrame
# The column can be explicitly copied with the series's copy method

# Another common form of data is nested dict of dicts

pop = {'Bugatti': {2001: 2.4, 2002: 2.9},
      'Bently': {2000:1.5, 2001: 1.7, 2002: 3.6}}

In [38]:
# If the nested dict is passed to the DataFrame, Pandas will interpret the outer dict keys as the columns and the inner keys as the row indices

frame4 = pd.DataFrame(pop)

In [39]:
frame4

Unnamed: 0,Bugatti,Bently
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [40]:
# You can transpose the Dataframe, with similar syntax to a Numpy array

frame4.T

Unnamed: 0,2001,2002,2000
Bugatti,2.4,2.9,
Bently,1.7,3.6,1.5


In [41]:
# The keys of inner dicts are combined and sorted to form the index in the result
# This isn't true if an explicit index is specified 

pd.DataFrame(pop, index=[2001, 2002, 2003])

Unnamed: 0,Bugatti,Bently
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [42]:
# Dict of series are treated in much the same way

pdata={'Bugatti': frame4['Bugatti'][:-1],
      'Bently':frame4['Bently'][:2]}

In [43]:
pd.DataFrame(pdata)

Unnamed: 0,Bugatti,Bently
2001,2.4,1.7
2002,2.9,3.6


In [44]:
# If a Dataframe's index and columns have their name attribute set, these wil also be displayed

frame4.index.name='year'; frame4.columns.name='car'

In [45]:
frame4

car,Bugatti,Bently
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [46]:
# As with series, the values attribute returns the data contained in the DataFrame as a two-dimensional ndarray

frame4.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [47]:
# If the DataFrame's columns are different dtypes, the dtype of the values array will be chosen to accomodate all of the columns

frame2.values

array([[2000, 'Himachal Pradesh', 1.5, nan],
       [2001, 'Uttarakhand', 1.7, -1.2],
       [2002, 'Uttar Pradesh', 3.6, nan],
       [2001, 'Haryana', 2.4, -1.5],
       [2002, 'Maharashtra', 2.9, -1.7],
       [2003, 'Rajasthan', 3.2, nan]], dtype=object)

# Index Objects

In [48]:
obj = pd.Series(range(3), index=['a','b','c'])

In [49]:
index = obj.index

In [50]:
index

Index(['a', 'b', 'c'], dtype='object')

In [51]:
index[1:]

Index(['b', 'c'], dtype='object')

In [52]:
# Index objects are immutable and thus can't be modified by the user

# index[1] = 'd' # TypeError

In [53]:
labels = pd.Index(np.arange(3))

In [54]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [55]:
obj2 = pd.Series([100, 200, -300, 0, 7.7])

In [56]:
obj2

0    100.0
1    200.0
2   -300.0
3      0.0
4      7.7
dtype: float64

In [57]:
obj2.index is labels

False

In [58]:
# In addition to being array-like, an Index also behaves like a fixed-size set

frame4

car,Bugatti,Bently
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [59]:
frame4.columns

Index(['Bugatti', 'Bently'], dtype='object', name='car')

In [60]:
'Bugatti' in frame4.index

False

In [61]:
# Unlike python sets, a pandas Index can contain duplicate labels

dup_labels = pd.Index(['anshu', 'sneha', 'samiksha', 'professor'])

In [62]:
dup_labels

Index(['anshu', 'sneha', 'samiksha', 'professor'], dtype='object')

# Some Index Methods and Properties

In [63]:
# Method: append
# Description: Concatenate with additional Index objects, producing a new Index

index1 = pd.Index([1, 2, 3, 4])
index2 = pd.Index([5, 6, 7, 8])

In [64]:
index3 = index1.append(index2)

In [65]:
index3

Int64Index([1, 2, 3, 4, 5, 6, 7, 8], dtype='int64')

In [66]:
index4 = pd.Index([10, 11, 12, 13])

In [67]:
index5 = index3.append(index4)

In [68]:
index5

Int64Index([1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13], dtype='int64')

In [69]:
# Method: difference
# Description: Compute set difference as an Index

index1 = pd.Index(['A', 'B', 'C', 'D'])
index2 = pd.Index(['C', 'D', 'E', 'F'])

In [70]:
index3 = index1.difference(index2)

In [71]:
index3

Index(['A', 'B'], dtype='object')

In [72]:
# You can also use the 'difference' method with other pandas objects, such as Series or DataFrame

series = pd.Series([10, 20, 30, 40], index = ['A', 'B', 'C', 'D'])

In [73]:
index4 = series.index.difference(pd.Index(['C', 'D', 'E']))

In [74]:
index4

Index(['A', 'B'], dtype='object')

In [75]:
# Method:- Intersection
# Description:- Computer set intersection

index1 = pd.Index([1, 2, 3, 4])
index2 = pd.Index([3, 4, 5, 6])

In [76]:
index3 = index1.intersection(index2)

In [77]:
index3

Int64Index([3, 4], dtype='int64')

In [78]:
# You can also use the 'intersection' method with other pandas objects, such as Series or DataFrame

series = pd.Series([10, 20, 30, 40], index = ['A', 'B', 'C', 'D'])

In [79]:
intersection_index = series.index.intersection(pd.Index(['C', 'D', 'E']))

In [80]:
intersection_index

Index(['C', 'D'], dtype='object')

In [81]:
# Method:- union
# Description:- compute set union

index1 = pd.Index(['Anshu', 'Sneha', 'Professor', 'Harshit'])
index2 = pd.Index(['Harshit', 'Shreya', 'Samiksha', 'Sanya'])

In [82]:
index3 = index1.union(index2)

In [83]:
index1

Index(['Anshu', 'Sneha', 'Professor', 'Harshit'], dtype='object')

In [84]:
# You can also use the 'union' method with other pandas objects, such as Series or DataFrame

series = pd.Series([10, 20, 30, 40], index = ['A', 'B', 'C', 'D'])

In [85]:
union_index = series.index.union(pd.Index(['C', 'D', 'E']))

In [86]:
union_index

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [87]:
# Method:- isin
# Description:- Compute boolean array indicating whether each value is contained in the passed collection

series = pd.Series([1, 2, 3, 4, 5])

In [88]:
isin_collection = series.isin([2, 4, 6])

In [89]:
isin_collection

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [90]:
print(isin_collection)

0    False
1     True
2    False
3     True
4    False
dtype: bool


In [91]:
# You can also use the isin method with an Index object

index = pd.Index(['A', 'B', 'C', 'D'])

In [92]:
ic = index.isin(['B', 'D', 'F'])

In [93]:
ic

array([False,  True, False,  True])

In [94]:
print(ic)

[False  True False  True]


In [95]:
# Method:- Delete
# Description:- Compute new Index with element at index i delete

index1 = pd.Index([1, 2, 3, 4, 5])

In [96]:
i = 2

In [97]:
new_index = index1.delete(i)

In [98]:
new_index

Int64Index([1, 2, 4, 5], dtype='int64')

In [99]:
# Method: drop
# Description: Compute new Index by deleting passed values

data = {'Name': ['Anshu', 'Sneha', 'Professor', 'Samiksha', 'Mayank'],
       'Age': [20, 19, 21, 18, 23],
       'City': ['Chandigarh', 'Gurgaon', 'Banglore', 'Delhi', 'Noida']}

In [100]:
df = pd.DataFrame(data)

In [101]:
df

Unnamed: 0,Name,Age,City
0,Anshu,20,Chandigarh
1,Sneha,19,Gurgaon
2,Professor,21,Banglore
3,Samiksha,18,Delhi
4,Mayank,23,Noida


In [102]:
indices_to_drop = [1, 3]

In [103]:
new_df = df.drop(indices_to_drop)

In [104]:
new_df

Unnamed: 0,Name,Age,City
0,Anshu,20,Chandigarh
2,Professor,21,Banglore
4,Mayank,23,Noida


In [105]:
values_to_drop = ['Mayank']

In [106]:
new_df = df.drop(df[df['Name'].isin(values_to_drop)].index)

In [107]:
new_df

Unnamed: 0,Name,Age,City
0,Anshu,20,Chandigarh
1,Sneha,19,Gurgaon
2,Professor,21,Banglore
3,Samiksha,18,Delhi


In [108]:
# Method: insert
# Description: Compute new Index by inserting element at index i

original_index = pd.Series([1, 2, 3, 4, 5])

In [109]:
new_element = 10
insertion_index = 2

In [110]:
new_index = original_index.copy().to_list()
new_index.insert(insertion_index, new_element)

In [111]:
new_index

[1, 2, 10, 3, 4, 5]

In [112]:
new_index = pd.Series(new_index)

In [113]:
new_index

0     1
1     2
2    10
3     3
4     4
5     5
dtype: int64

In [114]:
# Method: is_monotonic
# Description: Returns True if each element is greater than or equal to the previous element

index = pd.Series([1, 2, 3, 3, 4, 5])

In [115]:
is__monotonic = index.is_monotonic

  is__monotonic = index.is_monotonic


In [116]:
is__monotonic

True

In [117]:
print(is__monotonic)

True


In [118]:
index2 = pd.Series([1, 2, 3, 7, 4, 5])

In [119]:
is__monotonic = index2.is_monotonic

  is__monotonic = index2.is_monotonic


In [120]:
is__monotonic

False

In [121]:
print(is__monotonic)

False


In [122]:
# Method: is_unique
# Description: Returns True if the Index has no duplicate value

index = pd.Index([1, 2, 3, 4, 5])

In [123]:
is__unique = index.is_unique

In [124]:
is__unique

True

In [125]:
index2 = pd.Index([1, 2, 3, 4, 4])

In [126]:
is__unique = index2.is_unique

In [127]:
is__unique

False

In [128]:
# Method: unique
# Description: Compute the array of unique values in the Index

index = pd.Index([1, 2, 2, 3, 4, 4, 5])

In [129]:
unique_values = index.unique()

In [130]:
unique_values

Int64Index([1, 2, 3, 4, 5], dtype='int64')

In [131]:
index2 = pd.Index([1, 2, 3, 4, 5])

In [132]:
unique_values = index.unique()

In [133]:
unique_values

Int64Index([1, 2, 3, 4, 5], dtype='int64')