# Other bits and pieces of Pandas

Some other useful features within Pandas.

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Background info


zip() produces an iterator of tuples, pairing up the elements of one or more iterables (lists, arrays, ...) position by position.

In [None]:
# Demonstrate zip(): it pairs up the elements of two lists position by position

languages = ["Java", "Python", "JavaScript"]
versions = [14, 3, 6]

result = zip(languages, versions)

# unpacking the zip object into a list shows the pairs
# (this consumes `result`, which is a one-shot iterator)
print([*result])
print(type(result))

[('Java', 14), ('Python', 3), ('JavaScript', 6)]
<class 'zip'>


In [None]:
# Use zip() to walk two lists in lockstep, one pair per iteration.
# The loop variables are given singular names so they do not overwrite the
# `languages` and `versions` lists; reusing the list names would leave them
# bound to the last pair and make this cell fail when it is run a second time.

for language, version in zip(languages, versions):
    print(language)
    print(version)

Java
14
Python
3
JavaScript
6


In [None]:
# Pair up the inner lists of a list of lists:
# the * operator unpacks `array_test` so each inner list becomes one zip() argument
array_test = [
    ["Java", "Python", "JavaScript"],
    [14, 3, 6],
]

print([*zip(*array_test)])

[('Java', 14), ('Python', 3), ('JavaScript', 6)]


# Multiindexing

Multiindexing is hierarchical (multilevel) indexing. It allows you to store data with more than two dimensions in a 2D data frame.

It allows for novel groupby, pivoting and reshaping of data

see  

https://pandas.pydata.org/docs/user_guide/advanced.html

In [None]:
# Two parallel arrays describing the nested index:
# every outer name ("bar", "baz", ...) is paired with the inner entries "one" and "two"

arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]

# zip the two arrays together into a list of (outer, inner) tuples

tuples = [*zip(*arrays)]

print(tuples)
print("\n\n")

# build the MultiIndex from the tuples and give each level a name

index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

print(index)


[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]



MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])


In [None]:
# Build a Series of 8 random values labelled by the MultiIndex

s = pd.Series(data=np.random.randn(8), index=index)

# Note: the MultiIndex is used as the index, i.e. along the rows,
# so the display looks like a pivot table

s

first  second
bar    one      -0.728443
       two       2.417960
baz    one      -0.406664
       two       0.396180
foo    one       0.859420
       two       0.184724
qux    one       1.494541
       two      -0.353738
dtype: float64

In [None]:
# A multi-indexed DataFrame: the list of arrays is passed as `index`,
# so the multiindexing applies to the rows

df = pd.DataFrame(data=np.random.randn(8, 4), index=arrays)

df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,-0.708506,-1.17506,0.15804,-0.191564
bar,two,-0.929984,0.414279,0.905523,1.479015
baz,one,-0.013717,-0.408731,-0.837577,-0.32507
baz,two,-0.586257,0.323263,1.293287,1.396531
foo,one,0.036805,0.049027,0.595465,-0.082006
foo,two,0.667541,-1.314273,0.092092,-0.665861
qux,one,0.28606,-0.823621,0.247811,0.946388
qux,two,1.132502,-0.876635,-1.52699,-0.320568


In [None]:
# labels of the outer level ("first") for every entry of the MultiIndex
index.get_level_values(level=0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [None]:
# labels of the inner level ("second") for every entry of the MultiIndex
index.get_level_values(level=1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [None]:
# partial indexing: selecting the outer label 'bar' returns its inner rows ('one', 'two')
df.loc['bar']

Unnamed: 0,0,1,2,3
one,-0.708506,-1.17506,0.15804,-0.191564
two,-0.929984,0.414279,0.905523,1.479015


In [None]:
# two lookups in sequence: the outer label "bar" first, then the inner label "two" within it
bar_rows = df.loc["bar"]
bar_rows.loc["two"]

0   -0.929984
1    0.414279
2    0.905523
3    1.479015
Name: two, dtype: float64

In [None]:
# Columns can be multi-indexed as well: here `index` supplies the column labels

df2 = pd.DataFrame(
    data=np.random.randn(3, 8),
    index=["A", "B", "C"],
    columns=index,
)

df2

first,bar,bar,baz,baz,foo,foo,qux,qux
second,one,two,one,two,one,two,one,two
A,0.727889,-0.491036,-1.403808,-1.124257,0.685248,-0.450954,0.102672,0.005423
B,1.061178,0.405675,0.351293,-0.046021,0.184733,-0.397038,0.850494,-0.462915
C,0.338487,-0.77801,-1.152643,0.0966,1.253571,0.687621,1.876058,-0.132846


In [None]:
# interesting slices are now possible:
# selecting the outer column label "bar" returns its inner columns ("one", "two")

df2["bar"]

second,one,two
A,0.727889,-0.491036
B,1.061178,0.405675
C,0.338487,-0.77801


In [None]:
# a full (first, second) key picks out a single column as a Series
df2[("bar", "one")]

A    0.727889
B    1.061178
C    0.338487
Name: (bar, one), dtype: float64

In [None]:
# Getting an example data set loaded up

In [None]:
# CSV endpoint on the Buffalo open-data site holding the hydrant records
url = "https://data.buffalony.gov/resource/fkfv-wqrx.csv"

# pandas reads the CSV straight from the URL
# NOTE(review): the loaded frame has exactly 1000 rows - presumably a default
# row limit on the endpoint; confirm whether the full data set is wanted
hydrants = pd.read_csv(filepath_or_buffer=url)

In [None]:
# preview the first five rows
hydrants.head(n=5)

Unnamed: 0,the_geom,objectid,hydrant_id,st_number,st_name,cross_st,location,hyd_type,hyd_color,datecollec,x,y,map_num,map_quad,hyd_num,insp_dist,comments
0,POINT (-78.847815301278 42.931013768472),6453,6C-0118,OPP 2254,MAIN ST,FLORENCE AVE,MAIN ST,MUELLER,WHITE,,1077462.97,1068041.44,6,6C,118,5,
1,POINT (-78.887261638708 42.925981934471),6045,5C-0117,312,BIRD AVE,PARKDALE AVE,NWC,K-81,RED,,1066892.43,1066243.28,5,5C,117,2,
2,POINT (-78.834552096816 42.901953752276),1275,12B-0040,288,Loepere St,Walden Ave,,HOWARD,RED,2006-10-06T00:00:00.000Z,1080983.3,1057439.66,12,12B,40,6,
3,POINT (-78.878470724291 42.869767136562),2437,14C-0091,,S MICHIGAN AVE,FUHRMANN BLVD,,K-81,BLUE,,1069175.03,1045747.54,14,14C,91,11,
4,POINT (-78.887564891849 42.93677043205),5871,5B-0025,673,REES ST,,,K-81,YELLOW,,1066825.43,1070175.44,5,5A,146,3,


In [None]:
# (rows, columns) of the loaded frame
hydrants.shape

(1000, 17)

In [None]:
# Dealing with NaN values

In [None]:
# Count the missing street numbers:
# isna() flags each missing entry as True, and summing the flags counts them

hydrants['st_number'].isna().sum()

6

In [None]:
# show the full rows whose street number is missing
hydrants.loc[hydrants['st_number'].isna()]

Unnamed: 0,the_geom,objectid,hydrant_id,st_number,st_name,cross_st,location,hyd_type,hyd_color,datecollec,x,y,map_num,map_quad,hyd_num,insp_dist,comments
3,POINT (-78.878470724291 42.869767136562),2437,14C-0091,,S MICHIGAN AVE,FUHRMANN BLVD,,K-81,BLUE,,1069175.03,1045747.54,14,14C,91,11,
86,POINT (-78.814897504857 42.842680517601),4603,21B-0169,,MCKINLEY PKWY,COOLIDGE RD,HYD SEC MCKINLEY PKWY & COOLIDGE,MATHEWS,RED,,1086189.97,1035822.67,0,21B,0,0,HYD SEC MCKINLEY PKWY & COOLIDGE
204,POINT (-78.878237096688 42.878963730545),2370,14B-0196,,COMMERCIAL ST,PERRY BLVD,84.8' N 1.5' W of NWC PERRY BLVD,MUELLER,BLUE,,1069249.4,1049098.98,14,14B,196,11,
690,POINT (-78.845428565331 42.955192853221),5186,3A-0104,,RACHEL VINCENT WAY,STARIN AVE,2'S 402'W OF W CURB OF STARIN AVE,K-81,RED,,1078129.62,1076851.52,3,3A,104,4,
920,POINT (-78.866870884426 42.88212771143),2371,14B-0197,,DELAMR MITCHELL DRIVE,,282'N 8'E NEC S.DIVISION AND DELMAR MITCHELL,MUELLER,BLUE,,1072299.77,1050241.62,0,14B,0,0,282'N 8'E NEC S.DIVISION AND DELMAR MITCHELL
966,POINT (-78.875213089806 42.954692357742),4814,2A-0098,,Delaware Loop W,,,,RED,,1070155.66,1076695.38,0,,0,0,


In [None]:
# number of hydrants with no collection date recorded
hydrants['datecollec'].isna().sum()

92

In [None]:
# Keep only the hydrants that carry a date stamp; notna() is the complement of isna()

hydrants_wd = hydrants.loc[hydrants['datecollec'].notna(), :]

# check: no missing dates should remain in the filtered frame
sum(hydrants_wd['datecollec'].isna())

0

# Date and time handling

In [None]:
# Parse the timestamp strings in `datecollec` (e.g. 2006-10-06T00:00:00.000Z)
# into pandas datetimes.
# The parsed values are held in `temp`; hydrants_wd itself is not modified here.
# (`import datetime` now lives with the other imports at the top of the notebook.)

temp = pd.to_datetime(hydrants_wd["datecollec"])



In [None]:
# datecollec is still listed as object: the to_datetime result was stored in
# `temp`, not written back into hydrants_wd
hydrants_wd.dtypes

the_geom       object
objectid        int64
hydrant_id     object
st_number      object
st_name        object
cross_st       object
location       object
hyd_type       object
hyd_color      object
datecollec     object
x             float64
y             float64
map_num         int64
map_quad       object
hyd_num         int64
insp_dist       int64
comments       object
dtype: object