# Other bits and pieces of Pandas

Some other useful features within Panda

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Background info


zip() produces an iterable of tuples from one or more arrays

In [2]:
# what does zip() do

languages = ['Java', 'Python', 'JavaScript']
versions = [14, 3, 6]

result = zip(languages, versions)
print(list(result))
print(type(result))

[('Java', 14), ('Python', 3), ('JavaScript', 6)]
<class 'zip'>


In [3]:
# using zip to make multiple iterations make more sense

for languages, versions in zip(languages, versions):
    print(languages)
    print(versions)

Java
14
Python
3
JavaScript
6


In [4]:
# pairing two lists from a list of lists
array_test=[['Java', 'Python', 'JavaScript'],[14, 3, 6]]

print(list(zip(*array_test)))

[('Java', 14), ('Python', 3), ('JavaScript', 6)]


# Multiindexing

This is a hierarchical or multilevel indexing.  It allows you to store data of more than two dimensions in a 2D data frame

It allows for novel groupby, pivoting and reshaping of data

see  

https://pandas.pydata.org/docs/user_guide/advanced.html

In [5]:
# set up two arrays for the indices,  these are nested indices,   with entries one and two for each name

arrays = [
    ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
    ["one", "two", "one", "two", "one", "two", "one", "two"],
]

# now zip the arrays into a tuple

tuples = list(zip(*arrays))

print(tuples) 
print("\n\n")

# now form these into a multiindex

index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])

print(index)


[('bar', 'one'), ('bar', 'two'), ('baz', 'one'), ('baz', 'two'), ('foo', 'one'), ('foo', 'two'), ('qux', 'one'), ('qux', 'two')]



MultiIndex([('bar', 'one'),
            ('bar', 'two'),
            ('baz', 'one'),
            ('baz', 'two'),
            ('foo', 'one'),
            ('foo', 'two'),
            ('qux', 'one'),
            ('qux', 'two')],
           names=['first', 'second'])


In [6]:
# put random values into a multiindexed series

s = pd.Series(np.random.randn(8), index=index)

#notice, the multiindex was use as the index, along rows

# this looks like a  pivot table

s

first  second
bar    one      -0.392810
       two      -0.547455
baz    one      -0.363310
       two      -0.210120
foo    one      -0.749565
       two      -0.252857
qux    one      -1.098818
       two      -0.677560
dtype: float64

In [7]:
# creating a multi-indexed data frame
# notice this is using the index, so this is multiindexing of rows

df = pd.DataFrame(np.random.randn(8, 4), index=arrays)

df

Unnamed: 0,Unnamed: 1,0,1,2,3
bar,one,1.388964,0.993054,0.979598,0.558392
bar,two,-2.909657,-1.06987,0.99521,0.494008
baz,one,-1.801273,-1.090733,0.053184,-0.110642
baz,two,-0.015449,-0.972829,0.110901,1.936321
foo,one,1.89685,1.049549,-1.219351,-2.816742
foo,two,1.997833,1.669285,-0.611838,0.972937
qux,one,0.211609,-0.67071,-0.543403,-0.534242
qux,two,-0.217689,-0.121272,-1.224364,-0.675533


In [8]:
index.get_level_values(0)

Index(['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], dtype='object', name='first')

In [9]:
index.get_level_values(1)

Index(['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'], dtype='object', name='second')

In [10]:
df.loc['bar'].

SyntaxError: invalid syntax (3618795554.py, line 1)

In [None]:
df.loc["bar"].loc["two"]

In [None]:
# we can multiindex columns als well

df2 = pd.DataFrame(np.random.randn(3, 8), index=["A", "B", "C"], columns=index)

df2

In [None]:
# intersting slices are now possible

df2["bar"]

In [None]:
df2["bar","one"]

In [None]:
# Getting an example data set loaded up

In [None]:
url="https://data.buffalony.gov/resource/fkfv-wqrx.csv"

hydrants=pd.read_csv(url)

In [None]:
hydrants.head()

In [None]:
hydrants.shape

In [None]:
# Dealing with NaN values

In [None]:
# counts of missing data,  pd.isna gives us the binary coding of missing entries

np.sum(pd.isna(hydrants['st_number']))

In [None]:
hydrants.loc[pd.isna(hydrants['st_number']),:]

In [None]:
np.sum(pd.isna(hydrants['datecollec']))

In [None]:
# I want only the date stamped hydrants - notna is the opposite of isna

hydrants_wd=hydrants.loc[pd.notna(hydrants['datecollec']),:]

sum(pd.isna(hydrants_wd['datecollec']))

# Date and time handling

In [None]:
import datetime

temp=pd.to_datetime(hydrants_wd.datecollec)



In [None]:
hydrants_wd.dtypes