# Pandas

In [44]:
import numpy as np
import pandas as pd
import os

## Benefits of Pandas Library
- Has tools for reading and writing data between many formats (csv, excel, html, db, ...).
- Intelligently grab data based on: indexing, logic, subsetting and more.
- Gracefully handles missing data.
- Easily adjust and restructure (reshape/wrangle) data.
- Works with data in memory (RAM), so operations run fast and hard-disk I/O is minimal.
- Allows access to index based on descriptive values (usually strings).

### Pandas (Numpy) Series

In [45]:
# Help access
# help(pd.Series)

In [46]:
# Two ordinary Python lists: country labels and the matching values
myindex = ['USA', 'Canada', 'Mexico']
mydata = [1776, 1867, 1821]
# Confirm both are plain lists before handing them to pandas
print(type(myindex), type(mydata), sep='\n')

<class 'list'>
<class 'list'>


In [47]:
# Wrap the list in a Series; with no index supplied, rows are labelled 0, 1, 2
myseries = pd.Series(mydata)
print(type(myseries))
myseries

<class 'pandas.core.series.Series'>


0    1776
1    1867
2    1821
dtype: int64

In [48]:
# Same values, now labelled by country name instead of a default integer range
myseries = pd.Series(mydata, index=myindex)
myseries

USA       1776
Canada    1867
Mexico    1821
dtype: int64

In [49]:
# Positional access: use .iloc explicitly. A bare integer key on a Series with
# string labels (myseries[0]) only works through a deprecated positional
# fallback in Series.__getitem__ (FutureWarning since pandas 2.1).
print(myseries.iloc[0])
# Label-based access: look the value up by its index label
print(myseries['USA'])

1776
1776


### Series from dictionary

In [50]:
# Name -> age mapping used to build a Series from a dictionary
ages = dict(Sam=10, Fido=5, Spike=7)
ages

{'Sam': 10, 'Fido': 5, 'Spike': 7}

In [51]:
# Dictionary keys become the index labels; dictionary values become the data
ages_series = pd.Series(data=ages)
ages_series

Sam      10
Fido      5
Spike     7
dtype: int64

In [52]:
# Mock sales data for the company's 1st and 2nd quarters, keyed by country.
# The two quarters deliberately cover different countries (Japan vs. Brazil).
q1 = dict(Japan=80, China=450, India=200, USA=250)
q2 = dict(Brazil=100, China=500, India=210, USA=260)

In [53]:
# Convert each quarter's dictionary into a Series indexed by country
sales_q1, sales_q2 = map(pd.Series, (q1, q2))

In [54]:
# Display the first-quarter sales Series
sales_q1

Japan     80
China    450
India    200
USA      250
dtype: int64

In [55]:
# Display the second-quarter sales Series
sales_q2

Brazil    100
China     500
India     210
USA       260
dtype: int64

In [56]:
# The index holds the Series labels (Series.keys() is an alias for it)
sales_q1.index

Index(['Japan', 'China', 'India', 'USA'], dtype='object')

In [57]:
# Add the two quarters for a single label by looking each value up explicitly
sales_q1.loc['USA'] + sales_q2.loc['USA']

510

In [58]:
# Adding two Series with `+` aligns them on their index labels.
# Labels present in only one operand (Japan, Brazil) come out as NaN,
# and the result is float64 because NaN cannot be stored in an integer dtype.
totalq1q2_sales = sales_q1 + sales_q2
totalq1q2_sales

Brazil      NaN
China     950.0
India     410.0
Japan       NaN
USA       510.0
dtype: float64

In [59]:
# Series.add with fill_value=0 treats a label missing from one side as 0,
# so no NaN appears and the totals can be cast back to an integer dtype.
totalq1q2_sales = (
    sales_q1
    .add(sales_q2, fill_value=0)
    .astype('int32')
)
totalq1q2_sales

Brazil    100
China     950
India     410
Japan      80
USA       510
dtype: int32

### Pandas DataFrame: A group of Pandas (Numpy) Series objects that share the same index.

In [60]:
# help(pd.DataFrame)

In [61]:
# Dummy data for the DataFrame examples, built as a NumPy array:
# seed the global RNG for reproducibility, then draw a 4x3 grid of
# int16 values in the range 0..100 (the upper bound 101 is exclusive).
np.random.seed(101)
mydata = np.random.randint(low=0, high=101, size=(4, 3), dtype='int16')
mydata

array([[95, 50, 11],
       [81, 23, 70],
       [99, 63, 74],
       [87,  4, 75]], dtype=int16)

In [62]:
# Row labels for the DataFrame: one state code per row of `mydata`
myindex = ['CA', 'NY', 'AZ', 'TX']
myindex

['CA', 'NY', 'AZ', 'TX']

In [63]:
# Column labels for the DataFrame: one month per column of `mydata`
mycolumns = ['Jan', 'Feb', 'Mar']
mycolumns

['Jan', 'Feb', 'Mar']

In [64]:
# Bare DataFrame: with no labels supplied, rows and columns both get
# default integer labels starting at 0
df = pd.DataFrame(mydata)
df

Unnamed: 0,0,1,2
0,95,50,11
1,81,23,70
2,99,63,74
3,87,4,75


In [65]:
# Same data, now with state codes as row labels and months as column labels
df = pd.DataFrame(
    mydata,
    index=myindex,
    columns=mycolumns,
)
df

Unnamed: 0,Jan,Feb,Mar
CA,95,50,11
NY,81,23,70
AZ,99,63,74
TX,87,4,75


In [66]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, CA to TX
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Jan     4 non-null      int16
 1   Feb     4 non-null      int16
 2   Mar     4 non-null      int16
dtypes: int16(3)
memory usage: 56.0+ bytes
