# Chapter 5: Python Data Analysis with pandas

<div id="toc"></div>

## 5-1. Working with 1D Data

* Problem  
* Solution  
* How It Works  

In [1]:
import pandas as pd
# Wrap a plain Python list in a one-dimensional pandas Series.
data = [1, 2, 3, 4]
data_pd = pd.Series(data=data)

In [None]:
>>> data_pd.dtype
dtype('int64')
>>> import numpy as np
>>> data_pd2 = pd.Series(data, dtype=np.float64)
>>> data_pd2.dtype
dtype('float64')

In [2]:
# Multiplying a Series by a scalar scales every element; the index is kept.
2 * data_pd

0    2
1    4
2    6
3    8
dtype: int64

In [3]:
# Look up a single value by its index label (label 2 holds the third element).
data_pd[2]

3

In [4]:
# Overwrite the value stored under index label 3 in place.
data_pd[3] = 9

## 5-2. Working with 2D Data
* Problem  
* Solution  
* How It Works  

In [5]:
# Each dictionary key becomes a column label and its list supplies that
# column's values; the rows receive the default integer index.
d1 = {'one': [1, 2, 3, 4], 'two': [9, 8, 7, 6]}
df1 = pd.DataFrame(data=d1)
df1

Unnamed: 0,one,two
0,1,9
1,2,8
2,3,7
3,4,6


In [6]:
# Selecting by column label returns that column as a Series named 'one'.
df1['one']

0    1
1    2
2    3
3    4
Name: one, dtype: int64

In [7]:
# Chained lookup: first select column 'one', then the value at index label 2.
df1['one'][2]

3

In [8]:
# .loc selects by row label; the row comes back as a Series whose index is
# the column labels ('one', 'two').
df1.loc[1]

one    2
two    8
Name: 1, dtype: int64

In [9]:
# Fetch a single cell with one label-based lookup: row label 1, column 'two'.
# The previous form, df1.loc[1][1], indexed the row Series (whose labels are
# the strings 'one'/'two') with an integer and therefore relied on the
# positional fallback of Series[...], which pandas has deprecated.
df1.loc[1, 'two']

8

## 5-3. Working with 3D Data
* Problem  
* Solution  
* How It Works  

In [11]:
import numpy as np

In [12]:
# pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so 3D data is
# represented here as a DataFrame whose columns carry a two-level MultiIndex
# (item, column). The frames are stacked under their dict keys, then the keys
# are moved to the outer column level so that data_panel['item2'] still
# returns one aligned 2D frame. Columns missing from an item (column 2 of
# 'item2') are filled with NaN, matching what Panel produced.
data_dict = {
    'item1': pd.DataFrame(np.random.randn(4, 3)),
    'item2': pd.DataFrame(np.random.randn(4, 2)),
}
data_panel = (
    pd.concat(data_dict)      # rows: (item, row); columns: union of all columns
    .unstack(level=0)         # rows: row; columns: (column, item)
    .swaplevel(axis=1)        # columns: (item, column)
    .sort_index(axis=1)       # group each item's columns together, in order
)

In [13]:
# Select the 2D frame stored under the 'item2' key. Column 2 is empty in the
# result because 'item2' was built with only two columns.
data_panel['item2']

Unnamed: 0,0,1,2
0,-1.161885,-0.537938,
1,-0.671292,2.432737,
2,0.244829,-0.633335,
3,0.170161,-0.637829,


## 5-4. Importing Data from CSV Files
* Problem  
* Solution  
* How It Works  

In [None]:
 csv_data = pd.read_csv('data_file.csv')

## 5-5. Saving to a CSV File
* Problem  
* Solution  
* How It Works  

In [None]:
# Write the object to 'export_file.csv' using to_csv's default settings.
# NOTE(review): series_data is not defined in the cells shown - presumably a
# Series created by the reader beforehand; confirm.
series_data.to_csv('export_file.csv')

In [None]:
# header=False and index=False leave out the column-name row and the index
# column, so only the raw values are written.
# NOTE(review): data2 is not defined in the cells shown; confirm its origin.
data2.to_csv('data_file.csv', header=False, index=False)

## 5-6. Importing from Spreadsheets
* Problem  
* Solution  
* How It Works  

In [None]:
# Read the worksheet named 'Sheet1' into a DataFrame. The keyword is
# `sheet_name`; the older spelling `sheetname` was deprecated in pandas 0.21
# and has since been removed, so it now raises a TypeError.
# NOTE(review): the '.xsl' extension looks like a typo for '.xls'/'.xlsx' -
# confirm the real file name.
data_frame1 = pd.read_excel('data_file.xsl', sheet_name='Sheet1')

In [None]:
# Open the workbook as an ExcelFile object instead of reading a sheet directly.
# NOTE(review): the '.xsl' extension looks like a typo for '.xls'/'.xlsx' -
# confirm the real file name.
excel_data = pd.ExcelFile('data_file.xsl')

## 5-7. Saving to a Spreadsheet
* Problem  
* Solution  
* How It Works  

In [None]:
# Write the DataFrame to the worksheet 'Sheet1'. to_excel's keyword is
# `sheet_name` (it has no `sheet` argument), and pandas picks the writer
# engine from the file extension, so the target needs a real Excel extension
# such as '.xlsx' rather than '.xsl'.
df.to_excel('output_file.xlsx', sheet_name='Sheet1')

## 5-8. Getting the Head and Tail
* Problem  
* Solution  
* How It Works  

In [14]:
# Create a Series of 1000 random samples (np.random.randn), then show only
# its first two entries.
data_series = pd.Series(np.random.randn(1000))
data_series.head(2)

0   -0.271723
1   -1.097757
dtype: float64

In [15]:
# Show only the last two entries (index labels 998 and 999).
data_series.tail(2)

998    1.242268
999    0.162675
dtype: float64

## 5-9. Summarizing Data
* Problem  
* Solution  
* How It Works  

In [16]:
# One-call summary: count, mean, standard deviation, min, quartiles and max.
data_series.describe()

count    1000.000000
mean       -0.010750
std         1.014484
min        -3.555131
25%        -0.680081
50%        -0.030919
75%         0.651722
max         2.912694
dtype: float64

In [17]:
# Squaring the standard deviation gives the variance; compare with var().
data_series.std() ** 2

1.0291775507060255

In [18]:
# Variance computed directly; it agrees with std() ** 2 up to the last
# floating-point digit.
data_series.var()

1.0291775507060257

## 5-10. Sorting Data
* Problem  
* Solution  
* How It Works  

In [19]:
# Pin the column order explicitly. A DataFrame built from a dict follows the
# dict's insertion order on current Python/pandas (one, two, three), whereas
# the outputs recorded in this section list the columns as one, three, two.
df = pd.DataFrame(
    {'one': [1, 2, 3], 'two': [4, 5, 6], 'three': [7, 8, 9]},
    index=['b', 'c', 'a'],
    columns=['one', 'three', 'two'],
)

In [20]:
df

Unnamed: 0,one,three,two
b,1,7,4
c,2,8,5
a,3,9,6


In [21]:
# Return the rows ordered by index label (a, b, c).
df.sort_index()

Unnamed: 0,one,three,two
a,3,9,6
b,1,7,4
c,2,8,5


In [22]:
# axis=1 sorts the column labels instead of the row labels; ascending=False
# puts them in reverse alphabetical order (two, three, one).
df.sort_index(axis=1, ascending=False)

Unnamed: 0,two,three,one
b,4,7,1
c,5,8,2
a,6,9,3


In [23]:
# Order the rows by the values in column 'two', largest first.
df.sort_values(by='two', ascending=False)

Unnamed: 0,one,three,two
a,3,9,6
c,2,8,5
b,1,7,4


## 5-11. Applying Functions Row- or Column-Wise
* Problem  
* Solution  
* How It Works  

In [24]:
# Pin the column order explicitly. A DataFrame built from a dict follows the
# dict's insertion order on current Python/pandas (one, two, three), whereas
# the outputs recorded in this section list the columns as one, three, two.
df = pd.DataFrame(
    {'one': [1, 2, 3], 'two': [4, 5, 6], 'three': [7, 8, 9]},
    index=['b', 'c', 'a'],
    columns=['one', 'three', 'two'],
)

In [25]:
# With the default axis the function is applied to each column, giving one
# mean per column.
df.apply(np.mean)

one      2.0
three    8.0
two      5.0
dtype: float64

In [26]:
# axis=1 applies the function to each row instead, giving one mean per row.
df.apply(np.mean, axis=1)

b    4.0
c    5.0
a    6.0
dtype: float64

In [27]:
# The lambda receives one row at a time (axis=1) and doubles it, so the
# result keeps the original row and column labels.
df.apply(lambda x: 2*x, axis=1)

Unnamed: 0,one,three,two
b,2,14,8
c,4,16,10
a,6,18,12


## 5-12. Applying Functions Element-Wise
* Problem  
* Solution  
* How It Works  

In [28]:
# Pin the column order explicitly. A DataFrame built from a dict follows the
# dict's insertion order on current Python/pandas (one, two, three), whereas
# the outputs recorded in this section list the columns as one, three, two.
df = pd.DataFrame(
    {'one': [1, 2, 3], 'two': [4, 5, 6], 'three': [7, 8, 9]},
    index=['b', 'c', 'a'],
    columns=['one', 'three', 'two'],
)

In [29]:
# applymap calls the function on every individual element; here each value
# is squared.
df.applymap(lambda x: x*x)

Unnamed: 0,one,three,two
b,1,49,16
c,4,64,25
a,9,81,36


In [30]:
# Series.map does the element-wise work for a single column: every value of
# 'two' is doubled.
df['two'].map(lambda x: 2*x)

b     8
c    10
a    12
Name: two, dtype: int64

## 5-13. Iterating Over Data
* Problem  
* Solution  
* How It Works  

In [31]:
# Pin the column order explicitly. A DataFrame built from a dict follows the
# dict's insertion order on current Python/pandas (one, two, three), which
# would print the means as 2.0, 5.0, 8.0 instead of the recorded
# 2.0, 8.0, 5.0.
df = pd.DataFrame(
    {'one': [1, 2, 3], 'two': [4, 5, 6], 'three': [7, 8, 9]},
    index=['b', 'c', 'a'],
    columns=['one', 'three', 'two'],
)
# Iterating over a DataFrame yields its column labels, in column order.
for col in df:
    print(df[col].mean())

2.0
8.0
5.0


In [None]:
# iterrows() yields (index label, row data) pairs; print the label first,
# then the row's contents.
for row_index,row in df.iterrows():
    print(row_index)
    print(row)