# Set up NoteBook

## Load libraries

In [1]:
import pandas as pd
import os

## Set up Jupyter NoteBook output

In [2]:
# To output more than one output from one cell without print()
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = 'all'

# Set up formatting so larger numbers aren't displayed in scientific notation (h/t @thecapacity)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

## Set up working directories

In [3]:
# Path to project
path_project = 'D:\Study\DataCamp'
# Path to data folder in project directory
path_data = 'data'
# Path to working directory
path_wd = os.path.join(path_project, path_data)

# Set working directory
os.chdir(path_wd) # path_project, path_wd

# Ouput working directory
print('Working directory is: ', os.getcwd())

Working directory is:  D:\Study\DataCamp\data


# General Findings

In [None]:
# Indexing using square brackets
df['salt']['Jan'] # Choose Columns first, then Rows -> df['Jan']['salt'] - WONT WORK

# Using the .loc accessor -  BEST WAY TO SUBSELECT
df.loc['May', 'spam'] # Row first, Column second

# Subselecting returned DataTypes
df.loc['May', 'spam']       # Individual value - numpy.int64
df.loc[:, 'spam']           # Series
df.loc[:, ['salt', 'spam']] # DataFrame

df['spam'] ->  Series
df[['spame']] -> DataFrame

# Using the .iloc accessor
df.iloc[4, 2] 
    # Row first, Column second; 
    # Uses 0 based indexing
    
# Slice the row labels 'Potter' to 'Perry' in reverse order: p_counties_rev
p_counties_rev = election.loc['Potter':'Perry':-1, :] # DF have ordered rows alphabeticaly

# Select columns with all nonzeros - ZERO in this case is 0 not NaN !!!
df2.loc[:, df2.all()]

# Select columns without NaNs
df.loc[:, df.notnull().all()] # if atleast one NaN then column not included, 0 != NaN for this case, so 0 stays!!!

# Call .dropna() with thresh=1000 and axis='columns' and print the output of .info() from titanic
titanic.dropna(thresh=1000, axis='columns')
    # Drops columns where more than 1000 NaN values
    
# Best way to transform data in pandas DataFrames is with methods inherent to the DataFrames
# Next best using NymPy ufuncs - universal funcs, to transform entire columns of data elementwise

# Working with string values
df.index = df.index.str.upper()
    # index comes with .str - that is accessor for vectorized string operations

# Working with string values
df.index = df.index.map(str.lower)
    # For the index there is no .apply() method so need to use map
    # The .map() method is used to transform values according to a Python dictionary look-up.
    
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama': 'blue', 'Romney': 'red'}
# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)
    # In column - 'color' -> if winner - Obama then blue, etc

# Pandas Data Structures:
    # Indexes: Sequence of labels
    # Series: 1D array with Index
    # DataFrames: 2D array with Series as columns

# Indexes   - Immutable (like dictionary keys)
    #       - Homogenoues in data type(Like NumPy arrays)
    
# Setting MultiIndex
stocks = stocks.set_index(['Symbol', 'Date'])
    # .set_index - sets index in place, and no need to delete column

# To access index.name for MultiIndex - use .names not .name
print(stocks.index.names)
    
# Indexing (individual row)
stocks.loc[('CSCO', '2016-10-04'), 'Volume']
    # To access particular value in DataFrame with MultiIndexing - use () instead of []

# Slicing (both indexes) - IMPORTANT part is slice(None)
stocks.loc[(slice(None), slice('2016-10-03', '2016-10-04')), :]    

# Fancy indexing (innermost and outermost index at same time)
stocks.loc[(['CSCO', 'AAPL'], ['2016-10-05', '2016-10-03']), :]


    

# Extracting and transforming data

## Indexing DataFrames

When subselecting from DataFrame or List, result might be: 
    Individual value
    Series
    DataFrame

In [5]:
# A simple DataFrame
df = pd.read_csv('sales.csv', index_col='month')

df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [12]:
# Indexing using square brackets
df['salt']['Jan'] # Choose Columns first, then Rows -> df['Jan']['salt'] - WONT WORK

12.0

In [13]:
# Using column attribute and row label
df.eggs['Mar']

221

In [14]:
# Using the .loc accessor
df.loc['May', 'spam'] # Row first, Column second

52

In [15]:
# Using the .iloc accessor
df.iloc[4, 2] 
    # Row first, Column second; 
    # Uses 0 based indexing

52

In [16]:
# Selecting only some columns
df_new = df[['salt', 'eggs']]

df_new

Unnamed: 0_level_0,salt,eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,12.0,47
Feb,50.0,110
Mar,89.0,221
Apr,87.0,77
May,,132
Jun,60.0,205


In [23]:
election = pd.read_csv('pennsylvania2012_turnout.csv', index_col='county')

election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [24]:
# Assign the row position of election.loc['Bedford']: x
x = 4

# Assign the column position of election['winner']: y
y = 4

# Print the boolean equivalence
print(election.iloc[x, y] == election.loc['Bedford', 'winner'])

True


In [26]:
filename = 'https://s3.amazonaws.com/assets.datacamp.com/production/course_1650/datasets/pennsylvania2012.csv'

In [27]:
# Import pandas
import pandas as pd

# Read in filename and set the index: election
election = pd.read_csv(filename, index_col='county')

# Create a separate dataframe with the columns ['winner', 'total', 'voters']: results
results = election[['winner', 'total', 'voters']]

# Print the output of results.head()
print(results.head())

           winner   total  voters
county                           
Adams      Romney   41973   61156
Allegheny   Obama  614671  924351
Armstrong  Romney   28322   42147
Beaver     Romney   80015  115157
Bedford    Romney   21444   32189


## Slicing DataFrames

In [28]:
df = pd.read_csv('sales.csv', index_col='month')

df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [30]:
# Selecting a column (i.e., Series)
df['eggs']

type(df['eggs'])

month
Jan     47
Feb    110
Mar    221
Apr     77
May    132
Jun    205
Name: eggs, dtype: int64

pandas.core.series.Series

In [31]:
# Slicing and indexing a Series
df['eggs'][1:4] # Part of the eggs column

df['eggs'][4] # The value associated with May

month
Feb    110
Mar    221
Apr     77
Name: eggs, dtype: int64

132

In [32]:
# Using .loc[]
df.loc[:, 'eggs':'salt'] # All rows some columns

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
Feb,110,50.0
Mar,221,89.0
Apr,77,87.0
May,132,
Jun,205,60.0


In [33]:
# Using .loc[]
df.loc['Jan':'Apr', :] # Some rows, all columns

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20


In [34]:
# Using .loc[]
df.loc['Mar':'May', 'salt':'spam']

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [35]:
# Using .iloc[]
df.iloc[2:5, 1:] # A block from middleof the DataFrame

Unnamed: 0_level_0,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Mar,89.0,72
Apr,87.0,20
May,,52


In [36]:
# Using lists rather than slices
df.loc['Jan':'May', ['eggs', 'spam']]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52


In [37]:
# Using lists rather than slices
df.iloc[[0, 4, 5], 0:2]

Unnamed: 0_level_0,eggs,salt
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,12.0
May,132,
Jun,205,60.0


In [40]:
# Series
type(df['eggs'])

# DataFrame
type(df[['eggs']])

pandas.core.series.Series

pandas.core.frame.DataFrame

In [41]:
election = pd.read_csv('pennsylvania2012_turnout.csv', index_col='county')

election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [42]:
# Slice the row labels 'Perry' to 'Potter': p_counties
p_counties = election.loc['Perry':'Potter', :]

# Print the p_counties DataFrame
print(p_counties)

# Slice the row labels 'Potter' to 'Perry' in reverse order: p_counties_rev
p_counties_rev = election.loc['Potter':'Perry':-1, :]

# Print the p_counties_rev DataFrame
print(p_counties_rev)

             state   total  Obama  Romney  winner   voters  turnout  margin
county                                                                     
Perry           PA   18240 29.770  68.591  Romney    27245   66.948  38.821
Philadelphia    PA  653598 85.224  14.051   Obama  1099197   59.461  71.173
Pike            PA   23164 43.904  54.883  Romney    41840   55.363  10.978
Potter          PA    7205 26.260  72.158  Romney    10913   66.022  45.899
             state   total  Obama  Romney  winner   voters  turnout  margin
county                                                                     
Potter          PA    7205 26.260  72.158  Romney    10913   66.022  45.899
Pike            PA   23164 43.904  54.883  Romney    41840   55.363  10.978
Philadelphia    PA  653598 85.224  14.051   Obama  1099197   59.461  71.173
Perry           PA   18240 29.770  68.591  Romney    27245   66.948  38.821


In [43]:
# Slice the columns from the starting column to 'Obama': left_columns
left_columns = election.loc[:, :'Obama']

# Print the output of left_columns.head()
print(left_columns.head())

# Slice the columns from 'Obama' to 'winner': middle_columns
middle_columns = election.loc[:, 'Obama':'winner']

# Print the output of middle_columns.head()
print(middle_columns.head())

# Slice the columns from 'Romney' to the end: 'right_columns'
right_columns = election.loc[:, 'Romney':]

# Print the output of right_columns.head()
print(right_columns.head())

          state   total  Obama
county                        
Adams        PA   41973 35.482
Allegheny    PA  614671 56.640
Armstrong    PA   28322 30.697
Beaver       PA   80015 46.033
Bedford      PA   21444 22.057
           Obama  Romney  winner
county                          
Adams     35.482  63.112  Romney
Allegheny 56.640  42.186   Obama
Armstrong 30.697  67.901  Romney
Beaver    46.033  52.638  Romney
Bedford   22.057  76.987  Romney
           Romney  winner  voters  turnout  margin
county                                            
Adams      63.112  Romney   61156   68.633  27.630
Allegheny  42.186   Obama  924351   66.498  14.454
Armstrong  67.901  Romney   42147   67.198  37.204
Beaver     52.638  Romney  115157   69.483   6.605
Bedford    76.987  Romney   32189   66.619  54.929


In [44]:
# Create the list of row labels: rows
rows = ['Philadelphia', 'Centre', 'Fulton']

# Create the list of column labels: cols
cols = ['winner', 'Obama', 'Romney']

# Create the new DataFrame: three_counties
three_counties = election.loc[rows, cols]

# Print the three_counties DataFrame
print(three_counties)

              winner  Obama  Romney
county                             
Philadelphia   Obama 85.224  14.051
Centre        Romney 48.948  48.977
Fulton        Romney 21.096  77.749


## Filtering DataFrames

In [45]:
df = pd.read_csv('sales.csv', index_col='month')

df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [46]:
df.salt > 60

month
Jan    False
Feb    False
Mar     True
Apr     True
May    False
Jun    False
Name: salt, dtype: bool

In [47]:
# Filtering with Boolean Series
df[df.salt > 0]

enough_salt_sold = df.salt > 0

df[enough_salt_sold]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
Jun,205,60.0,55


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
Jun,205,60.0,55


In [48]:
# Combining filters
df[(df.salt >= 50) & (df.eggs < 200)] # Operattor & - And -> both conditions should be True

df[(df.salt >= 50) | (df.eggs < 200)] # Operator | - Or -> one condition should be True

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,110,50.0,31
Apr,77,87.0,20


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [50]:
# DataFrames with Zeros and NaNs
df2 = df.copy()

df2['bacon'] = [0, 0, 50, 60, 70, 80]

df2

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


In [51]:
# Select columns with all nonzeros
df2.loc[:, df2.all()]

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [52]:
# Select columns with any nonzeros
df2.loc[:, df2.any()]

Unnamed: 0_level_0,eggs,salt,spam,bacon
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,0
Feb,110,50.0,31,0
Mar,221,89.0,72,50
Apr,77,87.0,20,60
May,132,,52,70
Jun,205,60.0,55,80


In [53]:
# Select columns with any NaNs
df.loc[:, df.isnull().any()]

Unnamed: 0_level_0,salt
month,Unnamed: 1_level_1
Jan,12.0
Feb,50.0
Mar,89.0
Apr,87.0
May,
Jun,60.0


In [55]:
# Select columns without NaNs
df.loc[:, df.notnull().all()]

Unnamed: 0_level_0,eggs,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,47,17
Feb,110,31
Mar,221,72
Apr,77,20
May,132,52
Jun,205,55


In [56]:
# Drop rows with any NaNs
df.dropna(how='any')

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
Jun,205,60.0,55


In [57]:
# Filtering a column based on another
df.eggs[df.salt > 55]

month
Mar    221
Apr     77
Jun    205
Name: eggs, dtype: int64

In [58]:
# Modyfying column based on another
df.eggs[df.salt > 55] += 5

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,226,89.0,72
Apr,82,87.0,20
May,132,,52
Jun,210,60.0,55


In [61]:
election = pd.read_csv('pennsylvania2012_turnout.csv', index_col='county')

election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [62]:
# Create the boolean array: high_turnout
high_turnout = election['turnout'] > 70

# Filter the election DataFrame with the high_turnout array: high_turnout_df
high_turnout_df = election.loc[high_turnout, :]

# Print the high_turnout_results DataFrame
print(high_turnout_df)

             state   total  Obama  Romney  winner  voters  turnout  margin
county                                                                    
Bucks           PA  319407 49.967  48.802   Obama  435606   73.325   1.165
Butler          PA   88924 31.921  66.817  Romney  122762   72.436  34.896
Chester         PA  248295 49.229  49.651  Romney  337822   73.499   0.422
Forest          PA    2308 38.735  59.835  Romney    3232   71.411  21.101
Franklin        PA   62802 30.111  68.584  Romney   87406   71.851  38.473
Montgomery      PA  401787 56.637  42.287   Obama  551105   72.906  14.350
Westmoreland    PA  168709 37.568  61.306  Romney  238006   70.884  23.739


In [63]:
# Import numpy
import numpy as np

# Create the boolean array: too_close
too_close = election['margin'] < 1

# Assign np.nan to the 'winner' column where the results were too close to call
election.loc[too_close, 'winner'] = np.nan

# Print the output of election.info()
print(election.info())

<class 'pandas.core.frame.DataFrame'>
Index: 67 entries, Adams to York
Data columns (total 8 columns):
state      67 non-null object
total      67 non-null int64
Obama      67 non-null float64
Romney     67 non-null float64
winner     64 non-null object
voters     67 non-null int64
turnout    67 non-null float64
margin     67 non-null float64
dtypes: float64(4), int64(2), object(2)
memory usage: 4.7+ KB
None


In [64]:
titanic = pd.read_csv('titanic.csv')

In [65]:
# Select the 'age' and 'cabin' columns: df
df = titanic[['age', 'cabin']]

# Print the shape of df
print(df.shape)

# Drop rows in df with how='any' and print the shape
print(df.dropna(how='any').shape)

# Drop rows in df with how='all' and print the shape
print(df.dropna(how='all').shape)

# Call .dropna() with thresh=1000 and axis='columns' and print the output of .info() from titanic
print(titanic.dropna(thresh=1000, axis='columns').info())
    # Drops columns where more than 1000 NaN values

(1309, 2)
(272, 2)
(1069, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 10 columns):
pclass      1309 non-null int64
survived    1309 non-null int64
name        1309 non-null object
sex         1309 non-null object
age         1046 non-null float64
sibsp       1309 non-null int64
parch       1309 non-null int64
ticket      1309 non-null object
fare        1308 non-null float64
embarked    1307 non-null object
dtypes: float64(2), int64(4), object(4)
memory usage: 102.3+ KB
None


## Transforming DataFrames

In [68]:
df = pd.read_csv('sales.csv', index_col='month')

df

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52
Jun,205,60.0,55


In [69]:
# DataFrame vectorized methods
df.floordiv(12) # Convert to dozens unit, rounded down

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [70]:
# NumPy vectorized functions
np.floor_divide(df, 12) # Convert to dozens unit

  


Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3.0,1.0,1.0
Feb,9.0,4.0,2.0
Mar,18.0,7.0,6.0
Apr,6.0,7.0,1.0
May,11.0,,4.0
Jun,17.0,5.0,4.0


In [72]:
# Plain Python functions
def dozens(n):
    return n // 12

df.apply(dozens) # Convert to dozens unit

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [74]:
# Plain Python functions -  lambda
df.apply(lambda n: n // 12)
    # n -  input argument;
    # : - after that -> output expresion

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,3,1.0,1
Feb,9,4.0,2
Mar,18,7.0,6
Apr,6,7.0,1
May,11,,4
Jun,17,5.0,4


In [75]:
# Storing a transformation
df['dozens_of_eggs'] = df.eggs.floordiv(12)

df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Jan,47,12.0,17,3
Feb,110,50.0,31,9
Mar,221,89.0,72,18
Apr,77,87.0,20,6
May,132,,52,11
Jun,205,60.0,55,17


In [76]:
# The DataFrame index
df.index

Index(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'], dtype='object', name='month')

In [77]:
# Working with string values
df.index = df.index.str.upper()
    # index comes with .str - that is accessor for vectorized string operations

df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
JAN,47,12.0,17,3
FEB,110,50.0,31,9
MAR,221,89.0,72,18
APR,77,87.0,20,6
MAY,132,,52,11
JUN,205,60.0,55,17


In [79]:
# Working with string values
df.index = df.index.map(str.lower)
    # For the index there is no .apply() method so need to use map
    
df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
jan,47,12.0,17,3
feb,110,50.0,31,9
mar,221,89.0,72,18
apr,77,87.0,20,6
may,132,,52,11
jun,205,60.0,55,17


In [84]:
# Defining columns using other columns
df['salty_eggs'] = df.salt + df.dozens_of_eggs

df

Unnamed: 0_level_0,eggs,salt,spam,dozens_of_eggs,salty_eggs
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
jan,47,12.0,17,3,15.0
feb,110,50.0,31,9,59.0
mar,221,89.0,72,18,107.0
apr,77,87.0,20,6,93.0
may,132,,52,11,
jun,205,60.0,55,17,77.0


In [85]:
weather = pd.read_csv('pittsburgh2013.csv')

weather.head()

Unnamed: 0,Date,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,Mean Dew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,2013-1-1,32,28,21,30,27,16,100,89,77,...,10,6,2,10,8,,0.0,8,Snow,277
1,2013-1-2,25,21,17,14,12,10,77,67,55,...,10,10,10,14,5,,0.0,4,,272
2,2013-1-3,32,24,16,19,15,9,77,67,56,...,10,10,10,17,8,26.0,0.0,3,,229
3,2013-1-4,30,28,27,21,19,17,75,68,59,...,10,10,6,23,16,32.0,0.0,4,,250
4,2013-1-5,34,30,25,23,20,16,75,68,61,...,10,10,10,16,10,23.0,0.21,5,,221


In [86]:
# Write a function to convert degrees Fahrenheit to degrees Celsius: to_celsius
def to_celsius(F):
    return 5/9*(F - 32)

# Apply the function over 'Mean TemperatureF' and 'Mean Dew PointF': df_celsius
df_celsius = weather[['Mean TemperatureF', 'Mean Dew PointF']].apply(to_celsius)

# Reassign the columns df_celsius
df_celsius.columns = ['Mean TemperatureC', 'Mean Dew PointC']

# Print the output of df_celsius.head()
print(df_celsius.head())

   Mean TemperatureC  Mean Dew PointC
0             -2.222           -2.778
1             -6.111          -11.111
2             -4.444           -9.444
3             -2.222           -7.222
4             -1.111           -6.667


In [88]:
election = pd.read_csv('pennsylvania2012_turnout.csv', index_col='county')

election.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [89]:
# Create the dictionary: red_vs_blue
red_vs_blue = {'Obama': 'blue', 'Romney': 'red'}

# Use the dictionary to map the 'winner' column to the new column: election['color']
election['color'] = election['winner'].map(red_vs_blue)

# Print the output of election.head()
print(election.head())

          state   total  Obama  Romney  winner  voters  turnout  margin color
county                                                                       
Adams        PA   41973 35.482  63.112  Romney   61156   68.633  27.630   red
Allegheny    PA  614671 56.640  42.186   Obama  924351   66.498  14.454  blue
Armstrong    PA   28322 30.697  67.901  Romney   42147   67.198  37.204   red
Beaver       PA   80015 46.033  52.638  Romney  115157   69.483   6.605   red
Bedford      PA   21444 22.057  76.987  Romney   32189   66.619  54.929   red


In [90]:
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())

<class 'numpy.ndarray'>
          state   total  Obama  Romney  winner  voters  turnout  margin color  \
county                                                                          
Adams        PA   41973 35.482  63.112  Romney   61156   68.633  27.630   red   
Allegheny    PA  614671 56.640  42.186   Obama  924351   66.498  14.454  blue   
Armstrong    PA   28322 30.697  67.901  Romney   42147   67.198  37.204   red   
Beaver       PA   80015 46.033  52.638  Romney  115157   69.483   6.605   red   
Bedford      PA   21444 22.057  76.987  Romney   32189   66.619  54.929   red   

           turnout_zscore  
county                     
Adams               0.854  
Allegheny           0.440  
Armstrong           0.576  
Beaver              1.019  
Bedford             0.463  


# Advanced indexing

## Index objects and labeled data

In [91]:
# Creating a Series
prices = [10.70, 10.86, 10.74, 10.71, 10.79]
shares = pd.Series(prices)

shares

0   10.700
1   10.860
2   10.740
3   10.710
4   10.790
dtype: float64

In [92]:
# Creating an index
days = ['Mon', 'Tue', 'Wed', 'Thur', 'Fri']
shares = pd.Series(prices, index=days)

shares

Mon    10.700
Tue    10.860
Wed    10.740
Thur   10.710
Fri    10.790
dtype: float64

In [95]:
# Examining an index
shares.index

shares.index[2]

shares.index[:2]

shares.index[-2:]

print(shares.index.name)

Index(['Mon', 'Tue', 'Wed', 'Thur', 'Fri'], dtype='object')

'Wed'

Index(['Mon', 'Tue'], dtype='object')

Index(['Thur', 'Fri'], dtype='object')

None


In [96]:
# Modifying index name
shares.index.name = 'weekday'

shares

weekday
Mon    10.700
Tue    10.860
Wed    10.740
Thur   10.710
Fri    10.790
dtype: float64

In [99]:
# Modifying index entries
# WONT WORK CODE BELLOW
#shares.index[2] = 'Wednesday'

shares.index = ['Monday', 'Tuesday', 'Wednaesday', 'Thursday', 'Friday']
    # index could be changed only all at once
shares

Monday       10.700
Tuesday      10.860
Wednaesday   10.740
Thursday     10.710
Friday       10.790
dtype: float64

In [101]:
# There were NO unemployment dataset so will use - pennsylvania2012_turnout
df = pd.read_csv('pennsylvania2012_turnout.csv')

df.head()

Unnamed: 0,county,state,total,Obama,Romney,winner,voters,turnout,margin
0,Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
1,Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
2,Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
3,Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
4,Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [102]:
# Assigning the index
df.index = df['county']

df.head()

Unnamed: 0_level_0,county,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Adams,Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [103]:
# Removing extra column
del df['county']

df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [104]:
# Examining index & columns
df.info()

df.index

df.index.name

type(df.index)

df.columns

<class 'pandas.core.frame.DataFrame'>
Index: 67 entries, Adams to York
Data columns (total 8 columns):
state      67 non-null object
total      67 non-null int64
Obama      67 non-null float64
Romney     67 non-null float64
winner     67 non-null object
voters     67 non-null int64
turnout    67 non-null float64
margin     67 non-null float64
dtypes: float64(4), int64(2), object(2)
memory usage: 4.7+ KB


Index(['Adams', 'Allegheny', 'Armstrong', 'Beaver', 'Bedford', 'Berks',
       'Blair', 'Bradford', 'Bucks', 'Butler', 'Cambria', 'Cameron', 'Carbon',
       'Centre', 'Chester', 'Clarion', 'Clearfield', 'Clinton', 'Columbia',
       'Crawford', 'Cumberland', 'Dauphin', 'Delaware', 'Elk', 'Erie',
       'Fayette', 'Forest', 'Franklin', 'Fulton', 'Greene', 'Huntingdon',
       'Indiana', 'Jefferson', 'Juniata', 'Lackawanna', 'Lancaster',
       'Lawrence', 'Lebanon', 'Lehigh', 'Luzerne', 'Lycoming', 'McKean',
       'Mercer', 'Mifflin', 'Monroe', 'Montgomery', 'Montour', 'Northampton',
       'Northumberland', 'Perry', 'Philadelphia', 'Pike', 'Potter',
       'Schuylkill', 'Snyder', 'Somerset', 'Sullivan', 'Susquehanna', 'Tioga',
       'Union', 'Venango', 'Warren', 'Washington', 'Wayne', 'Westmoreland',
       'Wyoming', 'York'],
      dtype='object', name='county')

'county'

pandas.core.indexes.base.Index

Index(['state', 'total', 'Obama', 'Romney', 'winner', 'voters', 'turnout',
       'margin'],
      dtype='object')

In [106]:
# pd.read_csv() with index_col
df = pd.read_csv('pennsylvania2012_turnout.csv', index_col='county')

df.head()

Unnamed: 0_level_0,state,total,Obama,Romney,winner,voters,turnout,margin
county,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Adams,PA,41973,35.482,63.112,Romney,61156,68.633,27.63
Allegheny,PA,614671,56.64,42.186,Obama,924351,66.498,14.454
Armstrong,PA,28322,30.697,67.901,Romney,42147,67.198,37.204
Beaver,PA,80015,46.033,52.638,Romney,115157,69.483,6.605
Bedford,PA,21444,22.057,76.987,Romney,32189,66.619,54.929


In [107]:
# A simple DataFrame
sales = pd.read_csv('sales.csv', index_col='month')

sales.head()

Unnamed: 0_level_0,eggs,salt,spam
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Jan,47,12.0,17
Feb,110,50.0,31
Mar,221,89.0,72
Apr,77,87.0,20
May,132,,52


In [108]:
# Create the list of new indexes: new_idx
new_idx = [i.upper() for i in sales.index]

# Assign new_idx to sales.index
sales.index = new_idx

# Print the sales DataFrame
print(sales)

     eggs   salt  spam
JAN    47 12.000    17
FEB   110 50.000    31
MAR   221 89.000    72
APR    77 87.000    20
MAY   132    nan    52
JUN   205 60.000    55


In [109]:
# Assign the string 'MONTHS' to sales.index.name
sales.index.name = 'MONTHS'

# Print the sales DataFrame
print(sales)

# Assign the string 'PRODUCTS' to sales.columns.name 
sales.columns.name = 'PRODUCTS'

# Print the sales dataframe again
print(sales)

        eggs   salt  spam
MONTHS                   
JAN       47 12.000    17
FEB      110 50.000    31
MAR      221 89.000    72
APR       77 87.000    20
MAY      132    nan    52
JUN      205 60.000    55
PRODUCTS  eggs   salt  spam
MONTHS                     
JAN         47 12.000    17
FEB        110 50.000    31
MAR        221 89.000    72
APR         77 87.000    20
MAY        132    nan    52
JUN        205 60.000    55


In [110]:
# A simple DataFrame
sales = pd.read_csv('sales.csv')

sales.head()

Unnamed: 0,month,eggs,salt,spam
0,Jan,47,12.0,17
1,Feb,110,50.0,31
2,Mar,221,89.0,72
3,Apr,77,87.0,20
4,May,132,,52


In [111]:
# Generate the list of months: months
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']

# Assign months to sales.index
sales.index = months

# Print the modified sales DataFrame
print(sales)

    month  eggs   salt  spam
Jan   Jan    47 12.000    17
Feb   Feb   110 50.000    31
Mar   Mar   221 89.000    72
Apr   Apr    77 87.000    20
May   May   132    nan    52
Jun   Jun   205 60.000    55


## Hierarchical indexing

In [134]:
stocks = pd.read_csv('stocks_3days.csv')

stocks.head()

Unnamed: 0,Date,Close,Volume,Symbol
0,2016-10-03,31.5,14070500,CSCO
1,2016-10-03,112.52,21701800,AAPL
2,2016-10-03,57.42,19189500,MSFT
3,2016-10-04,113.0,29736800,AAPL
4,2016-10-04,57.24,20085900,MSFT


In [135]:
# Setting index
stocks = stocks.set_index(['Symbol', 'Date'])
    # .set_index - sets index in place, and no need to delete column
    
    # Code bellow wont create multi index, so use.set_index
    # stocks.index = stocks[['Symbol', 'Date']]
    # stocks.drop(['Symbol', 'Date'], axis=1) 
stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
AAPL,2016-10-03,112.52,21701800
MSFT,2016-10-03,57.42,19189500
AAPL,2016-10-04,113.0,29736800
MSFT,2016-10-04,57.24,20085900
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-05,57.64,16726400
CSCO,2016-10-05,31.59,11808600
AAPL,2016-10-05,113.05,21453100


In [136]:
stocks.index

print(stocks.index.name)

print(stocks.index.names)

MultiIndex(levels=[['AAPL', 'CSCO', 'MSFT'], ['2016-10-03', '2016-10-04', '2016-10-05']],
           labels=[[1, 0, 2, 0, 2, 1, 2, 1, 0], [0, 0, 0, 1, 1, 1, 2, 2, 2]],
           names=['Symbol', 'Date'])

None
['Symbol', 'Date']


In [137]:
# Sorting index
stocks = stocks.sort_index()

stocks

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,2016-10-03,112.52,21701800
AAPL,2016-10-04,113.0,29736800
AAPL,2016-10-05,113.05,21453100
CSCO,2016-10-03,31.5,14070500
CSCO,2016-10-04,31.35,18460400
CSCO,2016-10-05,31.59,11808600
MSFT,2016-10-03,57.42,19189500
MSFT,2016-10-04,57.24,20085900
MSFT,2016-10-05,57.64,16726400


In [138]:
# Indexing (individual row)
stocks.loc[('CSCO', '2016-10-04')]

stocks.loc[('CSCO', '2016-10-04'), 'Volume']

Close          31.350
Volume   18460400.000
Name: (CSCO, 2016-10-04), dtype: float64

18460400.0

In [139]:
# Slicing (outermost index) with MultiIndex
stocks.loc['AAPL']

Unnamed: 0_level_0,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2016-10-03,112.52,21701800
2016-10-04,113.0,29736800
2016-10-05,113.05,21453100


In [140]:
# Slicing (outermost index) with MultiIndex
stocks.loc['CSCO':'MSFT']

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
CSCO,2016-10-04,31.35,18460400
CSCO,2016-10-05,31.59,11808600
MSFT,2016-10-03,57.42,19189500
MSFT,2016-10-04,57.24,20085900
MSFT,2016-10-05,57.64,16726400


In [141]:
# Fancy indexing (outermost index) with MultiIndex
stocks.loc[(['AAPL', 'MSFT'], '2016-10-05'), :]

stocks.loc[(['AAPL', 'MSFT'], '2016-10-05'), 'Close']

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,2016-10-05,113.05,21453100
MSFT,2016-10-05,57.64,16726400


Symbol  Date      
AAPL    2016-10-05   113.050
MSFT    2016-10-05    57.640
Name: Close, dtype: float64

In [142]:
# Fancy indexing (innermost index)
stocks.loc[('CSCO', ['2016-10-05', '2016-10-03']), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
CSCO,2016-10-03,31.5,14070500
CSCO,2016-10-05,31.59,11808600


In [143]:
# Slicing (both indexes)
stocks.loc[(slice(None), slice('2016-10-03', '2016-10-04')), :]

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
AAPL,2016-10-03,112.52,21701800
AAPL,2016-10-04,113.0,29736800
CSCO,2016-10-03,31.5,14070500
CSCO,2016-10-04,31.35,18460400
MSFT,2016-10-03,57.42,19189500
MSFT,2016-10-04,57.24,20085900


In [155]:
# Data not available in needed fromat for exercise will do what is neede to have data
sales = pd.read_csv('sales.csv')

sales.drop('month', axis=1, inplace=True)
sales['month'] = [1, 2, 1, 2, 1, 2]
sales['state'] = ['CA', 'CA', 'NY', 'NY', 'TX', 'TX']

sales.set_index(['state', 'month'], inplace=True)
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
CA,2,110,50.0,31
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,2,205,60.0,55


In [156]:
# Print sales.loc[['CA', 'TX']]
print(sales.loc[['CA', 'TX']])

# Print sales['CA':'TX']
print(sales['CA':'TX'])

             eggs   salt  spam
state month                   
CA    1        47 12.000    17
      2       110 50.000    31
TX    1       132    nan    52
      2       205 60.000    55
             eggs   salt  spam
state month                   
CA    1        47 12.000    17
      2       110 50.000    31
NY    1       221 89.000    72
      2        77 87.000    20
TX    1       132    nan    52
      2       205 60.000    55


In [158]:
# Data not available in needed fromat for exercise will do what is neede to have data
sales = pd.read_csv('sales.csv')

sales.drop('month', axis=1, inplace=True)
sales['month'] = [1, 2, 1, 2, 1, 2]
sales['state'] = ['CA', 'CA', 'NY', 'NY', 'TX', 'TX']

sales

Unnamed: 0,eggs,salt,spam,month,state
0,47,12.0,17,1,CA
1,110,50.0,31,2,CA
2,221,89.0,72,1,NY
3,77,87.0,20,2,NY
4,132,,52,1,TX
5,205,60.0,55,2,TX


In [159]:
# Set the index to be the columns ['state', 'month']: sales
sales = sales.set_index(['state', 'month'])

# Sort the MultiIndex: sales
sales = sales.sort_index()

# Print the sales DataFrame
print(sales)

             eggs   salt  spam
state month                   
CA    1        47 12.000    17
      2       110 50.000    31
NY    1       221 89.000    72
      2        77 87.000    20
TX    1       132    nan    52
      2       205 60.000    55


In [160]:
# Data not available in needed fromat for exercise will do what is neede to have data
sales = pd.read_csv('sales.csv')

sales.drop('month', axis=1, inplace=True)
sales['month'] = [1, 2, 1, 2, 1, 2]
sales['state'] = ['CA', 'CA', 'NY', 'NY', 'TX', 'TX']

sales

Unnamed: 0,eggs,salt,spam,month,state
0,47,12.0,17,1,CA
1,110,50.0,31,2,CA
2,221,89.0,72,1,NY
3,77,87.0,20,2,NY
4,132,,52,1,TX
5,205,60.0,55,2,TX


In [161]:
# Set the index to the column 'state': sales
sales = sales.set_index('state')

# Print the sales DataFrame
print(sales)

# Access the data from 'NY'
print(sales.loc['NY', :])

       eggs   salt  spam  month
state                          
CA       47 12.000    17      1
CA      110 50.000    31      2
NY      221 89.000    72      1
NY       77 87.000    20      2
TX      132    nan    52      1
TX      205 60.000    55      2
       eggs   salt  spam  month
state                          
NY      221 89.000    72      1
NY       77 87.000    20      2


In [162]:
# Data not available in needed fromat for exercise will do what is neede to have data
sales = pd.read_csv('sales.csv')

sales.drop('month', axis=1, inplace=True)
sales['month'] = [1, 2, 1, 2, 1, 2]
sales['state'] = ['CA', 'CA', 'NY', 'NY', 'TX', 'TX']

sales.set_index(['state', 'month'], inplace=True)
sales

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,1,47,12.0,17
CA,2,110,50.0,31
NY,1,221,89.0,72
NY,2,77,87.0,20
TX,1,132,,52
TX,2,205,60.0,55


In [164]:
# Look up data for NY in month 1: NY_month1
NY_month1 = sales.loc[('NY', 1), :]

NY_month1

# Look up data for CA and TX in month 2: CA_TX_month2
CA_TX_month2 = sales.loc[(['CA', 'TX'], 2), :]

CA_TX_month2

# Look up data for all states in month 2: all_month2
all_month2 = sales.loc[(slice(None), 2), :]

all_month2

eggs   221.000
salt    89.000
spam    72.000
Name: (NY, 1), dtype: float64

Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2,110,50.0,31
TX,2,205,60.0,55


Unnamed: 0_level_0,Unnamed: 1_level_0,eggs,salt,spam
state,month,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2,110,50.0,31
NY,2,77,87.0,20
TX,2,205,60.0,55


# Rearranging and reshaping data

## Pivoting DataFrames

## Stacking & unstacking DataFrames

## Melting DataFrames

## Pivot tables

# Grouping data

## Categoricals and groupby

## Groupby and aggregation

## Groupby and transformation

## Groupby and filtering

# Bringing it all together

## Case Study - Summer Olympics

## Understanding the column labels

## Constructing alternative country rankings

## Reshaping DataFrames for visualization

# Wrap Up