# Introduction to Pandas & NumPy - Answers

## Importing the librairies

In [2]:
# import the pandas and NumPy librairies
import pandas as pd
import numpy as np

## Introduction to NumPy

### Creating NumPy arrays

In [5]:
# Create a 1D array of shape (3,)
a = np.array([1,2,3])

print(a.shape)
print(a)

(3,)
[1 2 3]


In [3]:
# Create a 2D array of shape (2, 3) and type float
b = np.array([(1.5,2,3), (4,5,6)], dtype = float)

print(b.shape)
print(b)

(2, 3)
[[1.5 2.  3. ]
 [4.  5.  6. ]]


### Basic NumPy functions

In [4]:
# Create an array with 21 elements uniformly spaced between 0 and 10
first_array = np.linspace(0, 10, 21)
 
print(first_array)

[ 0.   0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5.   5.5  6.   6.5
  7.   7.5  8.   8.5  9.   9.5 10. ]


In [5]:
# Create an array of 21 integers randomly drawn between 0 and 10
np.random.seed(5)
rnd_array = np.random.randint(10, size=21)
 
print(rnd_array)

[3 6 6 0 9 8 4 7 0 0 7 1 5 7 0 1 4 6 2 9 9]


In [6]:
# Cast first_array to an array of integers and stack them together
first_updated = first_array.astype(int)
 
# stack together
stack_array = np.stack((first_updated, rnd_array))

print(stack_array)

[[ 0  0  1  1  2  2  3  3  4  4  5  5  6  6  7  7  8  8  9  9 10]
 [ 3  6  6  0  9  8  4  7  0  0  7  1  5  7  0  1  4  6  2  9  9]]


In [7]:
# What is the average value of stack_array?
print(np.average(stack_array))
 
# What is the row-average of stack_array?
print(np.average(stack_array, axis=1))

4.619047619047619
[4.76190476 4.47619048]


## Introduction to Pandas

### DataFrame & Series

In [8]:
# create a Series of five elements with index a,b,c,d
s = pd.Series([3, -5, 2, 9], index=['a', 'b', 'c', 'd'])

print(s)

a    3
b   -5
c    2
d    9
dtype: int64


In [8]:
# create a DataFrame using the data dict below
data = {'Temperature': [8.5, 10.7, 8.6, 11.2], 
        'Population': [67, 66, 326, 127], 
        'Capital': ['London', 'Paris', 'Washington D.C.', 'Tokyo']}

df = pd.DataFrame(data, index=['UK', 'France', 'USA', 'Japan'])

print(df)

        Temperature  Population          Capital
UK              8.5          67           London
France         10.7          66            Paris
USA             8.6         326  Washington D.C.
Japan          11.2         127            Tokyo


3

### Data selection with iloc & loc

In [10]:
# select part of the DataFrame 1/4 (see slide)
df.iloc[[1, 2]]

Unnamed: 0,Temperature,Population,Capital
France,10.7,66,Paris
USA,8.6,326,Washington D.C.


In [11]:
# select part of the DataFrame 2/4 (see slide)
df.iloc[3, 1:3]

Population      127
Capital       Tokyo
Name: Japan, dtype: object

In [12]:
# select part of the DataFrame 3/4 (see slide)
df.loc[:, ['Temperature']]

Unnamed: 0,Temperature
UK,8.5
France,10.7
USA,8.6
Japan,11.2


In [13]:
# select part of the DataFrame 4/4 (see slide)
df.loc[ df['Population'] > 100, ['Temperature', 'Capital'] ]

Unnamed: 0,Temperature,Capital
USA,8.6,Washington D.C.
Japan,11.2,Tokyo


### Inspecting DataFrames

In [1]:
# Load the googleplaystore.csv file into a pandas DataFrame
google_apps = pd.read_csv('/Users/walch john/Documents/00Others/On-boarding pack for new DS/googleplaystore.csv')

NameError: name 'pd' is not defined

In [15]:
# Print the columns of your DataFrame
print(google_apps.columns)

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')


In [16]:
# Select the first 5 rows of your DataFrame
print(google_apps.head(5))

                                                 App        Category  Rating  \
0     Photo Editor & Candy Camera & Grid & ScrapBook  ART_AND_DESIGN     4.1   
1                                Coloring book moana  ART_AND_DESIGN     3.9   
2  U Launcher Lite – FREE Live Cool Themes, Hide ...  ART_AND_DESIGN     4.7   
3                              Sketch - Draw & Paint  ART_AND_DESIGN     4.5   
4              Pixel Draw - Number Art Coloring Book  ART_AND_DESIGN     4.3   

  Reviews  Size     Installs  Type Price Content Rating  \
0     159   19M      10,000+  Free     0       Everyone   
1     967   14M     500,000+  Free     0       Everyone   
2   87510  8.7M   5,000,000+  Free     0       Everyone   
3  215644   25M  50,000,000+  Free     0           Teen   
4     967  2.8M     100,000+  Free     0       Everyone   

                      Genres      Last Updated         Current Ver  \
0               Art & Design   January 7, 2018               1.0.0   
1  Art & Design;Pretend 

In [17]:
# Show some basic statistical details of your DataFrame
print(google_apps.describe(include='all'))

           App Category       Rating Reviews                Size    Installs  \
count    10841    10841  9367.000000   10841               10841       10841   
unique    9660       34          NaN    6002                 462          22   
top     ROBLOX   FAMILY          NaN       0  Varies with device  1,000,000+   
freq         9     1972          NaN     596                1695        1579   
mean       NaN      NaN     4.193338     NaN                 NaN         NaN   
std        NaN      NaN     0.537431     NaN                 NaN         NaN   
min        NaN      NaN     1.000000     NaN                 NaN         NaN   
25%        NaN      NaN     4.000000     NaN                 NaN         NaN   
50%        NaN      NaN     4.300000     NaN                 NaN         NaN   
75%        NaN      NaN     4.500000     NaN                 NaN         NaN   
max        NaN      NaN    19.000000     NaN                 NaN         NaN   

         Type  Price Content Rating Gen

In [18]:
# Show all the unique values in your DataFrame
print(google_apps['Size'].unique())

['19M' '14M' '8.7M' '25M' '2.8M' '5.6M' '29M' '33M' '3.1M' '28M' '12M'
 '20M' '21M' '37M' '2.7M' '5.5M' '17M' '39M' '31M' '4.2M' '7.0M' '23M'
 '6.0M' '6.1M' '4.6M' '9.2M' '5.2M' '11M' '24M' 'Varies with device'
 '9.4M' '15M' '10M' '1.2M' '26M' '8.0M' '7.9M' '56M' '57M' '35M' '54M'
 '201k' '3.6M' '5.7M' '8.6M' '2.4M' '27M' '2.5M' '16M' '3.4M' '8.9M'
 '3.9M' '2.9M' '38M' '32M' '5.4M' '18M' '1.1M' '2.2M' '4.5M' '9.8M' '52M'
 '9.0M' '6.7M' '30M' '2.6M' '7.1M' '3.7M' '22M' '7.4M' '6.4M' '3.2M'
 '8.2M' '9.9M' '4.9M' '9.5M' '5.0M' '5.9M' '13M' '73M' '6.8M' '3.5M'
 '4.0M' '2.3M' '7.2M' '2.1M' '42M' '7.3M' '9.1M' '55M' '23k' '6.5M' '1.5M'
 '7.5M' '51M' '41M' '48M' '8.5M' '46M' '8.3M' '4.3M' '4.7M' '3.3M' '40M'
 '7.8M' '8.8M' '6.6M' '5.1M' '61M' '66M' '79k' '8.4M' '118k' '44M' '695k'
 '1.6M' '6.2M' '18k' '53M' '1.4M' '3.0M' '5.8M' '3.8M' '9.6M' '45M' '63M'
 '49M' '77M' '4.4M' '4.8M' '70M' '6.9M' '9.3M' '10.0M' '8.1M' '36M' '84M'
 '97M' '2.0M' '1.9M' '1.8M' '5.3M' '47M' '556k' '526k' '76M' '7.6M'

### Updating DataFrames

In [19]:
# Create a new DataFrame called 'updated_apps' that doesn't contain the string 'Varies with device' in the 'Size' column
updated_apps = google_apps[google_apps['Size'] != 'Varies with device']

In [20]:
# Create a function to update a single string of the 'Size' column into a number

def update_value_loop(s):
    # replace 1,000
    s = s.replace('1,000','1000')
 
    # get value and character strings
    val = s[:-1]
    char = s[-1:]
 
    # convert to value
    number = float(val)
 
    # multiply value
    if char == 'k':
        return (number * 1000)
    elif char == 'M':
        return (number * 1000000)
    else:
        return number

First, let's update all the values in the 'Size' column by iterating over the DataFrames' rows and appending the results

In [21]:
%%timeit
app_sizes = []
 
for index, row in updated_apps.iterrows():
    app_sizes.append(update_value_loop(row['Size']))

1 loop, best of 3: 541 ms per loop


Now, let's do the same using pandas' apply() function

In [22]:
%%timeit
app_sizes = updated_apps.apply(lambda row: update_value_loop(row['Size']), axis=1)

10 loops, best of 3: 118 ms per loop


The apply() method is 5x faster!

In [23]:
# Create a similar function that operates on an entire Series at a time

def update_size_values_vectorized(s):
    # replace 1,000
    s = s.replace('1,000+','1000+')
    
    # remove last string
    val = s.str[:-1]
    char = s.str[-1:]
 
    # convert to value
    number = val.astype(float)

    if char.str == 'k':
        return number * 1000
    elif char.str == 'M':
        return number * 1000000
    else:
        return number

It is possible to go even faster by using a vectorized function

In [24]:
%%timeit
app_sizes = update_size_values_vectorized(updated_apps['Size'])

100 loops, best of 3: 6.05 ms per loop


There is a 20x improvement!

In [25]:
# Create a similar vectorized function to update the 'Price' variable of the DataFrame

def update_price(s):
    # Deal with the 'Everyone' issue
    s = s.replace(to_replace='Everyone', value='$0')
    s = s.replace(to_replace='0', value='$0')
     
    # get value
    val = s.str[1:]
 
    # convert to value
    number = val.astype(float)
 
    return number

Let's see how fast our function is updating the Price variable

In [26]:
%%timeit
app_prices = update_price(updated_apps['Price'])

100 loops, best of 3: 4.66 ms per loop


### Memory Management

In [27]:
# Create a function to compare the memory usage when storing a DataFrame's column as string vs category

def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 # convert bytes to KB
    return "{:03.1f} kB".format(usage_mb)

In [28]:
# let's compare the memory usage when storing the DataFrame's 'Category' column as string vs category
print(mem_usage(google_apps['Category']))
print(mem_usage(google_apps['Category'].astype('category')))

699.1 kB
14.2 kB


Converting the column to caterogicals enabled ~50x memory improvement with 0 information loss!

### Pickling

In [29]:
# Apply the updates to the dataset before saving it
updated_apps['Size'] = updated_apps.apply(lambda row: update_value_loop(row['Size']), axis=1)
updated_apps['Price'] = update_price(updated_apps['Price'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [30]:
# Save the updated dataset to pickle
updated_apps.to_pickle('data.pkl')

In [31]:
# Save the updated dataset to pickle and compress it to bz2
updated_apps.to_pickle('data.bz2', compression='bz2')

In [32]:
# Reload the compressed pickle file
pd.read_pickle('data.bz2', compression='bz2')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19000000.0,"10,000+",Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14000000.0,"500,000+",Free,0.0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8700000.0,"5,000,000+",Free,0.0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25000000.0,"50,000,000+",Free,0.0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2800000.0,"100,000+",Free,0.0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5600000.0,"50,000+",Free,0.0,Everyone,Art & Design,"March 26, 2017",1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19000000.0,"50,000+",Free,0.0,Everyone,Art & Design,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29000000.0,"1,000,000+",Free,0.0,Everyone,Art & Design,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33000000.0,"1,000,000+",Free,0.0,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3100000.0,"10,000+",Free,0.0,Everyone,Art & Design;Creativity,"July 3, 2018",2.8,4.0.3 and up
