# Data Prep

First we import some python libraries for data manipulation

In [59]:
%matplotlib inline
from matplotlib import pyplot as plt

import numpy as np
import pandas as pd

import dateutil.parser

## Grab Our Raw Data

First let's grab the data we downloaded from the Kaggle

In [2]:
features = pd.read_csv('data/raw/googleplaystore.csv')
reviews = pd.read_csv('data/raw/googleplaystore_user_reviews.csv')

In [4]:
features.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [18]:
reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


## Cleaning Data
Next we're going to drop any values that contains NaN values, as we have enough data that there should be no adverse effects to it.

In [3]:
features_final = features.dropna()
reviews_final = reviews.dropna()

In [7]:
reviews_final.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3


# Save DataFrames

In [13]:
features_final.to_csv('data/prepped/features_final.csv', index=False)
reviews_final.to_csv('data/prepped/reviews_final.csv', index=False)

# Further Prep

In [14]:
features_final.columns = ['app', 'category', 'rating', 'reviews', 'size', 'installs', 'type',
       'price', 'content_rating', 'genres', 'last_updated', 'current_ver',
       'android_ver']

features_final.head(2)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [15]:
play_features = features_final[['app', 'category', 'size', 'type',
       'price', 'content_rating', 'genres', 'last_updated', 'current_ver', 'android_ver']]

play_outcomes = features_final[['rating', 'reviews', 'installs']]

In [16]:
play_features['android_ver'] = [s.replace(" and up", "+") for s in play_features['android_ver']]
play_features['android_ver'] = [s.replace(" ", "_") for s in play_features['android_ver']]

play_features.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3+


In [17]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

In [18]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

In [23]:
#play_features['android_ver'] = [s.replace("+", "") for s in play_features['android_ver']]
play_features.columns = ['app', 'category', 'size', 'type', 'price', 'content_rating', 'genres',
       'last_updated', 'current_ver', 'android_min_ver']
play_features.head(1)

Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_min_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3


In [24]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

In [25]:
play_features['android_min_ver'] = [s.replace(".", "_") for s in play_features['android_min_ver']]

In [26]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

# More Data Prep 3/10/19

In [35]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

In [36]:
play_features.head(1)

Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_min_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4_0_3


### Replace 'size' attribute with actual values

In [37]:
play_features['size'] = [s.replace("k", "000") for s in play_features['size']]
play_features['size'] = [s.replace("M", "000000") for s in play_features['size']]
play_features['size'] = [s.replace(".", "") for s in play_features['size']]
play_features['size'] = [s.replace('Varies with device', "varies") for s in play_features['size']]

play_features[play_features['size'] == 'Varies with device'].shape
print(play_features.shape)

play_features_no_varies = play_features[play_features['size'] != 'varies']

play_features_no_varies['size'] = [float(s) for s in play_features_no_varies['size']]
mean_size_known = play_features_no_varies['size'].mean()

play_features['size'] = [s.replace("varies", str(mean_size_known)) for s in play_features['size']]

play_features['size'] = [float(s) for s in play_features['size']]

play_features.head(1)

(9360, 10)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_min_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19000000.0,Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4_0_3


In [38]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

### Converting price into useful types

In [47]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

In [48]:
play_features.price.unique()

array(['0', '$4.99', '$3.99', '$6.99', '$7.99', '$5.99', '$2.99', '$3.49',
       '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49', '$10.00',
       '$24.99', '$11.99', '$79.99', '$16.99', '$14.99', '$29.99',
       '$12.99', '$2.49', '$10.99', '$1.50', '$19.99', '$15.99', '$33.99',
       '$39.99', '$3.95', '$4.49', '$1.70', '$8.99', '$1.49', '$3.88',
       '$399.99', '$17.99', '$400.00', '$3.02', '$1.76', '$4.84', '$4.77',
       '$1.61', '$2.50', '$1.59', '$6.49', '$1.29', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$8.49', '$1.75', '$14.00', '$2.00',
       '$3.08', '$2.59', '$19.40', '$3.90', '$4.59', '$15.46', '$3.04',
       '$13.99', '$4.29', '$3.28', '$4.60', '$1.00', '$2.95', '$2.90',
       '$1.97', '$2.56', '$1.20'], dtype=object)

In [49]:
play_features['price'] = [s.replace("$", "") for s in play_features['price']]

play_features['price'] = [float(s) for s in play_features['price']]

play_features.price.unique()

array([  0.  ,   4.99,   3.99,   6.99,   7.99,   5.99,   2.99,   3.49,
         1.99,   9.99,   7.49,   0.99,   9.  ,   5.49,  10.  ,  24.99,
        11.99,  79.99,  16.99,  14.99,  29.99,  12.99,   2.49,  10.99,
         1.5 ,  19.99,  15.99,  33.99,  39.99,   3.95,   4.49,   1.7 ,
         8.99,   1.49,   3.88, 399.99,  17.99, 400.  ,   3.02,   1.76,
         4.84,   4.77,   1.61,   2.5 ,   1.59,   6.49,   1.29, 299.99,
       379.99,  37.99,  18.99, 389.99,   8.49,   1.75,  14.  ,   2.  ,
         3.08,   2.59,  19.4 ,   3.9 ,   4.59,  15.46,   3.04,  13.99,
         4.29,   3.28,   4.6 ,   1.  ,   2.95,   2.9 ,   1.97,   2.56,
         1.2 ])

In [50]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

### Converting dates into useful info

In [63]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

play_features.last_updated.unique()

array(['January 7, 2018', 'January 15, 2018', 'August 1, 2018', ...,
       'January 20, 2014', 'February 16, 2014', 'March 23, 2014'],
      dtype=object)

In [64]:
today = dateutil.parser.parse("2019-03-10").date()

play_features['days_since_update'] = [(today - dateutil.parser.parse(s).date()).days for s in play_features['last_updated']]

play_features['days_since_update'] = [float(s) for s in play_features['days_since_update']]

In [65]:
play_features.head(1)

Unnamed: 0,app,category,size,type,price,content_rating,genres,last_updated,current_ver,android_min_ver,days_since_update
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19000000.0,Free,0.0,Everyone,Art & Design,"January 7, 2018",1.0.0,4_0_3,427.0


In [66]:
play_features.to_csv('data/prepped/play_features.csv', index=False)
play_outcomes.to_csv('data/prepped/play_outcomes.csv', index=False)

### Making genres useful

In [77]:
play_features = pd.read_csv('data/prepped/play_features.csv')
play_outcomes = pd.read_csv('data/prepped/play_outcomes.csv')

#play_features['genres'].count(';').unique()

num_colons = [s.count(';') for s in play_features['genres']]



1