<a href="https://colab.research.google.com/github/ExxLiang193/DataAnalysisPractice/blob/master/downloads_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import re
# Raise pandas' display limits (rows, columns, line width) for this session.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

In [0]:
# Read the Play Store export and discard the row labelled 10472 (invalid entry).
with open('googleplaystore.csv', 'r') as psf:
    store_data = pd.read_csv(psf).drop(index=[10472])

In [0]:
# Placeholder text that size_parse() treats as "no concrete value".
VARIABLE = 'Varies with device'

# Mapping helper functions

def size_parse(size):
    """Turn a size string such as '19M' or '201k' into a float.

    Args:
        size: Size text. Any 'M' / 'k' characters at either end are removed
            before the remainder is converted with ``float``.

    Returns:
        ``-1.`` when ``size`` equals ``VARIABLE``; otherwise the numeric part,
        divided by 1000 when the string ends in 'k' (presumably to express
        kilobyte values in megabytes -- confirm against the dataset).
    """
    if size == VARIABLE:
        return -1.
    scale = 1000 if size.endswith('k') else 1
    return float(size.strip('Mk')) / scale

def multihot_encode(col_data, col_name, sep=',', prefix=''):
    """Expand a column of delimiter-joined tokens into one column per token.

    Args:
        col_data: Series whose entries are strings; each is split on ``sep``.
        col_name: Name used for the intermediate exploded column.
        sep: Delimiter separating the tokens inside an entry.
        prefix: Optional text prepended to every output column label.

    Returns:
        DataFrame with one row per distinct index label of ``col_data`` and
        one column per distinct token. Each cell is the number of times the
        token occurs in that row's entry, 0 when it does not occur.
    """
    def split_entry(entry):
        return entry.split(sep)

    exploded = col_data.map(split_entry).to_frame(name=col_name).explode(col_name)
    # The only column is consumed as the pivot's column key, so `len` is
    # applied to each (row label, token) group and yields its row count.
    encoded = pd.pivot_table(exploded, index=exploded.index, columns=[col_name],
                             aggfunc=len, fill_value=0)
    if not prefix:
        return encoded
    encoded.columns = [prefix + token for token in encoded.columns]
    return encoded

def parse_version(vers):
    """Collapse a version string into a single float usable as a feature.

    The run of digit groups separated by ``-``, ``.`` or ``_`` at the start
    of ``vers`` is extracted. The first group becomes the integer part and
    the digits of all remaining groups are concatenated into the fractional
    part, e.g. ``'1.2.3'`` -> ``1.23`` and ``'10.0.1 beta'`` -> ``10.01``.

    Args:
        vers: Version text.

    Returns:
        The parsed float, or ``-1.0`` when ``vers`` does not start with a
        digit.
    """
    # `\d+` rather than a single `\d`: otherwise a multi-digit leading
    # component such as '10' is cut down to its first digit ('10.2' -> 1.0).
    match = re.match(r'(\d+(?:[\-._]\d+)*)', vers)
    if match is None:
        return -1.0
    # Split off the first component only; everything after the first
    # separator has its separators removed to form the fractional digits.
    head, *tail = re.split(r'[\-._]', match.group(1), maxsplit=1)
    fraction = re.sub(r'[\-._]', '', tail[0]) if tail else ''
    return float('{0}.{1}'.format(head, fraction))

def parse_version_supports(vers):
    """Parse a supported-version string into ``[minimum_version, is_range]``.

    Recognised forms, matched from the start of ``vers``:
        ``'<min> and up'``  -> ``[parse_version(<min> stripped of 'W'), 0]``
        ``'<min> - <max>'`` -> ``[parse_version(<min>), 1]``

    Args:
        vers: Supported-version text.

    Returns:
        A two-item list as above, or ``[-1., 0]`` when neither form matches.
    """
    open_ended = re.match(r'(.*) and up', vers)
    if open_ended is not None:
        minimum = open_ended.group(1).strip('W')
        return [parse_version(minimum), 0]

    bounded = re.match(r'(.*) - (.*)', vers)
    if bounded is None:
        return [-1., 0]
    return [parse_version(bounded.group(1)), 1]

In [0]:
# Build the feature table from the raw store data. Values that are missing or
# cannot be parsed are encoded as -1 and flagged in a companion `unknown_*`
# column.
feature_data = pd.DataFrame()
# Keep the raw version strings temporarily; astype(str) turns NaN into 'nan',
# which the parsers below treat as unparseable.
feature_data['Current Ver'] = store_data['Current Ver'].astype(str)
feature_data['Android Ver'] = store_data['Android Ver'].astype(str)

# One indicator column per category, labelled 'CAT:<category>'.
category_dummies = pd.get_dummies(store_data['Category'], prefix='CAT', prefix_sep=':')
feature_data[category_dummies.columns] = category_dummies

# Rating divided by 5; missing ratings become -1.
feature_data['norm_rating'] = (store_data['Rating'] / 5.).fillna(-1)
feature_data['unknown_rating'] = (feature_data['norm_rating'] == -1).astype(int)

feature_data['abs_size'] = store_data['Size'].map(size_parse)
feature_data['unknown_size'] = (feature_data['abs_size'] == -1).astype(int)

# Target: natural log of (install count + 1), after removing '+' and ','.
feature_data['log_installs'] = np.log(store_data['Installs'].map(
    lambda i: int(i.strip('+').replace(',', '')) + 1))

# dummy_na=True adds an extra indicator column for missing values.
price_type_dummies = pd.get_dummies(store_data['Type'], prefix='PRICE_TYPE',
                                    prefix_sep=':', dummy_na=True)
feature_data[price_type_dummies.columns] = price_type_dummies

content_rating_dummies = pd.get_dummies(store_data['Content Rating'], prefix='CONTENT_RATING',
                                        prefix_sep=':', dummy_na=True)
feature_data[content_rating_dummies.columns] = content_rating_dummies

# An app may list several ';'-separated genres, hence multi-hot encoding.
genre_dummies = multihot_encode(store_data['Genres'], 'Genres', sep=';', prefix='GENRE:')
feature_data[genre_dummies.columns] = genre_dummies

feature_data['dec_cur_version'] = feature_data['Current Ver'].map(parse_version)
feature_data['unknown_cur_version'] = (feature_data['dec_cur_version'] == -1).astype(int)
feature_data.drop('Current Ver', axis=1, inplace=True)

feature_data['android_version_range'] = feature_data['Android Ver'].map(parse_version_supports)
# The frame built from the list must carry feature_data's own index: column
# assignment aligns on index labels, and feature_data has a gap where the
# invalid row was dropped. With a default RangeIndex every row after the gap
# would receive its neighbour's values and the last row would become NaN.
feature_data[['min_android_version', 'maxed_android_version']] = pd.DataFrame(
    feature_data['android_version_range'].values.tolist(),
    index=feature_data.index)
feature_data['unknown_android_version'] = (feature_data['min_android_version'] == -1).astype(int)
feature_data.drop(['Android Ver', 'android_version_range'], axis=1, inplace=True)

In [0]:
# Columns of the feature table that are prediction targets.
LABELS = ['log_installs']

def training(feature_data, test_size=0.1, n_estimators=20, random_state=None):
    """Fit a random-forest regressor on a random split and print test errors.

    Args:
        feature_data: DataFrame holding the feature columns plus every column
            named in ``LABELS``. It is not modified.
        test_size: Fraction of rows held out for evaluation.
        n_estimators: Number of trees in the forest.
        random_state: Seed forwarded to both the split and the forest. The
            default ``None`` leaves every run randomly initialised; pass an
            int for reproducible numbers.

    Returns:
        None. Array shapes and MAE / MSE / RMSE on the held-out rows are
        printed.
    """
    from sklearn import metrics
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    features = np.array(feature_data.drop(LABELS, axis=1))
    labels = np.array(feature_data[LABELS])
    if labels.shape[1] == 1:
        # A single target should be 1-D; scikit-learn emits a
        # DataConversionWarning when fit() receives a column vector.
        labels = labels.ravel()

    train_features, test_features, train_labels, test_labels = \
        train_test_split(features, labels, test_size=test_size,
                         random_state=random_state)

    print('Training Features Shape:', train_features.shape)
    print('Training Labels Shape:', train_labels.shape)
    print('Testing Features Shape:', test_features.shape)
    print('Testing Labels Shape:', test_labels.shape)

    rf = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    rf.fit(train_features, train_labels)

    # Use the forest's predict method on the test data
    pred = rf.predict(test_features)

    mse = metrics.mean_squared_error(test_labels, pred)
    print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(test_labels, pred))
    print('Mean Squared Error (MSE):', mse)
    print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

training(feature_data.dropna())