In [12]:
import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from datetime import datetime as dt
# Fixed reference date (1 Jan 2021) so that date-difference features do not
# depend on the day the notebook happens to be executed.
today_locked = dt.strptime('January 1, 2021', '%B %d, %Y').date()

# Objective

 - Explore the dataset

 - Build a predictive model for 'Rating'
 - Build a UI to predict 'Rating' for a prospective app

 - Build a function to search for apps
 - Build a UI for the function


In [13]:
# Load the two raw CSV exports: per-app metadata and user reviews.
revws = pd.read_csv('Data/googleplaystore_user_reviews.csv')
apps = pd.read_csv('Data/googleplaystore.csv')

# Discard rows whose Category is the literal string '1.9'.
# NOTE(review): presumably a malformed record in the raw file - confirm against the CSV.
apps = apps.loc[apps['Category'] != '1.9']

# 'Current Ver' is dropped rather than cleaned.
apps = apps.drop(columns=['Current Ver'])

# Keep only apps that have a Rating (the prediction target) and renumber the rows.
apps = apps.loc[apps['Rating'].notna()].reset_index(drop=True)

In [14]:
# EASY FIXES

def dumfun(df, oldname, prefix):
    """Replace a categorical column with indicator (dummy) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column to encode.
    oldname : str
        Name of the categorical column; it is removed from the result.
    prefix : str
        Text prepended (followed by '_') to every indicator column name.
        Spaces inside the category labels are removed.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` without ``oldname``, followed by the indicator
        columns. The last indicator column is left out, so that category
        is represented by all remaining indicators being zero.
    """
    indicators = pd.get_dummies(df[oldname])
    indicators.columns = [
        prefix + '_' + label.replace(' ', '') for label in indicators.columns
    ]
    # Drop the final indicator to avoid a fully redundant column.
    kept_indicators = indicators.iloc[:, :-1]
    remaining = df.drop([oldname], axis=1)
    return pd.concat([remaining, kept_indicators], axis=1)

def _parse_review_count(raw):
    """Cast a review count to int after trimming surrounding whitespace."""
    return int(str(raw).strip())


def _parse_install_count(raw):
    """Cast an install bucket such as '1,000+' to int by removing '+' and ','."""
    return int(str(raw).replace('+', '').replace(',', ''))


def _parse_price(raw):
    """Cast a price string such as '$4.99' to float by removing the '$' sign."""
    return float(raw.replace('$', ''))


apps['Reviews'] = apps['Reviews'].map(_parse_review_count)
apps['Installs'] = apps['Installs'].map(_parse_install_count)

# Binary flag replaces the textual 'Type' column: 1 for 'Free', 0 for anything else.
apps['IsFree'] = [1 if kind == 'Free' else 0 for kind in apps['Type']]
apps = apps.drop(columns=['Type'])

apps['Price'] = apps['Price'].map(_parse_price)

# One-hot encode the remaining categoricals (column name, dummy prefix).
for source_col, dummy_prefix in (('Content Rating', 'Rating'), ('Category', 'Category')):
    apps = dumfun(apps, source_col, dummy_prefix)

In [15]:
# FIXING SIZE

# Flag apps whose size is reported as 'Varies with device'.
apps['Size_varies'] = apps['Size'].map(lambda label: 1 if label == 'Varies with device' else 0)
# Flagged rows get the placeholder -1 (to be imputed later); others keep their size text.
apps['Size'] = [
    -1 if varies == 1 else size_label
    for varies, size_label in zip(apps['Size_varies'], apps['Size'])
]

def size_adjust(x):
    """Convert a size label to a number of kilobytes.

    Parameters
    ----------
    x : str or int
        Either the placeholder ``-1`` (size unknown, passed through as is)
        or a string ending in 'M' (megabytes) or 'k' (kilobytes),
        e.g. ``'19M'`` or ``'201k'``.

    Returns
    -------
    float or int
        ``-1`` for the placeholder; otherwise the size in kilobytes, where
        megabyte values are multiplied by 1000.

    Raises
    ------
    ValueError
        If the label ends in neither 'M' nor 'k' (including the empty
        string), or if the numeric part cannot be parsed as a float.
    """
    # Placeholder for "varies with device" rows: leave untouched.
    if x == -1:
        return x
    if x.endswith('M'):
        return float(x.replace('M', '')) * 1000
    if x.endswith('k'):
        return float(x.replace('k', ''))
    # Fail loudly with the offending value instead of hitting an unbound local.
    raise ValueError('Unrecognised size label: {!r}'.format(x))

# Convert every size label to kilobytes in place, then rename the column to
# make the unit explicit.
apps = (
    apps
    .assign(Size=apps['Size'].map(size_adjust))
    .rename(columns={'Size': 'Size_kb'})
)

In [16]:
# FIXING GENRES

# 'Genres' holds one or more labels per app separated by ';'.
# Keep the per-app label lists available under the original name.
listed_lists = [entry.split(';') for entry in apps['Genres']]

# One 0/1 indicator column per distinct label, in sorted label order.
# str.get_dummies yields integer columns that share apps' index, so the
# concat below aligns row-for-row without relying on a fresh RangeIndex
# and without the cell-by-cell .loc assignment on an object-dtype frame.
genres_df = apps['Genres'].str.get_dummies(sep=';')

# Sorted list of the distinct genre labels (before the column renaming).
genres = list(genres_df.columns)

genres_df.columns = ['genre_' + i.replace(' ', '_') for i in genres_df.columns]
apps = pd.concat([apps.drop(['Genres'], axis = 1), genres_df], axis = 1)

In [17]:
#FIXING VERSIONS

def _ver_is_device_dependent(row):
    """1 when 'Android Ver' is 'Varies with device' or missing, else 0."""
    ver = row['Android Ver']
    return 1 if (ver == 'Varies with device' or pd.isnull(ver)) else 0


def _ver_has_upper_limit(row):
    """0 for open-ended ('and up') or device-dependent versions, else 1."""
    if 'and up' in str(row['Android Ver']):
        return 0
    if row['Ver_DeviceVar'] == 1:
        return 0
    return 1


def _ver_upper_text(row):
    """Text after the last ' - ' when an upper limit exists, else 0."""
    if row['Ver_UpperLimit'] != 1:
        return 0
    return row['Android Ver'].split(' - ')[-1]


def _ver_major_number(ver_text):
    """Integer before the first '.' of the version text; 0 is passed through."""
    if ver_text == 0:
        return 0
    return int(ver_text[:ver_text.index('.')])


# Work on the distinct version strings only, then join the derived columns back.
version_mapping = pd.DataFrame({'Android Ver' : apps['Android Ver'].unique()})
version_mapping['Ver_DeviceVar'] = version_mapping.apply(_ver_is_device_dependent, axis = 1)
version_mapping['Ver_UpperLimit'] = version_mapping.apply(_ver_has_upper_limit, axis = 1)
version_mapping['Ver_MaxVer'] = version_mapping.apply(_ver_upper_text, axis = 1)
version_mapping['Ver_MaxVer'] = version_mapping['Ver_MaxVer'].apply(_ver_major_number)

apps = apps.merge(version_mapping, on = 'Android Ver', how = 'left')
apps = apps.drop(columns = ['Android Ver'])

In [18]:
# FIXING UPDATE DATE

def _days_since_update(date_text):
    """Whole days between a 'Month D, YYYY' date string and today_locked."""
    updated_on = dt.strptime(date_text, '%B %d, %Y').date()
    return int((today_locked - updated_on).days)


# Replace the textual update date with its age in days.
apps['Days_since_update'] = apps['Last Updated'].apply(_days_since_update)
apps = apps.drop(columns = ['Last Updated'])

In [19]:
# FINISHING TOUCHES

# Sequential identifier starting at 100000, one per row.
apps['ID'] = list(range(100000, 100000 + len(apps)))

# Put 'ID' first and keep every other column in its existing order.
listed_cols = ['ID'] + [col for col in apps.columns if col != 'ID']
raw_df = apps[listed_cols]

# Persist the cleaned table without the positional index.
raw_df.to_csv('./Data/app_data_clean.csv', index = False)

In [20]:
# Features: everything except the identifier, the app name and the target.
X = raw_df.drop(columns = ['ID', 'App', 'Rating'])
# Target: the rating as a plain NumPy array.
y = raw_df['Rating'].to_numpy()

In [10]:
from sklearn.model_selection import train_test_split as tts

# Fixed seed so the train/test partition is the same on every run; without
# it each execution shuffles differently and results are not reproducible.
X_train, X_test, y_train, y_test = tts(X, y, random_state = 42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7024, 100)
(2342, 100)
(7024,)
(2342,)
