# Google Play Store App Installs Prediction
Dataset: [https://www.kaggle.com/datasets/lava18/google-play-store-apps]

### Objective
Predict App installs based on rating, category, genres, etc...

### Dataset
The dataset includes data of 10841 app on google play and 13 columns describing the data

In [1]:
import numpy as np
import random
import math
import matplotlib.pyplot as plt
import pandas as pd
from dateutil.relativedelta import relativedelta
from datetime import datetime

In [2]:
df = pd.read_csv('googleplaystore.csv')
df = df.rename(columns={'Size': 'App Size'})
df

Unnamed: 0,App,Category,Rating,Reviews,App Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,"July 25, 2017",1.48,4.1 and up
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,"July 6, 2018",1.0,4.1 and up
10838,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,"January 20, 2017",1.0,2.2 and up
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,"January 19, 2015",Varies with device,Varies with device


## 1. Data Preprocessing

### Drop meaningless columns

In [3]:
df.drop(['Type','Current Ver','Android Ver'], axis=1, inplace=True)

### Parse installs number to int

In [4]:
df['Installs'] = df['Installs'].apply(lambda x: x.replace(',', '').replace('+', '') )

### Parse Size to int

In [5]:
df['App Size'] = df['App Size'].apply(lambda x: x.replace('M', ''))

### Parse Price to int

In [6]:
df['Price'] = df['Price'].apply(lambda x: x.replace('$', ''))

### Split 'Content'

In [7]:
df = pd.get_dummies(df, columns = ['Content Rating']) # one hot encoding
df.head()

Unnamed: 0,App,Category,Rating,Reviews,App Size,Installs,Price,Genres,Last Updated,Content Rating_Adults only 18+,Content Rating_Everyone,Content Rating_Everyone 10+,Content Rating_Mature 17+,Content Rating_Teen,Content Rating_Unrated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,0,Art & Design,"January 7, 2018",0,1,0,0,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,0,Art & Design;Pretend Play,"January 15, 2018",0,1,0,0,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,0,Art & Design,"August 1, 2018",0,1,0,0,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,0,Art & Design,"June 8, 2018",0,0,0,0,1,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,0,Art & Design;Creativity,"June 20, 2018",0,1,0,0,0,0


### Split 'Genres'

In [8]:
# Split the 'Genres' column into multiple columns with a prefix
genres_df = df['Genres'].str.get_dummies(';')

# Concatenate the resulting DataFrame with the original DataFrame
df = pd.concat([df, genres_df], axis=1)

# Display the resulting DataFrame
df

Unnamed: 0,App,Category,Rating,Reviews,App Size,Installs,Price,Genres,Last Updated,Content Rating_Adults only 18+,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19,10000,0,Art & Design,"January 7, 2018",0,...,0,0,0,0,0,0,0,0,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14,500000,0,Art & Design;Pretend Play,"January 15, 2018",0,...,0,0,0,0,0,0,0,0,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,0,Art & Design,"August 1, 2018",0,...,0,0,0,0,0,0,0,0,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25,50000000,0,Art & Design,"June 8, 2018",0,...,0,0,0,0,0,0,0,0,0,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,0,Art & Design;Creativity,"June 20, 2018",0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53,5000,0,Education,"July 25, 2017",0,...,0,0,0,0,0,0,0,0,0,0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,0,Education,"July 6, 2018",0,...,0,0,0,0,0,0,0,0,0,0
10838,Parkinson Exercices FR,MEDICAL,,3,9.5,1000,0,Medical,"January 20, 2017",0,...,0,0,0,0,0,0,0,0,0,0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,1000,0,Books & Reference,"January 19, 2015",0,...,0,0,0,0,0,0,0,0,0,0


### Change 'Last Updated' from date type to int

In [9]:
# Convert Last Updated date to monts passed since last update
def months_diff(date_str):
    # Parse the input string and convert it to a datetime object
    try:
        date = datetime.strptime(date_str, '%B %d, %Y')
    except:
        return ''

    # Calculate the difference between the input date and January 1, 2019
    diff = relativedelta(datetime(2019, 1, 1), date)

    # Return the number of months
    return diff.years * 12 + diff.months


df['Last Updated'] = df['Last Updated'].apply(lambda x: months_diff(x) )
    
df.head()

Unnamed: 0,App,Category,Rating,Reviews,App Size,Installs,Price,Genres,Last Updated,Content Rating_Adults only 18+,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19.0,10000,0,Art & Design,11,0,...,0,0,0,0,0,0,0,0,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14.0,500000,0,Art & Design;Pretend Play,11,0,...,0,0,0,0,0,0,0,0,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,0,Art & Design,5,0,...,0,0,0,0,0,0,0,0,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25.0,50000000,0,Art & Design,6,0,...,0,0,0,0,0,0,0,0,0,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,0,Art & Design;Creativity,6,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Data Cleaning

### Check for missing values and replace them with median

In [10]:
df.isna().sum()

App                           0
Category                      0
Rating                     1474
Reviews                       0
App Size                      0
                           ... 
Travel & Local                0
Trivia                        0
Video Players & Editors       0
Weather                       0
Word                          0
Length: 69, dtype: int64

In [11]:
df["Rating"]=df["Rating"].fillna(df["Rating"].median())
df.isna().sum()

App                        0
Category                   0
Rating                     0
Reviews                    0
App Size                   0
                          ..
Travel & Local             0
Trivia                     0
Video Players & Editors    0
Weather                    0
Word                       0
Length: 69, dtype: int64

### Replace null data and unwanted data

In [12]:
#Checking for missing, or null, data
df.isnull().sum().sort_values(ascending = False)

App                  0
Maps & Navigation    0
Parenting            0
News & Magazines     0
Music & Video        0
                    ..
Comics               0
Communication        0
Creativity           0
Dating               0
Word                 0
Length: 69, dtype: int64

### Find indexes where app size is in KB

In [13]:
k_indexes = []
for i in range (len(df)):
    if 'k' in df['App Size'][i]:
        k_indexes.append(i)
print(len(k_indexes))

316


### Checking for incorrect data types in numeric columns

In [14]:
#Function to return true if string can be parsed to float
def is_float(string):
    try:
        float(string)
        return True
    except ValueError:
        return False

#Function to return index of values that are not numerical and their count
def find_incorrect_types(p):
    x = []
    for i in range (len(df)):
        if not(p[i].isdigit() or is_float(p[i])):
            x.append(i)
    return x
print(find_incorrect_types(df['Price'].values.astype(str).tolist()))
print("----------------")
print(find_incorrect_types(df['Reviews'].values.astype(str).tolist()))
print("----------------")
print(find_incorrect_types(df['Installs'].values.astype(str).tolist()))

[10472]
----------------
[10472]
----------------
[10472]


In [15]:
#Dropping row with noisy data in all columns
df = df.drop(10472)
df

Unnamed: 0,App,Category,Rating,Reviews,App Size,Installs,Price,Genres,Last Updated,Content Rating_Adults only 18+,...,Simulation,Social,Sports,Strategy,Tools,Travel & Local,Trivia,Video Players & Editors,Weather,Word
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19,10000,0,Art & Design,11,0,...,0,0,0,0,0,0,0,0,0,0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14,500000,0,Art & Design;Pretend Play,11,0,...,0,0,0,0,0,0,0,0,0,0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7,5000000,0,Art & Design,5,0,...,0,0,0,0,0,0,0,0,0,0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25,50000000,0,Art & Design,6,0,...,0,0,0,0,0,0,0,0,0,0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8,100000,0,Art & Design;Creativity,6,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10836,Sya9a Maroc - FR,FAMILY,4.5,38,53,5000,0,Education,17,0,...,0,0,0,0,0,0,0,0,0,0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6,100,0,Education,5,0,...,0,0,0,0,0,0,0,0,0,0
10838,Parkinson Exercices FR,MEDICAL,4.3,3,9.5,1000,0,Medical,23,0,...,0,0,0,0,0,0,0,0,0,0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,1000,0,Books & Reference,47,0,...,0,0,0,0,0,0,0,0,0,0


### Checking for negative values in numerical attributes

In [16]:
#Parsing all numerical attribute values to int or float
df['Price'] = df['Price'].astype(str).astype(float)
df['Installs'] = df['Installs'].astype(str).astype(int)
df['Reviews'] = df['Reviews'].astype(str).astype(int)
df['Rating'] = df['Rating'].astype(str).astype(float)

#Function which returns indexes of negative numbers 
def find_negative (values):
    noise = []
    for i in range (len(df)):
        if values[i] < 0:
            noise.append(i)
    return noise

#Checking if numerical columns have any negative values
print(find_negative(df['Price'].values.tolist()))
print(find_negative(df['Reviews'].values.tolist()))
print(find_negative(df['Installs'].values.tolist()))
print(find_negative(df['Rating'].values.tolist()))

[]
[]
[]
[]


In [17]:
size_indexes = df.index[df['App Size'] == 'Varies with device'].tolist()
print(len(lis))
#App size column contains 1695 records where app size is not specified and 316 records where app size is in KB

NameError: name 'lis' is not defined

### Replacing missing app sizes with median value

In [None]:
#First replacing missing sizes and sizes in KB with 0 to be able to parse column values to int
df['App Size'] = df['App Size'].astype(str)
for i in size_indexes:
    df.loc[i,'App Size'] = '0'

for i in k_indexes:
    df.loc[i,'App Size'] = '0'

In [None]:
#Parsing app size values to float
df['App Size'] = df['App Size'].astype(float)

#Converting values in KB to MB
for i in k_indexes:
    df.loc[i, 'App Size'] = df['App Size'][i] / 1000
    
#Replacing missing values with median app size
for i in size_indexes:
    df.loc[i, 'App Size'] = df['App Size'].median()