In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
data = pd.read_csv('googleplaystore.csv') 

In [3]:
data.head() #first five rows of the table

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [4]:
columns_to_drop = ['Type', 'Content Rating', 'Genres','Last Updated','Current Ver','Android Ver', 'Rating','Reviews']
data.drop(columns=columns_to_drop, inplace=True)
data.head()

Unnamed: 0,App,Category,Size,Installs,Price
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,"10,000+",0
1,Coloring book moana,ART_AND_DESIGN,14M,"500,000+",0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,8.7M,"5,000,000+",0
3,Sketch - Draw & Paint,ART_AND_DESIGN,25M,"50,000,000+",0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,2.8M,"100,000+",0


In [5]:
# Rename columns in place >>>>>This code will already be exectued
data.rename(columns={'Size':'Size Bytes','Price':'Price Dollars'}, inplace=True)
data.head()

Unnamed: 0,App,Category,Size Bytes,Installs,Price Dollars
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19M,"10,000+",0
1,Coloring book moana,ART_AND_DESIGN,14M,"500,000+",0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,8.7M,"5,000,000+",0
3,Sketch - Draw & Paint,ART_AND_DESIGN,25M,"50,000,000+",0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,2.8M,"100,000+",0


In [6]:
data['Price Dollars'] = data['Price Dollars'].astype(str)
data['Price Dollars'] = data['Price Dollars'].str.replace('$', '')
data['Price Dollars'] = data['Price Dollars'].replace('Everyone',np.nan)
print(data['Price Dollars'].dtype)
print(data['Price Dollars'])
data.tail()

object
0        0
1        0
2        0
3        0
4        0
        ..
10836    0
10837    0
10838    0
10839    0
10840    0
Name: Price Dollars, Length: 10841, dtype: object


  data['Price Dollars'] = data['Price Dollars'].str.replace('$', '')


Unnamed: 0,App,Category,Size Bytes,Installs,Price Dollars
10836,Sya9a Maroc - FR,FAMILY,53M,"5,000+",0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,3.6M,100+,0
10838,Parkinson Exercices FR,MEDICAL,9.5M,"1,000+",0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,Varies with device,"1,000+",0
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,19M,"10,000,000+",0


In [7]:
data['Price Dollars'] = data['Price Dollars'].astype(float)

In [8]:

# Convert column to string
data['Installs'] = data['Installs'].astype(str)

# Remove "+" and "," characters from "Installs" column
data['Installs'] = data['Installs'].str.replace('+', '').str.replace(',', '')


# Replace "Free" values with NaN
data['Installs'] = data['Installs'].replace('Free', np.nan)



# Convert column to float
data['Installs'] = data['Installs'].astype(float)

# Drop rows with NaN aka not a number values in "Installs" and "Price Dollars" columns
data.dropna(subset=['Installs'], inplace=True)
data.dropna(subset=['Price Dollars'], inplace=True)


# Check data type of column
print(data['Installs'].dtype)
print(data['Installs'])
data.tail()

float64
0           10000.0
1          500000.0
2         5000000.0
3        50000000.0
4          100000.0
            ...    
10836        5000.0
10837         100.0
10838        1000.0
10839        1000.0
10840    10000000.0
Name: Installs, Length: 10840, dtype: float64


  data['Installs'] = data['Installs'].str.replace('+', '').str.replace(',', '')


Unnamed: 0,App,Category,Size Bytes,Installs,Price Dollars
10836,Sya9a Maroc - FR,FAMILY,53M,5000.0,0.0
10837,Fr. Mike Schmitz Audio Teachings,FAMILY,3.6M,100.0,0.0
10838,Parkinson Exercices FR,MEDICAL,9.5M,1000.0,0.0
10839,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,Varies with device,1000.0,0.0
10840,iHoroscope - 2018 Daily Horoscope & Astrology,LIFESTYLE,19M,10000000.0,0.0


In [9]:
import re

# Define a function to convert size to bytes
def convert_size(size_str):
    if size_str == 'Varies with device':
        return np.nan

    size, unit = re.match(r"^([\d\.]+)([a-zA-Z]*)$", size_str).groups()
    size = float(size)

    if unit == 'K':
        return size * 1024
    elif unit == 'M':
        return size * 1024 * 1024
    elif unit == 'G':
        return size * 1024 * 1024 * 1024
    else:
        return size

# Apply the conversion function to the 'Size Bytes' column
data['Size Bytes'] = data['Size Bytes'].apply(convert_size)

# Drop rows with missing values in the 'Size Bytes' column
data = data.dropna(subset=['Size Bytes'])

data.head()

Unnamed: 0,App,Category,Size Bytes,Installs,Price Dollars
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,19922944.0,10000.0,0.0
1,Coloring book moana,ART_AND_DESIGN,14680064.0,500000.0,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,9122611.2,5000000.0,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,26214400.0,50000000.0,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,2936012.8,100000.0,0.0


In [10]:
available_categories = data['Category'].unique()
print(available_categories)

['ART_AND_DESIGN' 'AUTO_AND_VEHICLES' 'BEAUTY' 'BOOKS_AND_REFERENCE'
 'BUSINESS' 'COMICS' 'COMMUNICATION' 'DATING' 'EDUCATION' 'ENTERTAINMENT'
 'EVENTS' 'FINANCE' 'FOOD_AND_DRINK' 'HEALTH_AND_FITNESS' 'HOUSE_AND_HOME'
 'LIBRARIES_AND_DEMO' 'LIFESTYLE' 'GAME' 'FAMILY' 'MEDICAL' 'SOCIAL'
 'SHOPPING' 'PHOTOGRAPHY' 'SPORTS' 'TRAVEL_AND_LOCAL' 'TOOLS'
 'PERSONALIZATION' 'PRODUCTIVITY' 'PARENTING' 'WEATHER' 'VIDEO_PLAYERS'
 'NEWS_AND_MAGAZINES' 'MAPS_AND_NAVIGATION']


In [11]:
# encode categorical variables
encoder = LabelEncoder()
data['Category'] = encoder.fit_transform(data['Category'])

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# select the relevant features and target variable
X = data[['Category','Size Bytes', 'Price Dollars']]
y = data['Installs']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the model on the training set
model = KNeighborsRegressor(n_neighbors=100)
model.fit(X_train, y_train)

# predict the target variable for the test set
y_pred = model.predict(X_test)

# calculate r2-score, mean absolute error, and root mean squared error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"The r2-score of the model is {r2:.2f}")
print(f"The mean absolute error of the model is {mae:.2f}")
print(f"The root mean squared error of the model is {rmse:.2f}")

The r2-score of the model is 0.10
The mean absolute error of the model is 10739343.51
The root mean squared error of the model is 47785063.34


In [13]:
from sklearn.neighbors import KNeighborsRegressor
import warnings

warnings.filterwarnings("ignore") # to avoid the printing of a warning which does not affect the code

# train the model
model = KNeighborsRegressor(n_neighbors=10) # n_neoghbor value is 10 
model.fit(X, y)

# define the bins and labels
bins = [0, 1000, 10000, 100000, 1000000, np.inf] #infinity as a upper bound 
labels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']


# get user inputs
print("Available catggories are : ")
for i in available_categories:
    print(i)

while True:
  category = input('Enter the category of the app : ').upper().replace(" ","_")
  if category in available_categories:
     break
  else :
      print(f"{category} is not a valid category. Please enter a valid category.")

while True:
  try:
    price = float(input('Enter the price of the app in dollars : '))
    if price<0:
      raise ValueError
    break
  except ValueError:
    print("Price should be a positive number.Please enter a valid price")

while True:
  try:
    size = float(input('Enter the size of the app in bytes : '))
    if size<0:
      raise ValueError
    break
  except ValueError:
        print("Size should be a positive number.Please enter a valid size")


# encode user input category
category = encoder.transform([category])[0]

# predict the installs
installs = model.predict([[category, price, size]])[0]

# categorize the success rate
success_category = pd.cut([installs], bins=bins, labels=labels)[0]

# print the result
print(f"The predicted intalls are around {int(installs)}, which falls under the {success_category} success category.")

Available catggories are : 
ART_AND_DESIGN
AUTO_AND_VEHICLES
BEAUTY
BOOKS_AND_REFERENCE
BUSINESS
COMICS
COMMUNICATION
DATING
EDUCATION
ENTERTAINMENT
EVENTS
FINANCE
FOOD_AND_DRINK
HEALTH_AND_FITNESS
HOUSE_AND_HOME
LIBRARIES_AND_DEMO
LIFESTYLE
GAME
FAMILY
MEDICAL
SOCIAL
SHOPPING
PHOTOGRAPHY
SPORTS
TRAVEL_AND_LOCAL
TOOLS
PERSONALIZATION
PRODUCTIVITY
PARENTING
WEATHER
VIDEO_PLAYERS
NEWS_AND_MAGAZINES
MAPS_AND_NAVIGATION
The predicted intalls are around 101511, which falls under the Medium-High success category.
