In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [8]:
data = pd.read_csv('AppleStore.csv') 

In [9]:
data.head() #first five rows of the table

Unnamed: 0,id,track_name,size_bytes,currency,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,ver,cont_rating,prime_genre,sup_devices.num,ipadSc_urls.num,lang.num,vpp_lic
0,284882215,Facebook,389879808,USD,0.0,2974676,212,3.5,3.5,95.0,4+,Social Networking,37,1,29,1
1,389801252,Instagram,113954816,USD,0.0,2161558,1289,4.5,4.0,10.23,12+,Photo & Video,37,0,29,1
2,529479190,Clash of Clans,116476928,USD,0.0,2130805,579,4.5,4.5,9.24.12,9+,Games,38,5,18,1
3,420009108,Temple Run,65921024,USD,0.0,1724546,3842,4.5,4.0,1.6.2,9+,Games,40,5,1,1
4,284035177,Pandora - Music & Radio,130242560,USD,0.0,1126879,3594,4.0,4.5,8.4.1,12+,Music,37,4,1,1


In [10]:
print(data.columns)

Index(['id', 'track_name', 'size_bytes', 'currency', 'price',
       'rating_count_tot', 'rating_count_ver', 'user_rating',
       'user_rating_ver', 'ver', 'cont_rating', 'prime_genre',
       'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'],
      dtype='object')


In [11]:
columns_to_drop = ['id', 'currency', 'rating_count_tot','rating_count_ver','user_rating_ver','ver','cont_rating', 'sup_devices.num','ipadSc_urls.num','vpp_lic']
data.drop(columns=columns_to_drop, inplace=True)
data.head()

Unnamed: 0,track_name,size_bytes,price,user_rating,prime_genre,lang.num
0,Facebook,389879808,0.0,3.5,Social Networking,29
1,Instagram,113954816,0.0,4.5,Photo & Video,29
2,Clash of Clans,116476928,0.0,4.5,Games,18
3,Temple Run,65921024,0.0,4.5,Games,1
4,Pandora - Music & Radio,130242560,0.0,4.0,Music,1


In [12]:
# Rename columns in place >>>>>This code will already be exectued
data.rename(columns={'price':'price_dollars'}, inplace=True)
data.head()

Unnamed: 0,track_name,size_bytes,price_dollars,user_rating,prime_genre,lang.num
0,Facebook,389879808,0.0,3.5,Social Networking,29
1,Instagram,113954816,0.0,4.5,Photo & Video,29
2,Clash of Clans,116476928,0.0,4.5,Games,18
3,Temple Run,65921024,0.0,4.5,Games,1
4,Pandora - Music & Radio,130242560,0.0,4.0,Music,1


In [13]:
available_categories = data['prime_genre'].unique()
print(available_categories)

['Social Networking' 'Photo & Video' 'Games' 'Music' 'Reference'
 'Health & Fitness' 'Weather' 'Utilities' 'Travel' 'Shopping' 'News'
 'Navigation' 'Lifestyle' 'Entertainment' 'Food & Drink' 'Sports' 'Book'
 'Finance' 'Education' 'Productivity' 'Business' 'Catalogs' 'Medical']


In [14]:
# encode categorical variables
encoder = LabelEncoder()
data['prime_genre'] = encoder.fit_transform(data['prime_genre'])
print(data['prime_genre'].unique())

[18 14  7 11 16  8 22 21 20 17 13 12  9  4  6 19  0  5  3 15  1  2 10]


In [15]:
# select the relevant features and target variable
X = data[['size_bytes','price_dollars','prime_genre','lang.num']]
y = data['user_rating']

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np

# select the relevant features and target variable
X = data[['size_bytes', 'price_dollars','prime_genre', 'lang.num']]
y = data['user_rating']

# split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# train the model on the training set
model = KNeighborsRegressor(n_neighbors=100)
model.fit(X_train, y_train)

# predict the target variable for the test set
y_pred = model.predict(X_test)

# calculate r2-score, mean absolute error, and root mean squared error
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"The r2-score of the model is {r2:.2f}")
print(f"The mean absolute error of the model is {mae:.2f}")
print(f"The root mean squared error of the model is {rmse:.2f}")

The r2-score of the model is 0.05
The mean absolute error of the model is 1.08
The root mean squared error of the model is 1.44


In [18]:
import pandas as pd
import warnings

warnings.filterwarnings("ignore") # to avoid the printing of a warning which does not affect the code

# define the bins and labels
bins = [0, 1, 2, 3, 4, 5] # the bins are based on the rating scale from 1 to 5
labels = ['Low', 'Medium-Low', 'Medium', 'Medium-High', 'High']

# get user inputs
print("Available app genres are : ")
for i in available_categories:
    print(i)

while True:
  user_category = input('Enter the genre of the app : ')
  if user_category in available_categories:
     break
  else :
      print(f"{user_category} is not a valid genre. Please enter a valid genre.")

while True:
  try:
    price = float(input('Enter the price of the app in dollars : '))
    if price<0:
      raise ValueError
    break
  except ValueError:
    print("Price should be a positive number.Please enter a valid price")

while True:
  try:
    size = float(input('Enter the size of the app in bytes : '))
    if size<0:
      raise ValueError
    break
  except ValueError:
        print("Size should be a positive number.Please enter a valid size")

while True:
  try:
    lang_num = float(input('Enter the number of supported languages : '))
    if lang_num<0:
      raise ValueError
    break
  except ValueError:
        print("Language number should be a positive number.Please enter a valid number")

# encode user input category
user_category = encoder.transform([user_category])[0]

# predict the user rating
rating = model.predict([[size,price,user_category,lang_num]])[0]

# categorize the success rate
success_category = pd.cut([rating], bins=bins, labels=labels)[0]

# print the result
print(f"The predicted rating of the app will be around {rating}, which falls under the {success_category} success category.")

Available app genres are : 
Social Networking
Photo & Video
Games
Music
Reference
Health & Fitness
Weather
Utilities
Travel
Shopping
News
Navigation
Lifestyle
Entertainment
Food & Drink
Sports
Book
Finance
Education
Productivity
Business
Catalogs
Medical
game is not a valid genre. Please enter a valid genre.
The predicted rating of the app will be around 3.76, which falls under the Medium-High success category.
