In [None]:
import pandas as pd
import numpy as np
import json 
import seaborn as sns
import matplotlib.pyplot as plt
import os

from sklearn import linear_model
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble.partial_dependence import plot_partial_dependence

# Notebook display settings: lift the column/row truncation limits and allow
# cell text of up to 1000 characters when frames are rendered.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", 1000)

In [None]:
# Read the raw dataset from a CSV file located in the working directory
raw_csv_path = 'googleplaystore.csv'
google_play_raw = pd.read_csv(raw_csv_path)

In [None]:
# Show the dtype of every column in the raw frame
google_play_raw.dtypes

In [None]:
# Peek at three randomly chosen rows of the raw frame
google_play_raw.sample(n=3)

In [None]:
# Number of rows per value of the Category column
google_play_raw["Category"].value_counts()

In [None]:
# The 20 most frequent values of the Genres column with their row counts
google_play_raw["Genres"].value_counts().iloc[:20]

In [None]:
# Check whether any apps have a 5-star rating.
# Only existing columns are selected: .loc raises a KeyError when a listed
# label is missing, and a 'text' column is not expected in this dataset
# (every column that is not dropped later is fed to a numeric scaler).
fivestars_ratings = google_play_raw.loc[
    google_play_raw['Rating'] == 5,
    ['App', 'Rating', 'Reviews', 'Installs'],
]
fivestars_ratings

In [None]:
# How many rows carry a 5-star rating
fivestars_ratings.shape[0]

In [None]:
# Look for rows whose rating exceeds 5; any hit indicates noisy data
rating_above_max = google_play_raw["Rating"].gt(5)
google_play_raw[rating_above_max]

In [None]:
def value_to_float(x):
    """
    Convert a raw feature value to a number.

    Numeric inputs (Python or NumPy ints/floats) are returned unchanged.
    Strings are parsed with `float`; a trailing `K`/`k` or `M`/`m` suffix
    scales the parsed number by 1000 or 1000000 respectively, and a bare
    suffix (e.g. `"K"`) counts as one unit of that magnitude.

    Anything that cannot be converted (empty string, free text such as
    "Varies with device", non-string objects) yields 0.0 instead of raising.
    """
    if isinstance(x, (int, float, np.number)):
        return x
    if not isinstance(x, str):
        return 0.0

    multipliers = {'K': 10**3, 'M': 10**6}
    text = x.strip()
    factor = 1
    # Only a suffix in the last position is a magnitude marker, so words that
    # merely contain the letter (e.g. "Kids") are not mistaken for numbers
    if text and text[-1].upper() in multipliers:
        factor = multipliers[text[-1].upper()]
        # A lone suffix stands for exactly one thousand / one million
        text = text[:-1] or '1'

    # If the feature cannot be converted, return 0 instead
    try:
        return float(text) * factor
    except ValueError:
        return 0.0

In [None]:
# Clean the data
# Keep only rows whose rating is present and at most 5. The explicit copy
# makes the result an independent frame, so the column assignments below
# neither raise SettingWithCopyWarning nor risk writing through to a view
# of the raw data.
has_valid_rating = google_play_raw["Rating"].notnull() & (google_play_raw["Rating"] <= 5)
google_play_cleaned = google_play_raw.loc[has_valid_rating].copy()

# Strip the currency sign from prices; non-string entries (e.g. NaN) are
# passed through untouched and handled by the fillna step below
google_play_cleaned["Price"] = google_play_cleaned["Price"].apply(
    lambda x: x.replace('$', '') if isinstance(x, str) else x
)

# Collapse every genre outside the 30 most frequent ones into "Other"
top30_genres = google_play_cleaned["Genres"].value_counts().head(30).index
google_play_cleaned["Genres"] = google_play_cleaned["Genres"].apply(lambda x: x if x in top30_genres else "Other")

str_cols = ["Reviews", "Size", "Price"]

# Missing values become "" first, which value_to_float is then applied to
for col in str_cols:
    google_play_cleaned[col] = google_play_cleaned[col].fillna(value="").apply(value_to_float)

In [None]:
# Summary statistics of the cleaned frame
google_play_cleaned.describe()

In [None]:
# Show the apps priced above $50 to get a feel for the expensive end
price_above_50 = google_play_cleaned["Price"].gt(50)
google_play_cleaned[price_above_50]

In [None]:
# Perform Exploratory Data Analysis (EDA)
%pylab inline

fig = sns.distplot(google_play_cleaned["Rating"], bins=20)   

fig.set_xlabel('Ratings')
plt.title('Rating Distribution')

In [None]:
# The ten most frequent categories, and the rows that belong to them
top10_category = google_play_cleaned["Category"].value_counts().index[:10]
in_top10_category = google_play_cleaned["Category"].isin(top10_category)
df_top10_category = google_play_cleaned.loc[in_top10_category]

In [None]:
# Bar chart of the app count for each of the ten most frequent categories
fig = sns.countplot(data=df_top10_category, x="Category")
fig.set_xlabel('Category')
fig.set_title('Categories and the App Count')

plt.xticks(rotation=45)
plt.ylabel('App Count')

In [None]:
# Distribution of review counts, restricted to apps with fewer than 5000 reviews
few_reviews = google_play_cleaned.loc[google_play_cleaned["Reviews"] < 5000, "Reviews"]
fig = sns.distplot(few_reviews, bins=20)
fig.set_xlabel('Reviews')
fig.set_title('Review Distribution')

In [None]:
# Distribution of the Size column of the cleaned frame
size_values = google_play_cleaned["Size"]
fig = sns.distplot(size_values, bins=20)
fig.set_xlabel('Size')
fig.set_title('Size Distribution')

In [None]:
# Distribution of the Price column of the cleaned frame
price_values = google_play_cleaned["Price"]
fig = sns.distplot(price_values, bins=20)
fig.set_xlabel('Price')
fig.set_title('Price Distribution')

In [None]:
# App count per install bucket, with bars ordered as value_counts() returns them
install_order = google_play_cleaned['Installs'].value_counts().index
fig = sns.countplot(data=google_play_cleaned, x="Installs", order=install_order)
fig.set_xlabel('Installs')
fig.set_title('App Install Count')

plt.xticks(rotation=90)
plt.ylabel('App Count')

In [None]:
# App count per value of the Type column
fig = sns.countplot(data=google_play_cleaned, x="Type")
fig.set_title('App Type Count')
fig.set_xlabel('Type')
plt.ylabel('App Count')

In [None]:
# App count per value of the Content Rating column
fig = sns.countplot(data=google_play_cleaned, x="Content Rating")
fig.set_xlabel('Content Rating')
fig.set_title('Content Rating Count')
plt.xticks(rotation=45)
plt.ylabel('App Count')

In [None]:
# The ten most frequent genres, and the rows that belong to them
top10_genres = google_play_cleaned["Genres"].value_counts().index[:10]
in_top10_genres = google_play_cleaned["Genres"].isin(top10_genres)
df_top10_genres = google_play_cleaned.loc[in_top10_genres]

In [None]:
# Bar chart of the app count for each of the ten most frequent genres
fig = sns.countplot(data=df_top10_genres, x="Genres")
fig.set_xlabel('Genres')
fig.set_title('Top 10 Genres and the App Count')

plt.xticks(rotation=45)
plt.ylabel('App Count')

In [None]:
# The ten most frequent "Android Ver" values, and the rows that carry them
top10_android = google_play_cleaned["Android Ver"].value_counts().index[:10]
in_top10_android = google_play_cleaned["Android Ver"].isin(top10_android)
df_top10_android = google_play_cleaned.loc[in_top10_android]

In [None]:
# Bar chart of the app count for each of the ten most frequent Android versions
fig = sns.countplot(data=df_top10_android, x="Android Ver")
fig.set_xlabel('Android Version')
fig.set_title('Top 10 Android Version and the App Count')

plt.xticks(rotation=45)
plt.ylabel('App Count')

In [None]:
# First five rows of the cleaned frame, before the categorical columns are encoded
google_play_cleaned.head(5)

In [None]:
# One-hot encode the categorical features: each listed column is replaced by
# its indicator columns, which are joined back onto the frame by index.
categorical_feeatures = [
    "Category",
    "Installs",
    "Type",
    "Content Rating",
    "Genres",
    "Android Ver",
]

for col in categorical_feeatures:
    one_hot = pd.get_dummies(google_play_cleaned[col], prefix=col + "_")
    google_play_cleaned = google_play_cleaned.drop(columns=[col]).join(one_hot)

# These remaining columns are not used as model features
google_play_cleaned = google_play_cleaned.drop(columns=["App", "Last Updated", "Current Ver"])

In [None]:
# Rating is the regression target; every other column is a feature
target_column = google_play_cleaned["Rating"]
features = google_play_cleaned.drop(columns=["Rating"])
labels = target_column.values

In [None]:
# Hold out 10% of the rows as a test set; the seed keeps the split repeatable
split_parts = train_test_split(
    features,
    labels,
    test_size=0.1,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Rescale the features so their values fall in similar ranges.
# The scaler is fitted on the training data only and then applied to both sets.
min_max_scaler = preprocessing.MinMaxScaler().fit(X_train)
X_train_scaled = min_max_scaler.transform(X_train)
X_test_scaled = min_max_scaler.transform(X_test)

In [None]:
# Fit an ordinary linear regression on the scaled training features
regr = linear_model.LinearRegression()
regr.fit(X_train_scaled, y_train)

# Predict on both the training and the held-out data
y_pred_train = regr.predict(X_train_scaled)
y_pred_test = regr.predict(X_test_scaled)

# Compute the metrics up front, then report them
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Fitted coefficients
print('Coefficients: \n', regr.coef_)

# Mean squared error on each split
print("Mean squared error (train): %.2f" % mse_train)
print("Mean squared error (test): %.2f" % mse_test)

# R-squared on each split: 1 is perfect prediction
print('R-squared (train): %.2f' % r2_train)
print('R-squared (test): %.2f' % r2_test)

In [None]:
# Set the parameters for cross-validation
tuned_parameters = [
    {
        'max_depth': [2, 3, 4, 5],
        'learning_rate': [0.2, 0.1, 0.05, 0.01],
        'subsample': [1.0, 0.9, 0.8]
    }
]

print("# Tuning hyper-parameters")

# The booster is seeded so the search is reproducible: with subsample < 1.0
# each stage is fitted on a random subset of rows, and an unseeded estimator
# would give different scores (and possibly a different winner) on every run.
# The seed matches the one used for the train/test split.
clf = GridSearchCV(
    GradientBoostingRegressor(random_state=42),
    tuned_parameters,
    cv=5,
    scoring="neg_mean_squared_error",
)
clf.fit(X_train, y_train)

In [None]:
# Report the winning hyper-parameter combination on its own line
print("Best parameters set found on training set:", clf.best_params_, sep="\n")

In [None]:
# List the cross-validated error of every parameter combination.
# Scores are negated MSE (scoring="neg_mean_squared_error"), hence the sign flip.
cv_results = clf.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
for candidate, params in enumerate(cv_results['params']):
    print("Mean squared error = %0.3f for %r" % (-means[candidate], params))

In [None]:
# Keep a handle on the estimator selected by the grid search
best_estimator = clf.best_estimator_

In [None]:
# Predict with the tuned model on both the training and the held-out data
y_pred_train = best_estimator.predict(X_train)
y_pred_test = best_estimator.predict(X_test)

# Compute the metrics up front, then report them
mse_train = mean_squared_error(y_train, y_pred_train)
mse_test = mean_squared_error(y_test, y_pred_test)
r2_train = r2_score(y_train, y_pred_train)
r2_test = r2_score(y_test, y_pred_test)

# Mean squared error on each split
print("Mean squared error (train): %.2f" % mse_train)
print("Mean squared error (test): %.2f" % mse_test)

# R-squared on each split: 1 is perfect prediction
print('R-squared (train): %.2f' % r2_train)
print('R-squared (test): %.2f' % r2_test)

In [None]:
# Number of boosting stages configured on the selected estimator
best_n_estimators = best_estimator.n_estimators

In [None]:
# Left panel: training vs. test deviance per boosting iteration.
# Right panel: relative importance of the top features.

# Test set deviance after each boosting stage.
# Note: the loop rebinds the notebook-level name y_pred_test.
test_score = np.zeros(best_n_estimators, dtype=np.float64)
for stage, y_pred_test in enumerate(best_estimator.staged_predict(X_test)):
    test_score[stage] = best_estimator.loss_(y_test, y_pred_test)

# Iterations are numbered from 1 on the x axis
boosting_rounds = np.arange(1, best_n_estimators + 1)

plt.figure(figsize=(12, 6))
deviance_ax = plt.subplot(1, 2, 1)
plt.subplots_adjust(wspace=.5)
deviance_ax.set_title('Deviance')
deviance_ax.plot(boosting_rounds, best_estimator.train_score_, 'b-',
                 label='Training Set Deviance')
deviance_ax.plot(boosting_rounds, test_score, 'r-',
                 label='Test Set Deviance')
deviance_ax.legend(loc='upper right')
deviance_ax.set_xlabel('Boosting Iterations')
deviance_ax.set_ylabel('Deviance')

# Feature importance, expressed as a percentage of the largest importance
num_top_features = 20
feature_importance = best_estimator.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Indices of the most important features, most important first
sorted_idx = np.argsort(feature_importance)[::-1][:num_top_features]
pos = np.arange(len(sorted_idx)) + .5

importance_ax = plt.subplot(1, 2, 2)
importance_ax.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, X_train.columns[sorted_idx])
importance_ax.set_xlabel('Relative Importance')
importance_ax.set_title('Variable Importance')
plt.show()

In [None]:
# Partial dependence of the prediction on the five most important features,
# laid out in a single row of five panels
num_top_features = 5
top_feature_idx = sorted_idx[:num_top_features]
fig, ax = plot_partial_dependence(
    best_estimator,
    X_train,
    top_feature_idx,
    feature_names=X_train.columns,
    n_cols=5,
)
fig.set_figwidth(16)