In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.offline import init_notebook_mode, iplot
# Initialise plotly's offline notebook rendering before any figure is shown.
# NOTE(review): `iplot` is imported but not used in the cells visible here.
init_notebook_mode(connected = True)

from datetime import datetime, date

import warnings
# Suppresses every warning category for the rest of the session.
# NOTE(review): this also hides deprecation / FutureWarning messages from
# pandas and numpy (e.g. runtime warnings from np.log) — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Load the app catalogue CSV and display it.
apps_csv_path = '../input/google-play-store-apps/googleplaystore.csv'
playstore = pd.read_csv(apps_csv_path)
playstore

In [None]:
# Load the user-review CSV (used later for the sentiment analysis) and display it.
reviews_csv_path = '../input/google-play-store-apps/googleplaystore_user_reviews.csv'
playstore_user = pd.read_csv(reviews_csv_path)
playstore_user

### Data Cleaning

In [None]:
# Column dtypes and non-null counts of the raw frame, before any cleaning.
playstore.info()

In [None]:
# Visualise where the missing values sit (one heatmap row per app, one column per field).
missing_mask = playstore.isna()
sns.heatmap(missing_mask);

In [None]:
# Number of missing values per column before dropping rows.
playstore.isna().sum()

In [None]:
# Keep only the rows that have a value in every column.
playstore = playstore.dropna(axis = 0)

In [None]:
# Confirm that no missing values remain after the row drop.
playstore.isna().sum()

In [None]:
# Re-inspect dtypes and row count after dropping incomplete rows.
playstore.info()

#### Category

In [None]:
# Summary (count / unique / top / freq) of the Category column.
playstore['Category'].describe()

In [None]:
# Count and list the distinct Category values.
category_values = playstore['Category'].unique()
print(len(category_values), 'categories')

print("\n", category_values)

#### Reviews

In [None]:
# Summary of Reviews before it is converted to integers.
playstore['Reviews'].describe()

In [None]:
# Look at the raw Reviews values.
playstore['Reviews']

In [None]:
# Cast the review counts to 64-bit integers in a single vectorised step.
playstore['Reviews'] = playstore['Reviews'].astype('int64')

In [None]:
# Numeric summary of Reviews after the integer conversion.
playstore['Reviews'].describe()

#### Size

In [None]:
# Summary of the raw Size column before cleaning.
playstore['Size'].describe()

In [None]:
# Look at the raw Size values.
playstore['Size']

In [None]:
# Count and list the distinct raw Size values.
size_values = playstore['Size'].unique()
print(len(size_values), "categories")

print("\n", size_values)

In [None]:
# Most frequent Size values.
playstore['Size'].value_counts().head()

In [None]:
# How many apps report their size as 'Varies with device'.
int((playstore['Size'] == 'Varies with device').sum())

In [None]:
# Replace the 'Varies with device' placeholder by NaN.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: that chained in-place form does not update the frame
# when pandas Copy-on-Write is active.
playstore['Size'] = playstore['Size'].replace('Varies with device', np.nan)

In [None]:
# Strip the 'k' / 'M' unit suffix and express every size in megabytes.
# Merely deleting the suffixes would leave kilobyte and megabyte values on the
# same scale (e.g. '201k' -> 201 next to '19M' -> 19), so 'k' values are divided
# by 1024 here.
# NOTE(review): 1024 kB per MB is assumed; use 1000 if decimal units are intended.
size_text = playstore['Size'].astype(str)       # NaN becomes 'nan', parsed back to NaN below
in_kilobytes = size_text.str.endswith('k')
size_number = pd.to_numeric(size_text.str.rstrip('kM'))
playstore['Size'] = size_number.where(~in_kilobytes, size_number / 1024)

In [None]:
# Count and list the distinct Size values after removing the unit suffixes.
size_values = playstore['Size'].unique()
print(len(size_values), "categories")

print("\n", size_values)

In [None]:
# Make sure Size is stored with a numeric dtype.
playstore['Size'] = pd.to_numeric(playstore['Size'])

In [None]:
# Fill the sizes that were 'Varies with device' (now NaN) with the mean size of
# the app's category.
# The filled column is assigned back rather than using fillna(..., inplace=True)
# on the selected column, which does not update the frame under pandas
# Copy-on-Write.
category_mean_size = playstore.groupby('Category')['Size'].transform('mean')
playstore['Size'] = playstore['Size'].fillna(category_mean_size)

In [None]:
# Numeric summary of Size after cleaning and imputation.
playstore.Size.describe()

#### Installs

In [None]:
# Count and list the distinct raw Installs values.
install_values = playstore['Installs'].unique()
print(len(install_values), "categories")

print("\n", install_values)

In [None]:
# Frequency of each raw Installs value.
playstore.Installs.value_counts()

In [None]:
# Remove the thousands separators and the trailing '+' (literal matches, not
# regular expressions), then cast the install counts to 64-bit integers.
installs_digits = (
    playstore['Installs']
    .str.replace(',', '', regex = False)
    .str.replace('+', '', regex = False)
)
playstore['Installs'] = installs_digits.astype('int64')

In [None]:
# Distinct install counts after the integer conversion.
playstore.Installs.unique()

#### Type

In [None]:
# Number of apps per Type value.
playstore.Type.value_counts()

#### Price

In [None]:
# Frequency of each raw Price value.
playstore.Price.value_counts()

In [None]:
# Count and list the distinct raw Price values.
price_values = playstore['Price'].unique()
print(len(price_values), "categories")

print("\n", price_values)

In [None]:
# Drop the literal '$' sign and store the prices as floats.
price_text = playstore['Price'].str.replace('$', '', regex = False)
playstore['Price'] = price_text.astype(float)

In [None]:
# Numeric summary of Price after the float conversion.
playstore.Price.describe()

In [None]:
# Distinct prices after the float conversion.
playstore.Price.unique()

#### Content Rating

In [None]:
# Distinct Content Rating labels.
playstore['Content Rating'].unique()

In [None]:
# Number of apps per Content Rating label.
playstore['Content Rating'].value_counts()

#### Genres

In [None]:
# Count and list the distinct raw Genres values.
genre_values = playstore['Genres'].unique()
print(len(genre_values), "categories")

print("\n", genre_values)

In [None]:
# Frequency of each raw Genres value.
playstore.Genres.value_counts()

In [None]:
# Genres holds 'genre;subgenre' pairs, so keep only the part before the ';'
# and fold 'Music & Audio' into 'Music'.
# The column is assigned back in one step instead of calling
# replace(..., inplace=True) on the selected column, which does not update the
# frame under pandas Copy-on-Write.
primary_genre = playstore['Genres'].str.split(';').str[0]
playstore['Genres'] = primary_genre.replace('Music & Audio', 'Music')

In [None]:
# Count and list the distinct Genres values after keeping only the main genre.
genre_values = playstore['Genres'].unique()
print(len(genre_values), "categories")

print("\n", genre_values)

#### Last Updated

In [None]:
# Look at the raw 'Last Updated' values before parsing them as dates.
playstore['Last Updated']

In [None]:
# Parse 'Last Updated' into datetimes and keep the result in a new column 'new'.
parsed_dates = pd.to_datetime(playstore['Last Updated'])
playstore['new'] = parsed_dates
playstore['new'].describe()

In [None]:
# Whole-day offset of each app's update date from the most recent update date
# in the data (zero for the newest app, negative for older ones).
newest_update = playstore['new'].max()
offset_from_newest = playstore['new'] - newest_update
playstore['lastupdate'] = offset_from_newest.dt.days
playstore['lastupdate'].head()

### EDA

In [None]:
# Preview the cleaned frame before the exploratory analysis.
playstore.head()

In [None]:
# Build the EDA frame as a new object without the version and date columns.
# Previously `data` was only an alias of `playstore` and the columns were
# dropped in place, which also removed them from `playstore` and made the cell
# raise KeyError when re-run. Dropping into a new frame keeps `playstore`
# intact and makes the cell safe to re-run.
data = playstore.drop(columns = ['Current Ver', 'Android Ver', 'Last Updated', 'new', 'lastupdate'])
data.head()

#### Distributive relations

In [None]:
# Pairwise scatter plots of the numeric columns, coloured by app Type.
numeric_dimensions = ['Rating', 'Reviews', 'Size', 'Price', 'Installs']
fig = px.scatter_matrix(
    data,
    dimensions = numeric_dimensions,
    color = 'Type',
)
fig.show()

##### **Free apps have the best ratings and the most installs, while paid apps, despite having the fewest installs, still reach the highest scores. There seems to be little correlation between the variables, perhaps due to a lack of user feedback.**

#### Numeric relevance

In [None]:
# Treemap of app counts nested by Rating, then by Type.
rating_type_path = ['Rating', 'Type']
fig = px.treemap(data, path = rating_type_path)
fig.show()

In [None]:
# Treemap of app counts nested by Genres, then by Type.
genre_type_path = ['Genres', 'Type']
fig = px.treemap(data, path = genre_type_path)
fig.show()

##### **The largest numbers of free apps are in the 'Tools', 'Entertainment' and 'Education' genres.**

In [None]:
# Treemap of app counts per Category.
category_path = ['Category']
fig = px.treemap(data, path = category_path)
fig.show()

##### **Family, Games and Tools have the highest prevalence in the market.**

#### Average rating

In [None]:
# Mean of the Rating column over all remaining apps.
data['Rating'].mean()

In [None]:
# Histogram of the app ratings.
fig = px.histogram(
    data,
    x = 'Rating',
)
fig.show()

#### Distribution

In [None]:
# Histograms of every numeric column (the App name column is left out first).
hist = data.drop(columns = ['App'])
hist.hist(figsize = (20, 20), color = 'r');

In [None]:
# Number of apps per Category, one colour per category.
fig = px.histogram(
    data,
    x = 'Category',
    color = 'Category',
)
fig.show()

##### **The distribution of categories is significantly different**

#### Best performing categories

In [None]:
# Rating distribution per Category as violin plots.
fig = px.violin(
    data,
    x = 'Category',
    y = 'Rating',
    color = 'Category',
)
fig.show()

##### **All categories have a high average rating, with 'books and references' and 'events' having the highest average ratings!**

#### How do app sizes impact the app rating?

In [None]:
# Rating against Size, with a marginal histogram on each axis.
fig = px.scatter(
    data,
    x = 'Size',
    y = 'Rating',
    marginal_x = 'histogram',
    marginal_y = 'histogram',
)
fig.show()

##### **Most highly rated apps are between 2 and 100 in size**

In [None]:
# Rating against Price, with a marginal histogram on each axis.
fig = px.scatter(
    data,
    x = 'Price',
    y = 'Rating',
    marginal_x = 'histogram',
    marginal_y = 'histogram',
)
fig.show()

##### **Top rated apps are between zero and thirty dollars.**

#### Pricing trend by category

In [None]:
# Price of every app, laid out per Category.
fig = px.scatter(
    data,
    x = 'Price',
    y = 'Category',
    color = 'Category',
)
fig.show()

##### **I will check what these outliers are**

In [None]:
# Show the apps priced above 200 (the outliers seen in the scatter plot).
# Left as the cell's last expression so the notebook renders it as a table
# instead of the truncated plain-text dump produced by print().
data[data.Price > 200]

##### **The different versions of the 'I'm Rich' app appear to have considerable amounts of reviews, so I won't delete them.**

In [None]:
# Leave out the apps priced at 200 or more (the "I'm Rich" outliers) and redraw
# the price-per-category scatter.
below_200 = data['Price'] < 200
filt = data[below_200]
fig = px.scatter(
    filt,
    x = 'Price',
    y = 'Category',
    color = 'Category',
)
fig.show()

##### **Family, lifestyle, sports and medical are the most expensive apps, with medical costing as much as 80 dollars! All others are under 30 dollars.**

#### Distribution of paid and free apps across categories

In [None]:
# Sunburst of app counts by Category (inner ring) and Type (outer ring).
fig = px.sunburst(
    data,
    path = ['Category', 'Type'],
    width = 800,
    height = 600,
)
fig.show()

#### Number of downloads of paid apps VS free apps

In [None]:
# Work on a copy so the transformation applied to `filt` in the following cell
# does not touch `data`.
filt = data.copy()
filt

In [None]:
# Natural log of the install counts, to compress their very wide range.
# `filt` is derived from `data` on every run, so re-running this cell no longer
# applies the logarithm a second time to already-logged values.
# NOTE(review): np.log returns -inf for an install count of 0 — confirm none remain.
filt = data.assign(Installs = np.log(data['Installs']))
filt

In [None]:
# Box plot of the (log-scaled) installs for each app Type.
fig = px.box(
    filt,
    x = 'Type',
    y = 'Installs',
    color = 'Type',
)
fig.show()

##### **Paid apps have a relatively lower number of downloads than free apps. That is, obviously for monetary reasons, people prefer free apps.**

#### Paid and free app size variation

In [None]:
# Size against Rating, coloured by app Type.
fig = px.scatter(
    data,
    x = 'Rating',
    y = 'Size',
    color = 'Type',
    template = 'plotly_dark',
)
fig.show()

##### **Mostly, paid apps with higher ratings have smaller sizes, as they are developed for specific functions, unlike a game, for example.**

#### Correlations

In [None]:
# Keep only the numeric columns by removing the text / categorical ones.
non_numeric_columns = ['App', 'Category', 'Type', 'Content Rating', 'Genres']
filt = data.drop(columns = non_numeric_columns)

In [None]:
# Pairwise correlation matrix of the columns in `filt` (pandas default method).
corr = filt.corr()

In [None]:
# Draw the correlation matrix as an annotated heatmap on a 10x10 figure.
plt.figure(figsize = (10, 10))
sns.heatmap(corr, annot = True)

##### **There is a moderate positive relationship between the number of downloads and the number of reviews; one reading is that people tend to download a particular app according to its number of reviews, as this suggests the quality of the app. There is a positive but weak correlation between rating, size and downloads, that is, people do not seem to care much about these aspects when downloading an app. The correlation between price and downloads is negative, that is, they move in opposite directions; even so, the price appears to have little influence on the number of downloads, especially considering that most apps are free (to download).**

In [None]:
# Log-transform the numeric columns and plot Installs against Reviews with an
# OLS trend line.
# The numeric frame is rebuilt from `data` before taking logs, so re-running
# this cell no longer applies the logarithm twice to an already-logged `filt`.
# The dropped columns are the same ones removed when `filt` was first built, so
# the column layout relied on further down is unchanged.
# NOTE(review): np.log maps zeros to -inf (e.g. Price for free apps, if stored
# as 0); only Reviews and Installs are plotted here.
filt = np.log(data.drop(['App', 'Category', 'Type', 'Content Rating', 'Genres'], axis = 1))
fig = px.scatter(filt, x = 'Reviews', y = 'Installs', trendline = 'ols', template = 'plotly_dark')
fig.show()

#### Linear Regression

In [None]:
# Predictor: log number of reviews. Selected by column name rather than by
# position (previously iloc[:, 1]) so that a change in column order cannot
# silently pick a different variable.
x = filt['Reviews'].values

In [None]:
# Target: log number of installs. Selected by column name rather than by
# position (previously iloc[:, 3]) so that a change in column order cannot
# silently pick a different variable.
y = filt['Installs'].values

In [None]:
# Correlation coefficient matrix between predictor and target.
# NOTE(review): x is reshaped to a column further down, so this cell is meant
# to run before that reshape.
np.corrcoef(x, y)

In [None]:
# Reshape x into a single-column 2-D array (n_samples, 1) before it is passed
# to the regressor.
x = x.reshape(-1, 1)

In [None]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so creation and fitting are chained; the
# fitted model is then shown as the cell output.
regressor = LinearRegression().fit(x, y)
regressor

In [None]:
# In-sample predictions: the model is evaluated on the same x it was fitted on.
predict = regressor.predict(x)
predict

In [None]:
# Observed points plus the fitted values overlaid as a second trace.
x_flat = x.ravel()
fig = px.scatter(x = x_flat, y = y)
fig.add_scatter(x = x_flat, y = predict, name = 'Regression')
fig.show()

In [None]:
# Score of the fitted model on the data it was trained on (R^2 for
# scikit-learn regressors).
regressor.score(x, y)

In [None]:
from yellowbrick.regressor import ResidualsPlot
# Residuals of the fitted regressor on the training data.
visu = ResidualsPlot(regressor)
visu.fit(x, y)
# show() is the current name of the rendering call; poof() was deprecated in
# Yellowbrick 1.0.
visu.show()

##### **Confirmed! The number of downloads can be explained and predicted by the number of reviews.**

### Sentiment Analysis

In [None]:
# Display the user-review frame loaded at the top of the notebook.
playstore_user

In [None]:
# Summary statistics of the numeric review columns.
playstore_user.describe()

In [None]:
# Column dtypes and non-null counts of the review frame.
playstore_user.info()

In [None]:
# Look at the review texts.
playstore_user['Translated_Review']

In [None]:
# Visualise where the review frame has missing values.
review_missing_mask = playstore_user.isna()
sns.heatmap(review_missing_mask)

In [None]:
# Number of missing values per review column before dropping rows.
playstore_user.isna().sum()

In [None]:
# Keep only the reviews that have a value in every column.
playstore_user = playstore_user.dropna(axis = 0)

In [None]:
# Confirm that no missing values remain in the review frame.
playstore_user.isna().sum()

#### Sentiment reviews

In [None]:
# Density-normalised histogram of the sentiment labels.
fig = px.histogram(
    playstore_user,
    x = 'Sentiment',
    color = 'Sentiment',
    histnorm = 'density',
    template = 'plotly_dark',
)
fig.show()

#### Sentiment polarity distribution

In [None]:
# Distribution of the polarity score within each sentiment label.
fig = px.box(
    playstore_user,
    x = 'Sentiment',
    y = 'Sentiment_Polarity',
    color = 'Sentiment',
    template = 'plotly_dark',
)
fig.show()

### Wordcloud

In [None]:
# Reviews labelled 'Positive'.
is_positive = playstore_user['Sentiment'].eq('Positive')
positive = playstore_user[is_positive]
positive

In [None]:
# Reviews labelled 'Negative'.
is_negative = playstore_user['Sentiment'].eq('Negative')
negative = playstore_user[is_negative]
negative

In [None]:
# All review texts as a plain Python list, and how many there are.
sentences = list(playstore_user['Translated_Review'])
len(sentences)

In [None]:
# Concatenate every review into one string for the word cloud.
sentences_one_string = ' '.join(sentences)
# Show only the first 500 characters: echoing the whole corpus would flood the
# cell output and bloat the saved notebook.
sentences_one_string[:500]

In [None]:
from wordcloud import WordCloud
# Word cloud built from all reviews, drawn on a 20x20 figure.
plt.figure(figsize=(20,20))
all_reviews_cloud = WordCloud().generate(sentences_one_string)
plt.imshow(all_reviews_cloud);

In [None]:
# Negative review texts as a list, then joined into a single string.
negative_list = list(negative['Translated_Review'])
negative_sentences = ' '.join(negative_list)

In [None]:
# Positive review texts as a list, then joined into a single string.
positive_list = list(positive['Translated_Review'])
positive_sentences = ' '.join(positive_list)

In [None]:
# Word cloud built from the positive reviews only.
plt.figure(figsize=(20,20))
positive_cloud = WordCloud().generate(positive_sentences)
plt.imshow(positive_cloud);

In [None]:
# Word cloud built from the negative reviews only.
plt.figure(figsize=(20,20))
negative_cloud = WordCloud().generate(negative_sentences)
plt.imshow(negative_cloud);

##### **We can see that most of the sentiment refers to the 'Games' category, as it appears in a larger size. Positive words such as 'great', 'good' and 'love' appear. As for the negative words, we have 'propaganda', 'hate', 'bad' and 'malware'.**

# Conclusion

##### **The average rating of apps on the Google Play Store is 4.19, and the number of downloads is related to the number of user reviews, mostly for free apps. Users prefer smaller-sized apps, and the main categories are Medical, Game and Family.**