In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the app catalogue from a CSV in the working directory and preview the
# first rows.
data = pd.read_csv("googleplaystore.csv")
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [2]:
# Load the per-review sentiment table from a CSV in the working directory and
# preview the first rows.
reviews = pd.read_csv("googleplaystore_user_reviews.csv")
reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
2,10 Best Foods for You,,,,
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3


In [3]:
# (rows, columns) of the reviews table as loaded, before any cleaning.
reviews.shape

(64295, 5)

In [4]:
# Drop every review row that has a missing value, then discard the review
# text column, which is not needed for the analysis.
reviews = reviews.dropna().drop(columns="Translated_Review")

# Shorten the column names.
reviews.columns = ["App", "Sentiment", "Polarity", "Subjectivity"]
reviews.shape

(37427, 4)

In [5]:
# Numeric (Likert-style) encoding for the sentiment labels.
likert = {
  "Negative": -1,
  "Neutral": 0,
  "Positive": 1
}

# Encode Sentiment so it can be averaged. The dtype guard makes the cell safe
# to re-run: once the column is numeric the labels are gone and a second
# lookup would fail. .map leaves NaN for any label missing from `likert`, so
# the int64 cast raises instead of letting an unknown label through silently.
if not pd.api.types.is_numeric_dtype(reviews["Sentiment"]):
    reviews["Sentiment"] = reviews["Sentiment"].map(likert).astype(np.int64)

# Average every numeric column per app; the result is indexed by App.
reviews_mean = reviews.groupby("App").mean()
reviews.head()

Unnamed: 0,App,Sentiment,Polarity,Subjectivity
0,10 Best Foods for You,1,1.0,0.533333
1,10 Best Foods for You,1,0.25,0.288462
3,10 Best Foods for You,1,0.4,0.875
4,10 Best Foods for You,1,1.0,0.3
5,10 Best Foods for You,1,1.0,0.3


In [6]:
# Attach each app's mean review scores to its store metadata. reviews_mean is
# indexed by "App", and the default inner merge keeps only apps present in
# both tables.
complete_data = (
    pd.merge(left=data, right=reviews_mean, on="App")
    # Keep only the first row for each app name.
    .drop_duplicates(subset="App")
    # Removing rows leaves gaps in the index, so renumber it from 0.
    .reset_index(drop=True)
    # Price is the only column excluded from the analysis; dropping it by name
    # stays correct even if the column order of the CSV changes.
    .drop(columns="Price")
)

complete_data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Sentiment,Polarity,Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,0.272727,0.152652,0.64154
1,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up,0.6,0.268483,0.523447
2,FlipaClip - Cartoon animation,ART_AND_DESIGN,4.3,194216,39M,"5,000,000+",Free,Everyone,Art & Design,"August 3, 2018",2.2.5,4.0.3 and up,1.0,0.412649,0.679226
3,Boys Photo Editor - Six Pack & Men's Suit,ART_AND_DESIGN,4.1,654,12M,"100,000+",Free,Everyone,Art & Design,"March 20, 2018",1.1,4.0.3 and up,0.473684,0.283509,0.479298
4,Colorfit - Drawing & Coloring,ART_AND_DESIGN,4.7,20260,25M,"500,000+",Free,Everyone,Art & Design;Creativity,"October 11, 2017",1.0.8,4.0.3 and up,0.552632,0.171836,0.572762


In [7]:
# Inspect column dtypes and non-null counts of the merged table.
complete_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             816 non-null    object 
 1   Category        816 non-null    object 
 2   Rating          816 non-null    float64
 3   Reviews         816 non-null    object 
 4   Size            816 non-null    object 
 5   Installs        816 non-null    object 
 6   Type            816 non-null    object 
 7   Content Rating  816 non-null    object 
 8   Genres          816 non-null    object 
 9   Last Updated    816 non-null    object 
 10  Current Ver     816 non-null    object 
 11  Android Ver     816 non-null    object 
 12  Sentiment       816 non-null    float64
 13  Polarity        816 non-null    float64
 14  Subjectivity    816 non-null    float64
dtypes: float64(4), object(11)
memory usage: 95.8+ KB


In [8]:
# Installs arrives as text such as "500,000+": delete the "+" and the
# thousands separators, then parse what is left as an integer.
_installs_noise = str.maketrans("", "", "+,")
complete_data["Installs"] = complete_data["Installs"].map(
    lambda raw: int(raw.translate(_installs_noise))
)

In [9]:
# Coerce Rating to float64, substituting 0 for any missing rating.
rating_as_number = pd.to_numeric(complete_data["Rating"])
complete_data["Rating"] = rating_as_number.fillna(0).astype(np.float64)

In [10]:
# Convert the human-readable Size text ("14M", "8.7M", "201k") to a byte count.
# Appending zeros to the text (the previous approach) turned "8.7M" into
# "8.7000000", which parses as 8.7 and truncates to 8. Parsing the number and
# the unit separately and multiplying keeps fractional sizes correct.
size_unit_factors = {"k": 1000, "M": 1000000}

size_parts = (
    complete_data["Size"]
    .astype(str)  # also lets the cell be re-run once Size is already numeric
    .str.replace(",", "", regex=False)
    .str.extract(r"(\d+(?:\.\d+)?)\s*([kM])?")
)
size_number = pd.to_numeric(size_parts[0], errors="coerce")
# A value without a unit suffix is taken as-is (factor 1).
size_factor = size_parts[1].map(size_unit_factors).fillna(1)

# Entries with no parsable number become 0, matching the original fillna(0).
# round() guards against float error such as 8.7 * 1000000 landing just
# below the intended integer.
complete_data["Size"] = (size_number * size_factor).fillna(0).round().astype(np.int64)

In [11]:
# Preview the table after the Installs, Rating and Size conversions.
complete_data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Content Rating,Genres,Last Updated,Current Ver,Android Ver,Sentiment,Polarity,Subjectivity
0,Coloring book moana,ART_AND_DESIGN,3.9,967,14000000,500000,Free,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,0.272727,0.152652,0.64154
1,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33000000,1000000,Free,Everyone,Art & Design,"September 20, 2017",2.9.2,3.0 and up,0.6,0.268483,0.523447
2,FlipaClip - Cartoon animation,ART_AND_DESIGN,4.3,194216,39000000,5000000,Free,Everyone,Art & Design,"August 3, 2018",2.2.5,4.0.3 and up,1.0,0.412649,0.679226
3,Boys Photo Editor - Six Pack & Men's Suit,ART_AND_DESIGN,4.1,654,12000000,100000,Free,Everyone,Art & Design,"March 20, 2018",1.1,4.0.3 and up,0.473684,0.283509,0.479298
4,Colorfit - Drawing & Coloring,ART_AND_DESIGN,4.7,20260,25000000,500000,Free,Everyone,Art & Design;Creativity,"October 11, 2017",1.0.8,4.0.3 and up,0.552632,0.171836,0.572762


In [None]:
# Write the merged, type-converted app table to disk.
# NOTE(review): to_csv writes the index by default, so the file gains an
# unnamed leading column; pass index=False only if no downstream reader
# relies on that column.
complete_data.to_csv('final_data.csv')

In [None]:
# Write the cleaned per-review table (Sentiment already numerically encoded)
# to disk.
# NOTE(review): the index is written too and has gaps left by the earlier
# dropna; confirm downstream readers expect that leading column.
reviews.to_csv('cleaned_reviews.csv')