## 1. ACQUISITION

In [258]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [226]:
# Load the raw Google Play Store dataset into the working DataFrame.
# NOTE(review): this is an absolute path on one specific machine; the notebook will not
# run elsewhere until it is replaced by a path relative to the project.
csv_path = '/Users/almu/ironhack/datamad0119/module-1/pipelines-project/your-code/data-folder/googleplaystore.csv'
data = pd.read_csv(csv_path)

In [227]:
# Preview the first rows of the freshly loaded data.
data.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


## 2. WRANGLING

### Eliminamos duplicados

In [228]:
# Remove rows that are exact duplicates of an earlier row and show the
# row count before and after as a tuple.
before = data.shape[0]
data = data.drop_duplicates(keep='first')
after = data.shape[0]
(before, after)

(10841, 10358)

In [246]:
# Keep a single row per app name, retaining the last occurrence of each name.
# The result is reassigned instead of using inplace=True so that this cell follows the
# same `data = data....` pattern as the other wrangling cells.
data = data.drop_duplicates(subset='App', keep="last")


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price_dollars,Content_Rating,Current_Ver,Android_Ver,Updating_Year


### Renombramos columnas

In [232]:
# List the current column names before renaming them.
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [267]:
# Replace the spaces in column names with underscores, and rename 'Price' to
# 'Price_dollars': the later cleaning cell reads data['Price_dollars'], which would
# raise a KeyError if the column were still called 'Price'.
# rename() ignores keys that are not present, so re-running this cell is harmless.
data = data.rename(columns={
    'Content Rating': 'Content_Rating',
    'Last Updated': 'Last_Updated',
    'Current Ver': 'Current_Ver',
    'Android Ver': 'Android_Ver',
    'Price': 'Price_dollars',
})

### Comprobamos valores nulos

No eliminamos las filas con valores nulos: salvo en 'Rating', son muy pocas.

In [234]:
# Count missing values per column and display only the columns that have any.
null_cols = data.isna().sum()
null_cols.loc[null_cols.gt(0)]

Rating            1465
Type                 1
Content_Rating       1
Current_Ver          8
Android_Ver          3
dtype: int64

### Eliminamos columnas innecesarias

'Genres' contiene prácticamente la misma información que 'Category'

In [235]:
# Remove the 'Genres' column and preview the first ten rows of the result.
data = data.drop(columns='Genres')
data.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price_dollars,Content_Rating,Last_Updated,Current_Ver,Android_Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,"June 20, 2018",1.1,4.4 and up
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,"50,000+",Free,0,Everyone,"March 26, 2017",1.0,2.3 and up
6,Smoke Effect Photo Maker - Smoke Editor,ART_AND_DESIGN,3.8,178,19M,"50,000+",Free,0,Everyone,"April 26, 2018",1.1,4.0.3 and up
7,Infinite Painter,ART_AND_DESIGN,4.1,36815,29M,"1,000,000+",Free,0,Everyone,"June 14, 2018",6.1.61.1,4.2 and up
8,Garden Coloring Book,ART_AND_DESIGN,4.4,13791,33M,"1,000,000+",Free,0,Everyone,"September 20, 2017",2.9.2,3.0 and up
9,Kids Paint Free - Drawing Fun,ART_AND_DESIGN,4.7,121,3.1M,"10,000+",Free,0,Everyone,"July 3, 2018",2.8,4.0.3 and up


### Cambiamos valores de varias columnas

In [236]:
# Inspect the column dtypes to see which columns still need converting.
data.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price_dollars      object
Content_Rating     object
Last_Updated       object
Current_Ver        object
Android_Ver        object
dtype: object

De la columna Last_Updated dejamos solo el año

In [237]:
# Keep only the year of the last update and discard the full date.
# Dates look like "January 7, 2018": splitting once on the comma leaves " 2018" with a
# leading space, so the year is stripped to give a clean label such as "2018".
# Values without a comma have no second part and end up missing in 'Updating_Year'.
new = data['Last_Updated'].str.split(",", n=1, expand=True)
data['Updating_Year'] = new[1].str.strip()
data = data.drop('Last_Updated', axis=1)



Modificamos las columnas Installs, Android_Ver y Price_dollars

In [238]:
# Strip formatting text so these columns hold bare values:
#   'Installs'      -> drop the trailing "+"
#   'Android_Ver'   -> drop the " and up" suffix
#   'Price_dollars' -> drop the "$" sign
# regex=False is passed explicitly because the default of `regex` differs between pandas
# versions, and read as regular expressions "+" is not a valid pattern while "$" is an
# end-of-string anchor rather than a literal dollar sign.
data['Installs'] = data['Installs'].str.replace("+", "", regex=False)
data['Android_Ver'] = data['Android_Ver'].str.replace(" and up", "", regex=False)
data['Price_dollars'] = data['Price_dollars'].str.replace("$", "", regex=False)

## 3. ANALYSIS

### H1: las aplicaciones gratis son las que tienen más reviews por app pero las de pago tienen un mayor número de instalaciones por app

In [239]:
# Drop every row whose 'Reviews' value is the string '3.0M', which cannot be
# converted to an integer.
bad_review_rows = data.loc[data['Reviews'] == '3.0M'].index
data = data.drop(index=bad_review_rows)

In [240]:
# Convert the review counts to integers.
data = data.astype({'Reviews': int})

In [241]:
# Remove the thousands separators from 'Installs' and convert the column to integers.
# Plain column assignment replaces the column, so the integer dtype is what ends up
# stored; assigning through data.loc[:, 'Installs'] writes into the existing object
# column and, on recent pandas releases, can leave the dtype as object.
data['Installs'] = data['Installs'].str.replace(',', '', regex=False).astype(int)


In [192]:
# Check the dtypes again after the conversions of 'Reviews' and 'Installs'.
data.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs            int64
Type               object
Price_dollars      object
Content_Rating     object
Current_Ver        object
Android_Ver        object
Updating_Year      object
dtype: object

In [253]:
# Preview the first rows of the cleaned data.
data.head()


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price_dollars,Content_Rating,Current_Ver,Android_Ver,Updating_Year
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,10000,Free,0,Everyone,1.0.0,4.0.3,2018
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,5000000,Free,0,Everyone,1.2.4,4.0.3,2018
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,50000000,Free,0,Teen,Varies with device,4.2,2018
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,100000,Free,0,Everyone,1.1,4.4,2018
5,Paper flowers instructions,ART_AND_DESIGN,4.4,167,5.6M,50000,Free,0,Everyone,1.0,2.3,2017


In [197]:
# H1: mean number of installs and of reviews per app for each value of 'Type'.
mean_per_metric = {'Reviews': 'mean', 'Installs': 'mean'}
H1 = data.pivot_table(
    values=['Reviews', 'Installs'],
    index='Type',
    aggfunc=mean_per_metric,
)
H1

Unnamed: 0_level_0,Installs,Reviews
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Free,15281270.0,437373.593056
Paid,90491.35,11900.550327


### H2: las actualizaciones periódicas de las apps no afectan al rating de las apps

In [209]:
# H2: mean rating per last-update year, restricted to apps with more than 100,000 reviews.
H2 = pd.pivot_table(
    data[data['Reviews'] > 100000],
    values='Rating',
    index='Updating_Year',
    aggfunc='mean',
)
H2

Unnamed: 0_level_0,Rating
Updating_Year,Unnamed: 1_level_1
2011,4.4
2012,3.5
2013,4.328571
2014,4.315385
2015,4.319048
2016,4.186667
2017,4.28125
2018,4.381869


### C1: Top 5 de apps gratuitas mejor valoradas con más de 50M de reviews

In [247]:
# C1: free apps with more than 50,000,000 reviews, ordered from highest to lowest
# rating; at most the first five rows are kept.
has_many_reviews = data['Reviews'] > 50000000
is_free = data['Type'] == 'Free'
data2 = (
    data.loc[has_many_reviews & is_free]
    .sort_values(by='Rating', ascending=False)
    .head(5)
)
data2

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price_dollars,Content_Rating,Current_Ver,Android_Ver,Updating_Year
3909,Instagram,SOCIAL,4.5,66509917,Varies with device,1000000000,Free,0,Teen,Varies with device,Varies with device,2018
3904,WhatsApp Messenger,COMMUNICATION,4.4,69109672,Varies with device,1000000000,Free,0,Everyone,Varies with device,Varies with device,2018
3943,Facebook,SOCIAL,4.1,78128208,Varies with device,1000000000,Free,0,Teen,Varies with device,Varies with device,2018
382,Messenger – Text and Video Chat for Free,COMMUNICATION,4.0,56646578,Varies with device,1000000000,Free,0,Everyone,Varies with device,Varies with device,2018
