# Google Play Store Apps

## Цель:
* Привести данные в удобный формат для дальнейшего анализа

## Ход работы:
* Приведем названия столбцов к нижнему регистру и заменим пробелы на подчеркивания
* Очистим данные от пустых значений 
* Приведем столбец `Size` к более удобному формату 
* Приведем столбцы `Reviews`,`Installs` в int 
* Приведем столбец `Price` в float
* Столбец `Last Updated` в дату 
* Превратим `Genres` в список

In [117]:
import pandas as pd
from datetime import datetime

In [118]:
# Load the raw dataset from a CSV file given by a path relative to the working directory.
df = pd.read_csv('googleplaystore.csv')
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Приведем названия столбцов к нижнему регистру и заменим пробелы на подчеркивания

In [119]:
# Show the original column labels before normalising them.
df.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [120]:
# Build normalised column labels: lower-case, with spaces replaced by underscores.
new_list = df.columns.str.lower().str.replace(" ", "_", regex=False).tolist()
new_list

['app',
 'category',
 'rating',
 'reviews',
 'size',
 'installs',
 'type',
 'price',
 'content_rating',
 'genres',
 'last_updated',
 'current_ver',
 'android_ver']

In [121]:
# Rename every column by pairing each current label with its normalised counterpart.
df = df.rename(columns={old: new for old, new in zip(df.columns, new_list)})
df.head()

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


### Очистим данные от пустых значений 

In [122]:
# Summarise dtypes and non-null counts to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app             10841 non-null  object 
 1   category        10841 non-null  object 
 2   rating          9367 non-null   float64
 3   reviews         10841 non-null  object 
 4   size            10841 non-null  object 
 5   installs        10841 non-null  object 
 6   type            10840 non-null  object 
 7   price           10841 non-null  object 
 8   content_rating  10840 non-null  object 
 9   genres          10841 non-null  object 
 10  last_updated    10841 non-null  object 
 11  current_ver     10833 non-null  object 
 12  android_ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [123]:
# Count the missing values in each column.
df.isna().sum()

app                  0
category             0
rating            1474
reviews              0
size                 0
installs             0
type                 1
price                0
content_rating       1
genres               0
last_updated         0
current_ver          8
android_ver          3
dtype: int64

В столбцах `type`, `content_rating`, `current_ver`, `android_ver` удалим строки с пропущенными значениями, так как их немного. Что делать с пропусками в `rating`, зависит от задачи: можно удалить такие строки, а можно заполнить пропуски, например с помощью алгоритмов машинного обучения.

In [124]:
# Drop rows missing any of the listed fields, then renumber the index from zero.
df = (
    df.dropna(subset=['type', 'content_rating', 'current_ver', 'android_ver'])
      .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10829 entries, 0 to 10828
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app             10829 non-null  object 
 1   category        10829 non-null  object 
 2   rating          9360 non-null   float64
 3   reviews         10829 non-null  object 
 4   size            10829 non-null  object 
 5   installs        10829 non-null  object 
 6   type            10829 non-null  object 
 7   price           10829 non-null  object 
 8   content_rating  10829 non-null  object 
 9   genres          10829 non-null  object 
 10  last_updated    10829 non-null  object 
 11  current_ver     10829 non-null  object 
 12  android_ver     10829 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


### Приведем данные столбца `size` в удобный формат

In [125]:
# List the distinct raw size labels to see which formats need handling.
df['size'].unique()

array(['19M', '14M', '8.7M', '25M', '2.8M', '5.6M', '29M', '33M', '3.1M',
       '28M', '12M', '20M', '21M', '37M', '5.5M', '17M', '39M', '31M',
       '4.2M', '7.0M', '23M', '6.0M', '6.1M', '4.6M', '9.2M', '5.2M',
       '11M', '24M', 'Varies with device', '9.4M', '15M', '10M', '1.2M',
       '26M', '8.0M', '7.9M', '56M', '57M', '35M', '54M', '201k', '3.6M',
       '5.7M', '8.6M', '2.4M', '27M', '2.7M', '2.5M', '16M', '3.4M',
       '8.9M', '3.9M', '2.9M', '38M', '32M', '5.4M', '18M', '1.1M',
       '2.2M', '4.5M', '9.8M', '52M', '9.0M', '6.7M', '30M', '2.6M',
       '7.1M', '3.7M', '22M', '7.4M', '6.4M', '3.2M', '8.2M', '9.9M',
       '4.9M', '9.5M', '5.0M', '5.9M', '13M', '73M', '6.8M', '3.5M',
       '4.0M', '2.3M', '7.2M', '2.1M', '42M', '7.3M', '9.1M', '55M',
       '23k', '6.5M', '1.5M', '7.5M', '51M', '41M', '48M', '8.5M', '46M',
       '8.3M', '4.3M', '4.7M', '3.3M', '40M', '7.8M', '8.8M', '6.6M',
       '5.1M', '61M', '66M', '79k', '8.4M', '118k', '44M', '695k', '1.6M',
     

In [126]:
# Count the rows whose size is the placeholder text 'Varies with device'.
(df['size']=='Varies with device').sum()

1694

In [127]:
# Share of rows whose size is 'Varies with device'. The mean of a boolean mask is
# exactly that fraction, so there is no need to divide by `df.count()[0]`, which
# relied on positional integer indexing of a label-indexed Series (deprecated in
# pandas) and on the first column having no missing values.
(df['size'] == 'Varies with device').mean()

0.15643180349062702

Имеется довольно большое количество (более 15%) значений `Varies with device`. Что с ними делать, опять-таки зависит от конкретной задачи: можно удалить, можно заполнить, можно оставить как есть. Выберем третий вариант: создадим новый столбец `size(k)`, в котором известные размеры будут переведены в тысячи (k), а остальные значения оставим без изменений.

In [128]:
def transform_size(x):
    """Convert a size label such as ``'19M'`` or ``'201k'`` to an integer in thousands.

    Args:
        x: A size label. Values ending in ``'k'`` are taken as already being in
            thousands; values ending in ``'M'`` are multiplied by 1000.

    Returns:
        An ``int`` for labels ending in ``'k'`` or ``'M'``. Any other value
        (for example ``'Varies with device'`` or an empty string) is returned
        unchanged.

    Raises:
        ValueError: If the part before the ``'k'``/``'M'`` suffix is not a number.
    """
    label = str(x)
    if label.endswith('k'):
        # Fractional 'k' values are truncated toward zero.
        return int(float(label[:-1]))
    if label.endswith('M'):
        # round() rather than int(): products such as 1.001 * 1000 come out as
        # 1000.9999999999999 in floating point and would be truncated to 1000.
        return round(float(label[:-1]) * 1000)
    # Unrecognised labels (including the empty string) pass through untouched.
    return x

In [129]:
# Derive the numeric size column by converting each raw size label element-wise.
df['size(k)'] = df['size'].map(transform_size)
df.head(3)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver,size(k)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14000
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8700


### Приведем данные столбца `reviews` к типу int

In [130]:
# Convert the review counts from strings to 64-bit integers. A direct dtype cast
# replaces `.apply([lambda ...])`: passing a *list* of functions to Series.apply
# produces a one-column DataFrame instead of a Series.
df['reviews'] = df['reviews'].astype('int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10829 entries, 0 to 10828
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app             10829 non-null  object 
 1   category        10829 non-null  object 
 2   rating          9360 non-null   float64
 3   reviews         10829 non-null  int64  
 4   size            10829 non-null  object 
 5   installs        10829 non-null  object 
 6   type            10829 non-null  object 
 7   price           10829 non-null  object 
 8   content_rating  10829 non-null  object 
 9   genres          10829 non-null  object 
 10  last_updated    10829 non-null  object 
 11  current_ver     10829 non-null  object 
 12  android_ver     10829 non-null  object 
 13  size(k)         10829 non-null  object 
dtypes: float64(1), int64(1), object(12)
memory usage: 1.2+ MB


### Приведем данные столбца `installs` к типу int

In [131]:
# Inspect the distinct install-count labels before parsing them.
df['installs'].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+'], dtype=object)

In [132]:
# Parse install labels such as '10,000+' into integers: strip the '+' marker and
# the thousands separators, then cast. Vectorised string methods replace
# `.apply([lambda ...])`, which passed a list of functions to Series.apply and
# therefore produced a one-column DataFrame instead of a Series.
df['installs(+)'] = (
    df['installs']
    .str.replace('[+,]', '', regex=True)
    .astype('int64')
)
df.head(3)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver,size(k),installs(+)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up,19000,10000
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up,14000,500000
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up,8700,5000000


In [133]:
# Re-check the column dtypes after the conversion.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10829 entries, 0 to 10828
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   app             10829 non-null  object 
 1   category        10829 non-null  object 
 2   rating          9360 non-null   float64
 3   reviews         10829 non-null  int64  
 4   size            10829 non-null  object 
 5   installs        10829 non-null  object 
 6   type            10829 non-null  object 
 7   price           10829 non-null  object 
 8   content_rating  10829 non-null  object 
 9   genres          10829 non-null  object 
 10  last_updated    10829 non-null  object 
 11  current_ver     10829 non-null  object 
 12  android_ver     10829 non-null  object 
 13  size(k)         10829 non-null  object 
 14  installs(+)     10829 non-null  int64  
dtypes: float64(1), int64(2), object(12)
memory usage: 1.2+ MB


### Приведем данные столбца `price` к типу float

In [134]:
# Inspect the distinct price labels before parsing them.
df.price.unique()

array(['0', '$4.99', '$3.99', '$6.99', '$1.49', '$2.99', '$7.99', '$5.99',
       '$3.49', '$1.99', '$9.99', '$7.49', '$0.99', '$9.00', '$5.49',
       '$10.00', '$24.99', '$11.99', '$79.99', '$16.99', '$14.99',
       '$1.00', '$29.99', '$12.99', '$2.49', '$10.99', '$1.50', '$19.99',
       '$15.99', '$33.99', '$74.99', '$39.99', '$3.95', '$4.49', '$1.70',
       '$8.99', '$2.00', '$3.88', '$25.99', '$399.99', '$17.99',
       '$400.00', '$3.02', '$1.76', '$4.84', '$4.77', '$1.61', '$2.50',
       '$1.59', '$6.49', '$1.29', '$5.00', '$13.99', '$299.99', '$379.99',
       '$37.99', '$18.99', '$389.99', '$19.90', '$8.49', '$1.75',
       '$14.00', '$4.85', '$46.99', '$109.99', '$154.99', '$3.08',
       '$2.59', '$4.80', '$1.96', '$19.40', '$3.90', '$4.59', '$15.46',
       '$3.04', '$4.29', '$2.60', '$3.28', '$4.60', '$28.99', '$2.95',
       '$2.90', '$1.97', '$200.00', '$89.99', '$2.56', '$30.99', '$3.61',
       '$394.99', '$1.26', '$1.20', '$1.04'], dtype=object)

In [135]:
df['price($)']=df['price'].apply([lambda x: float(str(x).replace('$',''))])
df[df['type']!='Free'].head(3)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver,size(k),installs(+),price($)
233,TurboScan: scan documents and receipts in PDF,BUSINESS,4.7,11442,6.8M,"100,000+",Paid,$4.99,Everyone,Business,"March 25, 2018",1.5.2,4.0 and up,6800,100000,4.99
234,Tiny Scanner Pro: PDF Doc Scan,BUSINESS,4.8,10295,39M,"100,000+",Paid,$4.99,Everyone,Business,"April 11, 2017",3.4.6,3.0 and up,39000,100000,4.99
289,TurboScan: scan documents and receipts in PDF,BUSINESS,4.7,11442,6.8M,"100,000+",Paid,$4.99,Everyone,Business,"March 25, 2018",1.5.2,4.0 and up,6800,100000,4.99


### Приведем данные столбца `last_updated`  в формат даты

In [136]:
# Parse dates such as 'January 7, 2018' into datetime64 values. pd.to_datetime with
# an explicit format replaces `.apply([lambda ...])`, which passed a list of
# functions to Series.apply and therefore produced a one-column DataFrame instead
# of a Series.
df['last_updated'] = pd.to_datetime(df['last_updated'], format='%B %d, %Y')
df

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver,size(k),installs(+),price($)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,2018-01-07,1.0.0,4.0.3 and up,19000,10000,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,2018-01-15,2.0.0,4.0.3 and up,14000,500000,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,2018-08-01,1.2.4,4.0.3 and up,8700,5000000,0.0
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,2018-06-08,Varies with device,4.2 and up,25000,50000000,0.0
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,2018-06-20,1.1,4.4 and up,2800,100000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10824,Sya9a Maroc - FR,FAMILY,4.5,38,53M,"5,000+",Free,0,Everyone,Education,2017-07-25,1.48,4.1 and up,53000,5000,0.0
10825,Fr. Mike Schmitz Audio Teachings,FAMILY,5.0,4,3.6M,100+,Free,0,Everyone,Education,2018-07-06,1.0,4.1 and up,3600,100,0.0
10826,Parkinson Exercices FR,MEDICAL,,3,9.5M,"1,000+",Free,0,Everyone,Medical,2017-01-20,1.0,2.2 and up,9500,1000,0.0
10827,The SCP Foundation DB fr nn5n,BOOKS_AND_REFERENCE,4.5,114,Varies with device,"1,000+",Free,0,Mature 17+,Books & Reference,2015-01-19,Varies with device,Varies with device,Varies with device,1000,0.0


### Превратим `genres` в список

In [137]:
# Split each ';'-separated genre string into a list of genre names.
df['genres'] = df['genres'].map(lambda genre_field: genre_field.split(';'))
df.head(3)

Unnamed: 0,app,category,rating,reviews,size,installs,type,price,content_rating,genres,last_updated,current_ver,android_ver,size(k),installs(+),price($)
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,[Art & Design],2018-01-07,1.0.0,4.0.3 and up,19000,10000,0.0
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,"[Art & Design, Pretend Play]",2018-01-15,2.0.0,4.0.3 and up,14000,500000,0.0
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,[Art & Design],2018-08-01,1.2.4,4.0.3 and up,8700,5000000,0.0
