In [342]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [343]:
# Read csv file containing the data
df = pd.read_csv('GooglePlaystoreDirty.csv')

In [344]:

# Obtain sample of data
df.sample(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
2907,EyeEm - Camera & Photo Filter,PHOTOGRAPHY,4.2,215343,44M,"10,000,000+",Free,0,Everyone,Photography,"July 14, 2018",6.4.3,4.0.3 and up
7752,Toughest Game Ever 2,GAME,4.6,293086,Varies with device,"5,000,000+",Free,0,Everyone,Action,"October 22, 2015",8.1,4.0.3 and up
6648,[Substratum] M5 Theme,PERSONALIZATION,4.4,16,6.7M,"5,000+",Free,0,Everyone,Personalization,"March 10, 2018",1.3.4,7.0 and up
5442,WGT Golf Game by Topgolf,SPORTS,4.3,148083,57M,"10,000,000+",Free,0,Everyone,Sports,"July 24, 2018",1.42.2,4.1 and up
1993,Jewels classic Prince,GAME,4.4,47644,15M,"5,000,000+",Free,0,Everyone,Puzzle,"June 15, 2018",2.4.2,4.0 and up


In [345]:
# Obtain info about dataset values
# Columuns have null values, and the datatype is incorrect for Reviews
df.info()
# Return column value count of empty or null values
df.isna().sum().sort_values(ascending=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


Rating            1474
Current Ver          8
Android Ver          3
Type                 1
Content Rating       1
App                  0
Category             0
Reviews              0
Size                 0
Installs             0
Price                0
Genres               0
Last Updated         0
dtype: int64

In [346]:
# Check ratings to make sure they are <=5
df['Rating'].describe()


count    9367.000000
mean        4.193338
std         0.537431
min         1.000000
25%         4.000000
50%         4.300000
75%         4.500000
max        19.000000
Name: Rating, dtype: float64

In [347]:
# Reviews are >5 so we will make these values NaN
df.loc[df['Rating']>5,'Rating'] = np.nan
# Fill these values with the average rating value instead
df['Rating'].fillna(df['Rating'].mean(), inplace = True)
# Round ratings to 2 decimals
df = df.round({'Rating':2})
# Recheck ratings
df['Rating'].describe()

count    10841.000000
mean         4.191518
std          0.478885
min          1.000000
25%          4.100000
50%          4.200000
75%          4.500000
max          5.000000
Name: Rating, dtype: float64

In [348]:
# Drop rows that contain any NaN values and recheck dataframe
df.dropna(inplace = True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  object 
 4   Size            10829 non-null  object 
 5   Installs        10829 non-null  object 
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.2+ MB


In [349]:
# Change the reviews column to int64 since they are all numeric values
df['Reviews'] = pd.to_numeric(df['Reviews'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10829 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10829 non-null  object 
 1   Category        10829 non-null  object 
 2   Rating          10829 non-null  float64
 3   Reviews         10829 non-null  int64  
 4   Size            10829 non-null  object 
 5   Installs        10829 non-null  object 
 6   Type            10829 non-null  object 
 7   Price           10829 non-null  object 
 8   Content Rating  10829 non-null  object 
 9   Genres          10829 non-null  object 
 10  Last Updated    10829 non-null  object 
 11  Current Ver     10829 non-null  object 
 12  Android Ver     10829 non-null  object 
dtypes: float64(1), int64(1), object(11)
memory usage: 1.2+ MB


In [350]:
# Return some of the duplicated apps and count how many total
df.loc[df.duplicated(subset=['App'], keep = False)].sort_values(by='App').head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
1393,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
1407,10 Best Foods for You,HEALTH_AND_FITNESS,4.0,2490,3.8M,"500,000+",Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
2543,1800 Contacts - Lens Store,MEDICAL,4.7,23160,26M,"1,000,000+",Free,0,Everyone,Medical,"July 27, 2018",7.4.1,5.0 and up
2322,1800 Contacts - Lens Store,MEDICAL,4.7,23160,26M,"1,000,000+",Free,0,Everyone,Medical,"July 27, 2018",7.4.1,5.0 and up
2385,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,"January 27, 2017",1.0.5,4.0.3 and up
2256,2017 EMRA Antibiotic Guide,MEDICAL,4.4,12,3.8M,"1,000+",Paid,$16.99,Everyone,Medical,"January 27, 2017",1.0.5,4.0.3 and up
1337,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506,15M,"100,000+",Free,0,Everyone,Health & Fitness,"August 2, 2018",3.0.0,4.1 and up
1434,21-Day Meditation Experience,HEALTH_AND_FITNESS,4.4,11506,15M,"100,000+",Free,0,Everyone,Health & Fitness,"August 2, 2018",3.0.0,4.1 and up
3083,365Scores - Live Scores,SPORTS,4.6,666521,25M,"10,000,000+",Free,0,Everyone,Sports,"July 29, 2018",5.5.9,4.1 and up
5415,365Scores - Live Scores,SPORTS,4.6,666246,25M,"10,000,000+",Free,0,Everyone,Sports,"July 29, 2018",5.5.9,4.1 and up


In [351]:
df.duplicated(subset=['App'],keep = False).sum()

1979

In [352]:
# Sort by app and reviews with 
df.sort_values(by=['App','Reviews'],inplace=True)
# Delete the rows that have the least amount of reviews between the duplicated rows
df.drop_duplicates(subset=['App'], keep = 'last',inplace = True)

# Check if it worked
df.loc[df.duplicated(subset=['App'], keep = False)].sort_values(by='App').head(10)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver


In [353]:
# Check to make sure it saved the correct value using 365Scores - Live Scores from above. SHould have 666521 reviews
df.loc[df['App']== '365Scores - Live Scores']

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
3083,365Scores - Live Scores,SPORTS,4.6,666521,25M,"10,000,000+",Free,0,Everyone,Sports,"July 29, 2018",5.5.9,4.1 and up


In [354]:
# Format category columns
df['Category'].value_counts

<bound method IndexOpsMixin.value_counts of 8884           SPORTS
8532           SOCIAL
324            COMICS
4541            TOOLS
4636    COMMUNICATION
            ...      
6334    VIDEO_PLAYERS
4362        LIFESTYLE
2575           SOCIAL
7559            TOOLS
882     ENTERTAINMENT
Name: Category, Length: 9648, dtype: object>

In [355]:
# Replace _ with blank space and normalize the capitalization
df['Category'] = df['Category'].str.replace('_',' ')
df['Category'] = df['Category'].str.capitalize()
df['Category'].value_counts

<bound method IndexOpsMixin.value_counts of 8884           Sports
8532           Social
324            Comics
4541            Tools
4636    Communication
            ...      
6334    Video players
4362        Lifestyle
2575           Social
7559            Tools
882     Entertainment
Name: Category, Length: 9648, dtype: object>

In [356]:
# Clean installs column and convert to numeric value type
df['Installs'] = df['Installs'].astype(str)
df['Installs'] = pd.to_numeric(df['Installs'].str.replace('+','',regex = False).str.replace(',','',regex = False))
df.head(10)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.19,27,3.6M,500,Free,0,Everyone,Sports,"October 7, 2017",0.22,4.1 and up
8532,+Download 4 Instagram Twitter,Social,4.5,40467,22M,1000000,Free,0,Everyone,Social,"August 2, 2018",5.03,4.1 and up
324,- Free Comics - Comic Apps,Comics,3.5,115,9.1M,10000,Free,0,Mature 17+,Comics,"July 13, 2018",5.0.12,5.0 and up
4541,.R,Tools,4.5,259,203k,10000,Free,0,Everyone,Tools,"September 16, 2014",1.1.06,1.5 and up
4636,/u/app,Communication,4.7,573,53M,10000,Free,0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up
5940,058.ba,News and magazines,4.4,27,14M,100,Free,0,Everyone,News & Magazines,"July 6, 2018",1.0,4.2 and up
10252,1. FC Köln App,Sports,4.6,2019,41M,100000,Free,0,Everyone,Sports,"July 20, 2018",1.13.0,4.4 and up
1407,10 Best Foods for You,Health and fitness,4.0,2490,3.8M,500000,Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
8483,10 Minutes a Day Times Tables,Family,4.1,681,48M,100000,Free,0,Everyone,Education,"July 3, 2014",1.2,2.2 and up
7738,10 WPM Amateur ham radio CW Morse code trainer,Communication,3.5,10,3.8M,100,Paid,$1.49,Everyone,Communication,"May 12, 2018",2.1.4,2.1 and up


In [357]:
# Clean size column by replacing "Varies with device" to 0 and removing the k and M and converting to bytes
df['Size'] = df['Size'].str.replace("Varies with device", "0")
# Convert k to bytes by multiplying by 1024
# Assign to columns that had k
df.loc[df['Size'].str.contains('k'),'Size'] = (pd.to_numeric(df.loc[df['Size'].str.contains('k'), 'Size'].str.replace('k',''))*1024).astype(str)
# Check that it worked
df.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.19,27,3.6M,500,Free,0,Everyone,Sports,"October 7, 2017",0.22,4.1 and up
8532,+Download 4 Instagram Twitter,Social,4.5,40467,22M,1000000,Free,0,Everyone,Social,"August 2, 2018",5.03,4.1 and up
324,- Free Comics - Comic Apps,Comics,3.5,115,9.1M,10000,Free,0,Mature 17+,Comics,"July 13, 2018",5.0.12,5.0 and up
4541,.R,Tools,4.5,259,207872.0,10000,Free,0,Everyone,Tools,"September 16, 2014",1.1.06,1.5 and up
4636,/u/app,Communication,4.7,573,53M,10000,Free,0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up
5940,058.ba,News and magazines,4.4,27,14M,100,Free,0,Everyone,News & Magazines,"July 6, 2018",1.0,4.2 and up
10252,1. FC Köln App,Sports,4.6,2019,41M,100000,Free,0,Everyone,Sports,"July 20, 2018",1.13.0,4.4 and up
1407,10 Best Foods for You,Health and fitness,4.0,2490,3.8M,500000,Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
8483,10 Minutes a Day Times Tables,Family,4.1,681,48M,100000,Free,0,Everyone,Education,"July 3, 2014",1.2,2.2 and up
7738,10 WPM Amateur ham radio CW Morse code trainer,Communication,3.5,10,3.8M,100,Paid,$1.49,Everyone,Communication,"May 12, 2018",2.1.4,2.1 and up


In [358]:
# Repeat the process for M but you must multiply by 1024*1024
# Assign to columns that had M
df.loc[df['Size'].str.contains('M',na=False),'Size'] = (pd.to_numeric(df.loc[df['Size'].str.contains('M',na=False), 'Size'].str.replace('M',''))*(1024*1024)).astype(str)
# Check that it worked
df.head(10)


Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.19,27,3774873.6,500,Free,0,Everyone,Sports,"October 7, 2017",0.22,4.1 and up
8532,+Download 4 Instagram Twitter,Social,4.5,40467,23068672.0,1000000,Free,0,Everyone,Social,"August 2, 2018",5.03,4.1 and up
324,- Free Comics - Comic Apps,Comics,3.5,115,9542041.6,10000,Free,0,Mature 17+,Comics,"July 13, 2018",5.0.12,5.0 and up
4541,.R,Tools,4.5,259,207872.0,10000,Free,0,Everyone,Tools,"September 16, 2014",1.1.06,1.5 and up
4636,/u/app,Communication,4.7,573,55574528.0,10000,Free,0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up
5940,058.ba,News and magazines,4.4,27,14680064.0,100,Free,0,Everyone,News & Magazines,"July 6, 2018",1.0,4.2 and up
10252,1. FC Köln App,Sports,4.6,2019,42991616.0,100000,Free,0,Everyone,Sports,"July 20, 2018",1.13.0,4.4 and up
1407,10 Best Foods for You,Health and fitness,4.0,2490,3984588.8,500000,Free,0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
8483,10 Minutes a Day Times Tables,Family,4.1,681,50331648.0,100000,Free,0,Everyone,Education,"July 3, 2014",1.2,2.2 and up
7738,10 WPM Amateur ham radio CW Morse code trainer,Communication,3.5,10,3984588.8,100,Paid,$1.49,Everyone,Communication,"May 12, 2018",2.1.4,2.1 and up


In [359]:
# Convert column to numeric
df['Size'] = pd.to_numeric(df['Size'])

In [360]:
# Convert price column to numeric and clean it
df['Price']=pd.to_numeric(df['Price'].str.replace('$','',regex=False))
df.head(10)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
8884,"""i DT"" Fútbol. Todos Somos Técnicos.",Sports,4.19,27,3774873.6,500,Free,0.0,Everyone,Sports,"October 7, 2017",0.22,4.1 and up
8532,+Download 4 Instagram Twitter,Social,4.5,40467,23068672.0,1000000,Free,0.0,Everyone,Social,"August 2, 2018",5.03,4.1 and up
324,- Free Comics - Comic Apps,Comics,3.5,115,9542041.6,10000,Free,0.0,Mature 17+,Comics,"July 13, 2018",5.0.12,5.0 and up
4541,.R,Tools,4.5,259,207872.0,10000,Free,0.0,Everyone,Tools,"September 16, 2014",1.1.06,1.5 and up
4636,/u/app,Communication,4.7,573,55574528.0,10000,Free,0.0,Mature 17+,Communication,"July 3, 2018",4.2.4,4.1 and up
5940,058.ba,News and magazines,4.4,27,14680064.0,100,Free,0.0,Everyone,News & Magazines,"July 6, 2018",1.0,4.2 and up
10252,1. FC Köln App,Sports,4.6,2019,42991616.0,100000,Free,0.0,Everyone,Sports,"July 20, 2018",1.13.0,4.4 and up
1407,10 Best Foods for You,Health and fitness,4.0,2490,3984588.8,500000,Free,0.0,Everyone 10+,Health & Fitness,"February 17, 2017",1.9,2.3.3 and up
8483,10 Minutes a Day Times Tables,Family,4.1,681,50331648.0,100000,Free,0.0,Everyone,Education,"July 3, 2014",1.2,2.2 and up
7738,10 WPM Amateur ham radio CW Morse code trainer,Communication,3.5,10,3984588.8,100,Paid,1.49,Everyone,Communication,"May 12, 2018",2.1.4,2.1 and up
