# Importing Libraries

In [None]:
#importing necessary libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

# Importing the data

In [None]:
#loading the dataset from a local CSV file into a DataFrame
#NOTE(review): the path is absolute and machine-specific - adjust it when running elsewhere
google_app = pd.read_csv(r"C:\Users\EliteBook\Desktop\WTCF23\Data Set\googleplaystore.csv")
#previewing the first rows
google_app.head()

# Changing the column names to lower case

In [None]:
#normalising every column label to lower case
google_app.columns = list(map(str.lower, google_app.columns))

# Exploring the data

In [None]:
#Checking the number of rows and columns in our data (row, column)
google_app.shape

In [None]:
#defining a function to get info on our data into a new dataframe for simplicity
def get_info(df=None):
    """Build a per-column summary of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        The frame to summarise. When omitted, the module-level
        ``google_app`` frame is used, so existing ``get_info()`` calls
        keep working.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with these columns:
        ``null values`` (missing values in that column), ``unique``
        (distinct non-null values), ``duplicates`` (number of fully
        duplicated rows in the frame, repeated on every row), ``size``
        (total number of cells in the frame, repeated on every row) and
        ``data types`` (the column dtype).
    """
    if df is None:
        # looked up at call time so the summary always reflects the current frame
        df = google_app
    info_df = pd.DataFrame(index=df.columns)
    info_df["null values"] = df.isnull().sum()
    info_df["unique"] = df.nunique()
    # scalar values below are broadcast to every row of the summary
    info_df["duplicates"] = df.duplicated().sum()
    info_df["size"] = df.size
    info_df["data types"] = df.dtypes
    return info_df
get_info()

# Data Cleaning

## Removing duplicates

In [None]:
#keeping only the first occurrence of every fully duplicated row
google_app = google_app.drop_duplicates()

In [None]:
#calling the get_info() function to confirm duplicates are off
get_info()

## Checking and Removing special characters

In [None]:
#checking for special characters in the installs column
google_app["installs"].str.isalpha().unique()

In [None]:
#removing special characters (anything that is not a letter, digit or underscore) from the installs column
#the pattern is a raw string so that \W reaches the regex engine instead of being read as an invalid string escape
google_app["installs"] = google_app["installs"].replace(r"\W", "", regex = True)

#listing the unique items in installs to confirm the special characters have been taken off
google_app["installs"].unique()

In [None]:
#checking for special characters in the price column
google_app["price"].str.isalpha().unique()

In [None]:
#removing special characters (anything that is not a letter, digit or underscore) from the price column
#the pattern is a raw string so that \W reaches the regex engine instead of being read as an invalid string escape
#NOTE(review): \W also matches ".", so a value such as "$4.99" would become "499" - confirm this is intended,
#because the later astype(int) cast on price would then hold 499 rather than 4.99
google_app["price"] = google_app["price"].replace(r"\W", "", regex = True)

#listing the unique items in the price column to confirm the special characters have been taken off
google_app["price"].unique()

In [None]:
#checking for special characters in the size column
#Note that a DataFrame already has a .size attribute (the total number of cells), so google_app.size returns that number instead of the size column; use google_app["size"]
google_app["size"].str.isalpha().unique()

In [None]:
#checking for special characters in the reviews column
google_app["reviews"].str.isalpha().unique()

In [None]:
get_info()

# Handling NaN

In [None]:
#showing the rows where rating is missing but installs is present
#(the two adjacent string literals are joined by Python into a single query expression)
google_app.query("rating.isnull() &" "installs.notnull()")

In [None]:
#checking for unique values present in the type column
google_app["type"].unique()

In [None]:
#taking a look at the type index with NaN
google_app[google_app["type"].isnull()]


#from the output below, reviews, price and installs all show zero, which suggests that the app was not downloaded by users

In [None]:
#dropping every row whose type is NaN
#selecting the rows by their missing value instead of a hard-coded index label keeps this cell working
#when the row labels differ (drop(index = ...) raises KeyError for a label that is not present)
google_app.dropna(subset = ["type"], inplace = True)

In [None]:
#checking for unique values present in the content rating column
google_app["content rating"].unique()

In [None]:
#taking a look at the content rating index with NaN
google_app[google_app["content rating"].isnull()]

#the row in the output below looks shifted

In [None]:
#df subsetting
#google_app[google_app["category"] == "Life Made WI-Fi Touchscreen Photo Frame"]

In [None]:
#dropping every row whose content rating is NaN (the shifted row found above)
#selecting the rows by their missing value instead of a hard-coded index label keeps this cell working
#when the row labels differ (drop(index = ...) raises KeyError for a label that is not present)
google_app.dropna(subset = ["content rating"], inplace = True)

In [None]:
#gives you a list of the indexes
#google_app[google_app["current ver"].isnull()].index.tolist()

In [None]:
#checking for unique values present in the current ver column
google_app["current ver"].unique()

In [None]:
#taking a look at the current ver index with NaN
google_app[google_app["current ver"].isnull()]

In [None]:
#checking for unique values present in the android ver column
google_app["android ver"].unique()

In [None]:
#taking a look at the android ver index with NaN
google_app[google_app["android ver"].isnull()]

In [None]:
#current ver and android ver both hold objects, so their NaNs are replaced with a placeholder string
for version_col in ("current ver", "android ver"):
    google_app[version_col] = google_app[version_col].fillna("N/A")

In [None]:
#checking for unique values present in the rating column
google_app["rating"].unique()

In [None]:
#Using zero for every NaN in the rating column: more than 1000 rows are affected, so dropping them may not be the best option
#The result is assigned back to the column because an in-place fillna on a column selection is a chained
#assignment, which pandas warns about and which has no effect on google_app under Copy-on-Write
google_app["rating"] = google_app["rating"].fillna(0)

In [None]:
get_info()

# Changing the data type for some columns

In [None]:
#casting the cleaned installs strings to integers
google_app = google_app.astype({"installs": int})

In [None]:
#casting the cleaned price strings to integers
google_app = google_app.astype({"price": int})

In [None]:
#casting the reviews strings to integers
google_app = google_app.astype({"reviews": int})

In [None]:
#parsing the last updated column into pandas datetime values
google_app["last updated"] = pd.to_datetime(google_app["last updated"])
#optional: format the parsed dates as day-month-year strings
#google_app["last updated"].dt.strftime('%d%m%y')

In [None]:
google_app

In [None]:
get_info()

In [None]:
#checking the unique values present in the size column
google_app["size"].unique()

In [None]:
#rewriting the "M" (mega) suffix as a 10^6 exponent so the value can later be parsed as a number
#.str.replace is vectorised and leaves missing values untouched, whereas a per-element lambda would raise on them
google_app["size"] = google_app["size"].str.replace("M", "e+6", regex = False)

In [None]:
#rewriting the "k" (kilo) suffix as a 10^3 exponent so the value can later be parsed as a number
#e+3 (not e-3) is used so that k values end up in the same unit as the M values, which are scaled by e+6
#.str.replace is vectorised and leaves missing values untouched, whereas a per-element lambda would raise on them
google_app["size"] = google_app["size"].str.replace("k", "e+3", regex = False)

In [None]:
#replacing 'Varies with device' with NaN
#np.nan is used because the np.NaN alias was removed in NumPy 2.0
google_app["size"] = google_app["size"].replace("Varies with device", np.nan)

In [None]:
#confirming the size column after imputing NaN
get_info()

In [None]:
#calling the google_app df to see NaN in the size column
google_app[google_app["size"].isnull()]

In [None]:
#dropping the rows with a null size: filling them with the mean, mode or median may not be ideal,
#because apps with a high or low rating can each be large or small
google_app = google_app.dropna(subset = ["size"])

In [None]:
#confirming NaN in the size column has been dropped by calling the get_info function
get_info()

In [None]:
#converting the size column to numeric
google_app["size"] = pd.to_numeric(google_app["size"])

In [None]:
google_app["size"].unique()

In [None]:
#saving the cleaned data in csv format (the row index is written as the first, unnamed column)
google_app.to_csv("google_play_store_dataset_cleaned.csv")