# In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix



# Load dataset

def load_dataset(path="googleplaystore.csv"):
    """Read the CSV file at *path* and return its contents as a DataFrame."""
    return pd.read_csv(path)



# Summarize dataset

def print_summarize_dataset(dataset):
    """Print shape, column names, null counts, dtypes and first rows of *dataset*."""
    # Each entry is (label, value); print() joins the two with a single space,
    # which is why the multi-line sections start with one leading blank.
    report = (
        ("Shape:", dataset.shape),
        ("\nColumns:", dataset.columns.tolist()),
        ("\nMissing values:\n", dataset.isnull().sum()),
        ("\nData types:\n", dataset.dtypes),
        ("\nHead:\n", dataset.head()),
    )
    for label, value in report:
        print(label, value)



# Clean dataset

def _size_to_num(value):
    """Convert a size label such as ``"19M"`` or ``"201k"`` to a float.

    "M" scales by 1,000,000 and "k" by 1,000; a plain number is returned
    unscaled. Anything that cannot be parsed (for example
    ``"Varies with device"``) yields NaN so the row can be dropped later.
    """
    text = str(value).strip()
    if "M" in text:
        number, factor = text.replace("M", ""), 1_000_000
    elif "k" in text:
        number, factor = text.replace("k", ""), 1_000
    else:
        number, factor = text, 1
    try:
        return float(number) * factor
    except ValueError:
        # NaN (rather than None) keeps the resulting column float-typed.
        return float("nan")


def clean_dataset(dataset):
    """Return a de-duplicated copy of *dataset* with numeric Installs/Price/Size.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. ``Installs``: strip "+" and "," and convert to a number.
      3. ``Price``: strip the literal "$" and convert to a number.
      4. ``Size``: convert "M"/"k" suffixed labels to a number.
      5. Drop every row that has a missing value in any column, which
         includes values that failed one of the conversions above.

    The input frame is not modified.

    Raises:
        KeyError: if the ``Installs``, ``Price`` or ``Size`` column is absent.
    """
    # Work on an explicit copy: assigning columns on the frame returned by
    # drop_duplicates() can trigger SettingWithCopyWarning.
    dataset = dataset.drop_duplicates().copy()

    installs = dataset["Installs"].astype(str).str.replace(
        r"[+,]", "", regex=True
    )
    dataset["Installs"] = pd.to_numeric(installs, errors="coerce")

    # "$" must be matched literally. As a regular expression it is the
    # end-of-string anchor, so the sign would stay in place, the conversion
    # would yield NaN, and every paid app would be dropped below.
    price = dataset["Price"].astype(str).str.replace("$", "", regex=False)
    dataset["Price"] = pd.to_numeric(price, errors="coerce")

    dataset["Size"] = dataset["Size"].apply(_size_to_num)

    # Drop rows with missing or invalid values.
    return dataset.dropna()



# Histograms

def print_histograms(dataset):
    """Show a 50-bin histogram grid built by ``dataset.hist``."""
    dataset.hist(figsize=(20, 15), bins=50)
    plt.tight_layout()
    plt.show()



# Correlation matrix

def compute_correlations_matrix(dataset):
    """Compute pairwise correlations, show them as a heatmap, and return them.

    Only numeric and boolean columns take part. Since pandas 2.0
    ``DataFrame.corr`` no longer drops non-numeric columns by default and
    raises when it meets one, so the frame is filtered explicitly first.
    The filter also works on older pandas versions.

    Returns:
        The correlation matrix as a DataFrame, as returned by ``corr()``.
    """
    numeric = dataset.select_dtypes(include=["number", "bool"])
    corr = numeric.corr()
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Matrix")
    plt.show()
    return corr



# Scatter matrix

def print_scatter_matrix(dataset):
    """Show a scatter-plot matrix of five columns with KDE diagonals.

    The columns are Rating, Reviews, Size, Installs and Price; a missing
    column raises KeyError from the selection.
    """
    columns = ["Rating", "Reviews", "Size", "Installs", "Price"]
    subset = dataset[columns]
    scatter_matrix(subset, figsize=(15, 10), diagonal="kde")
    plt.show()



# Main

if __name__ == "__main__":
    # Full run: load, report, clean, report again, then the three plots.
    raw_dataset = load_dataset("googleplaystore.csv")
    print("==== Before Cleaning ====")
    print_summarize_dataset(raw_dataset)

    dataset = clean_dataset(raw_dataset)
    print("\n==== After Cleaning ====")
    print_summarize_dataset(dataset)

    print("\n==== Histograms ====")
    print_histograms(dataset)

    # Keep the matrix bound to a name so it stays available after the run.
    print("\n==== Correlation Matrix ====")
    correlations = compute_correlations_matrix(dataset)

    print("\n==== Scatter Matrix ====")
    print_scatter_matrix(dataset)


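# Notebook output: ModuleNotFoundError: No module named 'seaborn'
# (seaborn must be installed before running this script, e.g. `pip install seaborn`)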