Data profiling is the process of examining and analyzing data to understand its structure, quality, and content. It involves extracting metadata and statistics from a dataset to gain insights into its characteristics and properties. Data profiling is an essential step in data preparation and analysis, especially in data-driven applications and decision-making processes.

In [6]:
!pip install dataprofiler


Collecting dataprofiler
  Downloading DataProfiler-0.10.9-py3-none-any.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
Collecting fastavro>=1.0.0.post1 (from dataprofiler)
  Downloading fastavro-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-snappy>=0.5.4 (from dataprofiler)
  Downloading python_snappy-0.7.1-py3-none-any.whl (8.6 kB)
Collecting HLL>=2.0.3 (from dataprofiler)
  Downloading HLL-2.0.3.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasketches>=4.1.0 (from dataprofiler)
  Downloading datasketches-5.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (678 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m678.5/678.5 kB[0m [31m42.5 MB/s[0m eta [36m0:00:00[0

In [28]:
import json
import pandas as pd
from dataprofiler import Data, Profiler

In [25]:
# Loading a File
data = Data("/content/googleplaystore.csv") # Auto-Detect & Load: CSV, AVRO, Parquet, JSON, Text
data.data.head(5) # Access data directly via a compatible Pandas DataFrame

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [26]:
# Sort the data by the neighbourhood column
host = data.data.sort_values(by='Category')
host.head(5)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
10472,Life Made WI-Fi Touchscreen Photo Frame,1.9,19.0,3.0M,"1,000+",Free,0,Everyone,,"February 11, 2018",1.0.19,4.0 and up,
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
35,How to draw Ladybug and Cat Noir,ART_AND_DESIGN,3.8,564,9.2M,"100,000+",Free,0,Everyone,Art & Design,"July 11, 2018",2.1,4.1 and up
36,UNICORN - Color By Number & Pixel Art Coloring,ART_AND_DESIGN,4.7,8145,24M,"500,000+",Free,0,Everyone,Art & Design;Creativity,"August 2, 2018",1.0.9,4.4 and up
37,Floor Plan Creator,ART_AND_DESIGN,4.1,36639,Varies with device,"5,000,000+",Free,0,Everyone,Art & Design,"July 14, 2018",Varies with device,2.3.3 and up


In [13]:
print(data.data.shape[0])
len(host[host['Category']!=""])

10841


10841

In [14]:
# Profile the dataset
profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc

# Generate a report and use json to prettify.
report  = profile.report(report_options={"output_format":"pretty"})

# Print the report
print(json.dumps(report, indent=4))

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 


INFO:DataProfiler.profilers.profile_builder:Finding the Null values in the columns... 
100%|██████████| 13/13 [00:00<00:00, 85.13it/s]

INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 



INFO:DataProfiler.profilers.profile_builder:Calculating the statistics... 
100%|██████████| 13/13 [00:04<00:00,  2.67it/s]

{
    "global_stats": {
        "samples_used": 5000,
        "column_count": 13,
        "row_count": 10841,
        "row_has_null_ratio": 0.136,
        "row_is_null_ratio": 0.0,
        "unique_row_ratio": 0.9554,
        "duplicate_row_count": 483,
        "file_type": "csv",
        "encoding": "utf-8",
        "correlation_matrix": null,
        "chi2_matrix": "[[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], ... , [nan,  0.,  0., nan,  0.,  0.,  0.,  0.,  0.,  0., nan, nan,  1.]]",
        "profile_schema": {
            "App": [
                0
            ],
            "Category": [
                1
            ],
            "Rating": [
                2
            ],
            "Reviews": [
                3
            ],
            "Size": [
                4
            ],
            "Installs": [
                5
            ],
            "Type": [
                6
            ],
            "Price": [
                7
            ],
      




In [17]:
profile = Profiler(data) # Calculate Statistics, Entity Recognition, etc

INFO:DataProfiler.profilers.profile_builder: Finding the Null values in the columns... 


INFO:DataProfiler.profilers.profile_builder:Finding the Null values in the columns... 
100%|██████████| 13/13 [00:00<00:00, 80.39it/s]

INFO:DataProfiler.profilers.profile_builder: Calculating the statistics... 



INFO:DataProfiler.profilers.profile_builder:Calculating the statistics... 
100%|██████████| 13/13 [00:04<00:00,  2.92it/s]


In [33]:
data.describe(include = 'all')

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,10841.0,10841,10841,10841,10841,10841,10841,10841,10841,10841,10841
unique,9660,34,41.0,6002,462,22,4,93,7,120,1378,2834,35
top,ROBLOX,FAMILY,,0,Varies with device,"1,000,000+",Free,0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,1474.0,596,1695,1579,10039,10040,8714,842,326,1459,2451


In [34]:
data.columns

Index(['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type',
       'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver',
       'Android Ver'],
      dtype='object')

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [56]:
import pandas as pd

def check_csv_for_issues(df):
    missing_values = df.isnull().sum()
    duplicate_rows = df.duplicated().sum()
    inconsistent_types = df.applymap(type).nunique() > 1

    inconsistent_formats = not pd.to_datetime(df['Last Updated'], errors='coerce').notnull().all()
    rating_outliers = (df['Rating'] < 0) | (df['Rating'] > 5)

    issues_found = False

    if missing_values.any():
        issues_found = True
        print("Columns with missing values:")
        print(missing_values[missing_values > 0])

    if duplicate_rows > 0:
        issues_found = True
        print("Duplicates exists")
    if inconsistent_types.any():
        issues_found = True
        print("Columns with inconsistent data types:")
        print(inconsistent_types[inconsistent_types].index.tolist())
    if inconsistent_formats:
        issues_found = True
        print("Columns with inconsistent data formats:")
        print("You need to customize this part based on your specific data formats")

    if rating_outliers.any():
        issues_found = True
        print("Columns with rating outliers:")
        print("Rating column is affected by rating outliers.")

    if not issues_found:
        print("No issues found in the CSV file. Data cleaning is not needed.")

csv_file = '/content/googleplaystore.csv'

df = pd.read_csv(csv_file)

check_csv_for_issues(df)


Columns with missing values:
Rating            1474
Type                 1
Content Rating       1
Current Ver          8
Android Ver          3
dtype: int64
Duplicates exists
Columns with inconsistent data types:
['Type', 'Content Rating', 'Current Ver', 'Android Ver']
Columns with inconsistent data formats:
You need to customize this part based on your specific data formats
Columns with rating outliers:
Rating column is affected by rating outliers.
