In [2]:
# Import necessary libraries
import pandas as pd

# Load the dataset
df = pd.read_csv('googleplaystore.csv')

# Display the first few rows
df.head()

# Get an overview of the data structure
df.info()

# See the number of missing values in each column
df.isnull().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10358 entries, 0 to 10357
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            10358 non-null  object 
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  object 
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.0+ MB


App                  0
Category             0
Rating            1465
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sys
import re

# Load the dataset
df = pd.read_csv('googleplaystore.csv')

# Display the first few rows
df.head()

# Get an overview of the data structure
df.info()

# See the number of missing values in each column
df.isnull().sum()


# Drop rows with missing values in 'Rating', as this is the target variable
df.dropna(subset=['Rating'], inplace=True)


# Convert Installs to integers
# using errors='coerce' will convert all non numeric values to NaNs which can then be dealt with
df['Installs'] = df['Installs'].astype(str).str.replace(r'[+,]', '', regex=True).apply(lambda x: int(x) if x.isdigit() else np.nan)
df.dropna(subset=['Installs'], inplace = True)
df['Installs'] = df['Installs'].astype(int)


# Convert Price to floats
df['Price'] = df['Price'].astype(str).str.replace(r'\$', '', regex=True).astype(float,errors='ignore')


 # Convert Size column to numeric (MB)
def convert_size(size):
    if isinstance(size, str):
      if 'M' in size:
         return float(size[:-1])
      elif 'k' in size:
          return float(size[:-1]) / 1024
    return size
df['Size'] = df['Size'].apply(convert_size)

# Replace 'Varies with device' with NaN so it can be dropped if needed.
df['Size'].replace('Varies with device', np.nan, inplace=True)

# Remove rows with NaN sizes
df.dropna(subset=['Size'], inplace = True)

# Convert size column to a float
df['Size'] = df['Size'].astype(float)

# Convert 'Last Updated' to datetime objects
df['Last Updated'] = pd.to_datetime(df['Last Updated'])

# Identify duplicate rows based on all columns
duplicates = df.duplicated()

# Filter out duplicate rows
df = df[~duplicates]

# Display the cleaned data's info to see the result
df.info()
df.isnull().sum()

# Display the first few rows of the processed data
df.head()

# Save the cleaned data to a csv file
df.to_csv('../AI MODEL/processed_data.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10358 entries, 0 to 10357
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10358 non-null  object 
 1   Category        10358 non-null  object 
 2   Rating          8893 non-null   float64
 3   Reviews         10358 non-null  object 
 4   Size            10358 non-null  object 
 5   Installs        10358 non-null  object 
 6   Type            10357 non-null  object 
 7   Price           10358 non-null  object 
 8   Content Rating  10357 non-null  object 
 9   Genres          10358 non-null  object 
 10  Last Updated    10358 non-null  object 
 11  Current Ver     10350 non-null  object 
 12  Android Ver     10355 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.0+ MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Size'].replace('Varies with device', np.nan, inplace=True)
  df['Size'].replace('Varies with device', np.nan, inplace=True)
  df['Last Updated'] = pd.to_datetime(df['Last Updated'])


<class 'pandas.core.frame.DataFrame'>
Index: 7424 entries, 0 to 10357
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   App             7424 non-null   object        
 1   Category        7424 non-null   object        
 2   Rating          7424 non-null   float64       
 3   Reviews         7424 non-null   object        
 4   Size            7424 non-null   float64       
 5   Installs        7424 non-null   int64         
 6   Type            7424 non-null   object        
 7   Price           7424 non-null   float64       
 8   Content Rating  7424 non-null   object        
 9   Genres          7424 non-null   object        
 10  Last Updated    7424 non-null   datetime64[ns]
 11  Current Ver     7420 non-null   object        
 12  Android Ver     7422 non-null   object        
dtypes: datetime64[ns](1), float64(3), int64(1), object(8)
memory usage: 812.0+ KB
