# Mobile Price Prediction data preprocessing

## Imports

In [1]:
import pandas as pd
import re
from IPython.display import display

## Load the dataset

In [2]:
df = pd.read_csv('mobile_data.csv')

## List of columns to keep

In [3]:
columns_to_keep = [
    'Rating', 'Spec_score', 'No_of_sim', 'Ram', 'Battery', 'Display', 'Camera', 
    'External_Memory', 'Android_version', 'Price', 'Inbuilt_memory', 
    'fast_charging', 'Screen_resolution', 'Processor', 'Processor_name'
]
df = df[columns_to_keep]

## Preprocessing 'No_of_sim' Column
- Find all unique features in the 'No_of_sim' column

In [4]:
unique_features = set()
df['No_of_sim'].apply(lambda x: unique_features.update(x.split(', ')))

0       None
1       None
2       None
3       None
4       None
        ... 
1365    None
1366    None
1367    None
1368    None
1369    None
Name: No_of_sim, Length: 1370, dtype: object

- Remove empty string from unique features

In [5]:
unique_features.discard('')

- Create boolean columns for each unique feature

In [6]:
for feature in unique_features:
    df[feature] = df['No_of_sim'].apply(lambda x: feature in x)

- Drop the original 'No_of_sim' column as it's no longer needed

In [7]:
df.drop('No_of_sim', axis=1, inplace=True)

## Preprocessing 'Ram' Column
- Keep only numeric value

In [8]:
df['Ram'] = df['Ram'].apply(lambda x: int(re.search(r'\d+', x).group()))

## Preprocessing 'Battery' Column
- Keep only numeric value

In [9]:
df['Battery'] = df['Battery'].apply(lambda x: int(re.search(r'\d+', x).group()))

## Preprocessing 'Display' Column
- Keep only numeric value

In [10]:
df['Display'] = df['Display'].apply(lambda x: float(re.search(r'\d+(\.\d+)?', x).group()))

## Preprocessing 'Camera' Column
- Functions

In [11]:
def extract_rear_camera_info(camera_str):
    rear_cameras = re.findall(r'(\d+) MP', camera_str.split('&')[0])
    rear_camera_count = len(rear_cameras)
    highest_rear_camera = max(map(int, rear_cameras)) if rear_cameras else None
    return rear_camera_count, highest_rear_camera

def extract_front_camera_info(camera_str):
    front_cameras = re.findall(r'(\d+) MP Front', camera_str)
    front_camera_count = len(front_cameras)
    highest_front_camera = max(map(int, front_cameras)) if front_cameras else None
    return front_camera_count, highest_front_camera

- Apply the functions to create new columns

In [12]:
df['Rear_Camera_Count'], df['Highest_Rear_Camera'] = zip(*df['Camera'].apply(extract_rear_camera_info))
df['Front_Camera_Count'], df['Highest_Front_Camera'] = zip(*df['Camera'].apply(extract_front_camera_info))

- Drop the original 'Camera' column as it's no longer needed

In [13]:
df.drop('Camera', axis=1, inplace=True)

## Preprocessing 'External_Memory' Column
- Function to determine if memory card is supported

In [14]:
def is_memory_card_supported(memory_str):
    return 'Memory Card Supported' in memory_str or 'Memory Card (Hybrid)' in memory_str

- Function to determine if memory card slot is hybrid

In [15]:
def is_memory_card_hybrid(memory_str):
    return 'Memory Card (Hybrid)' in memory_str

- Function to extract the maximum memory card limit

In [16]:
def extract_memory_card_limit(memory_str):
    match = re.search(r'(\d+)\s?(GB|TB)', memory_str)
    if match:
        size = int(match.group(1))
        unit = match.group(2)
        return size * 1024 if unit == 'TB' else size
    return None

- Create new columns based on the above functions

In [17]:
df['Memory_Card_Supported'] = df['External_Memory'].apply(is_memory_card_supported)
df['Memory_Card_Hybrid'] = df['External_Memory'].apply(is_memory_card_hybrid)
df['Memory_Card_Max_Limit'] = df['External_Memory'].apply(extract_memory_card_limit)

- Drop the original 'External_Memory' column as it's no longer needed

In [18]:
df.drop('External_Memory', axis=1, inplace=True)

## Preprocessing 'Android_version' Column
- Function

In [19]:
def extract_android_version(version_str):
    if pd.notnull(version_str):
        version_str = str(version_str)
        match = re.search(r'\d+(\.\d+)?', version_str)
        if match:
            return float(match.group(0))
    return None

- Keep only numeric value

In [20]:
df['Android_version'] = df['Android_version'].apply(extract_android_version)

## Preprocessing 'Price' Column
- Function

In [21]:
def convert_price(price_str):
    if pd.notnull(price_str):
        # Remove commas and convert to integer
        price_str = str(price_str).replace(',', '')
        return int(price_str)
    return None

- Convert to numeric value

In [22]:
df['Price'] = df['Price'].apply(convert_price)

## Preprocessing 'Inbuilt_memory' Column
- Function

In [23]:
def extract_inbuilt_memory(memory_str):
    if pd.notnull(memory_str):
        match = re.search(r'\d+', memory_str)
        if match:
            return int(match.group(0))
    return None

- Keep only numeric value

In [24]:
df['Inbuilt_memory'] = df['Inbuilt_memory'].apply(extract_inbuilt_memory)

## Preprocessing 'fast_charging' Column
- Function

In [25]:
def extract_fast_charging(charging_str):
    if pd.notnull(charging_str):
        match = re.search(r'\d+', charging_str)
        if match:
            return int(match.group(0))
    return None

- Keep only numeric value

In [26]:
df['fast_charging'] = df['fast_charging'].apply(extract_fast_charging)

## Preprocessing 'Screen_resolution' Column
- Function

In [27]:
def extract_resolution_only(res_str):
    if pd.notnull(res_str):
        match = re.search(r'(\d+\s+x\s+\d+)', res_str)
        if match:
            return match.group(1)
    return None

- Keep only width x height part

In [28]:
df['Screen_resolution'] = df['Screen_resolution'].apply(extract_resolution_only)

## Preprocessing 'Processor' Column
- Function to convert text to number

In [29]:
def replace_processor(processor_str):
    if isinstance(processor_str, str):
        if 'Octa Core' in processor_str:
            return 8
        elif 'Nine' in processor_str:
            return 9
        elif 'Deca Core' in processor_str:
            return 10
    return None

- Change the column name from Processor to core

In [30]:
df['core'] = df['Processor'].apply(replace_processor)

- Drop the old 'Processor' column

In [31]:
df.drop('Processor', axis=1, inplace=True)

## Preprocessing 'Processor_name' Column
- Define popular processor family

In [32]:
known_processors = [
    'Exynos', 'Helio', 'Dimensity', 'Snapdragon', 'Tensor', 'Kirin', 'Unisoc', 'Tiger'
]

- Function to extract processor family

In [33]:
def extract_processor_family(name):
    for processor in known_processors:
        if processor in name:
            return processor
    return ''

- replace with processor family

In [34]:
df['Processor_name'] = df['Processor_name'].apply(extract_processor_family)

## Download the preprocessed csv file

In [35]:
df.to_csv('preprocessed_mobile_data.csv', index=False)