# Data Cleaning

In [396]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [397]:
# Load the uncleaned dataset that the following cells operate on.
raw_csv_path = "datasets/data.csv"
data = pd.read_csv(raw_csv_path)

#### Extract product model from title

In [398]:
def model_extractor(name: str) -> str:
    """Extract the iPhone model designation from a listing title.

    The title is lower-cased and everything after the last occurrence of
    "iphone" is inspected. Depending on the leading tokens the result is a
    bare model ("7", "6s", "xr"), a model with a "pro"/"max" suffix
    ("11 pro", "11 pro max") or, when nothing matches, the whole remainder
    unchanged (including any surrounding whitespace).

    Args:
        name: Listing title.

    Returns:
        The extracted model string. It is not stripped.
    """
    # Models that are always reported as the first token alone.
    standalone_models = (
        '8plus', 'se', '5', '5s', '5c', '6', '6s', '6c', '7', '7s', '7c', '8+',
    )
    # Models that may carry a "pro" and/or "max" suffix.
    suffixable_models = (
        'xr', 'x', 'xs', '5', '6', '7', '8', '9', '10', '11', '12',
    )

    brand = name.lower().split("iphone")[-1]
    tokens = brand.split()

    # A one- or two-character remainder (e.g. " 7") is already the model.
    if len(brand) in (1, 2):
        return brand
    # Every remaining rule needs at least two tokens.
    if len(tokens) < 2:
        return brand

    first, second = tokens[0], tokens[1]

    if first in standalone_models:
        return first
    # A purely numeric second token is not part of the model name.
    if second.isdigit():
        return first
    if first not in suffixable_models:
        return brand

    if second in ('pro', 'max'):
        # Check for the companion suffix explicitly. A generic isalnum() test
        # would also match "max"/"pro" and hide the three-word model names.
        companion = 'max' if second == 'pro' else 'pro'
        if len(tokens) > 2 and tokens[2] == companion:
            return f"{first} {second} {tokens[2]}"
        return f"{first} {second}"
    if second.isalnum():
        return first
    return brand

# Derive the model column from each listing title.
listing_titles = data["name"]
data["model"] = listing_titles.apply(model_extractor)

In [399]:
# Trim surrounding whitespace left over from the extraction step.
data["model"] = data["model"].map(str.strip)

In [400]:
# Discard rows whose extracted model is longer than 15 characters.
overlong_model = data["model"].str.len() > 15
data.drop(index=data.index[overlong_model], inplace=True)

#### Remove "Capacitors type" junk from all columns

In [401]:
def _drop_capacitors_junk(value):
    """Remove the "Capacitors type" text from string cells; pass others through."""
    if isinstance(value, str):
        return value.replace("Capacitors type", "")
    return value


for column_name in data.columns:
    data[column_name] = data[column_name].apply(_drop_capacitors_junk)

#### Format price column

In [402]:
def is_float(text: str) -> bool:
    """Report whether ``float(text)`` succeeds.

    Args:
        text: Candidate value, normally a string.

    Returns:
        True when ``float`` accepts the value, False otherwise. Values that
        ``float`` rejects with a ``TypeError`` (for example ``None``) are
        reported as False instead of propagating the error.
    """
    try:
        float(text)
    except (TypeError, ValueError):
        return False
    return True

def _parse_price(raw):
    """Convert a price cell to float when possible.

    The "\ue600" character is stripped from both ends before parsing. Cells
    that are not strings (e.g. NaN) and strings that do not parse as a float
    are returned unchanged.
    """
    if not isinstance(raw, str):
        # Missing or already-numeric cells have no .strip(); leave them alone.
        return raw
    cleaned = raw.strip("\ue600")
    return float(cleaned) if is_float(cleaned) else raw


data["price"] = data["price"].apply(_parse_price)

In [403]:
# Listings whose price is the literal text "Price negotiable" have no usable
# number; mark them as missing.
data.loc[data.price == "Price negotiable", "price"] = pd.NA

# Assign through a single .loc[row, column] call. `data.loc[198].model = ...`
# writes to a temporary row Series and is not guaranteed to reach `data`.
data.loc[198, "model"] = '8'
data.loc[198, "price"] = data[data.model == '8'].price.mean()

data.loc[357, "price"] = data[data.model == 'x'].price.mean()
# Remove the row at position 279 (positional lookup, then drop by its label).
data.drop(data.index[279], inplace=True)
# Notebook display: rows that still have no price.
data[data.price.isna()]

Unnamed: 0,name,price,3G Network,5G,Battery Capacity,Card slot,Display type,GPS,Internal Memory,LTE (4G) Network,...,Screen size,SIM Multiple,Sim Size,Touch Screen,Wi-Fi,Year,OS version,Selfie Camera,IP Certificate,model


In [404]:
# Convert every price cell to float.
data["price"] = data["price"].apply(float)

#### Format number of cameras

In [405]:
def format_number_of_cameras(cameras: str):
    """Translate a spelled-out camera count into an integer.

    The lookup is case-insensitive. "no" maps to 0 and "one" through "six"
    map to 1-6; any other text yields ``pd.NA``.
    """
    count_by_word = {
        'no': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
    }
    return count_by_word.get(cameras.lower(), pd.NA)

# Replace the spelled-out counts with integers (pd.NA for unknown text).
camera_counts = data["Number of cameras"].apply(format_number_of_cameras)
data["Number of cameras"] = camera_counts

In [406]:
# Rows reporting more than three cameras are overwritten with the most common
# camera count among rows of the same model.
too_many_cameras = data[data["Number of cameras"] > 3]
for idx, record in too_many_cameras.iterrows():
    same_model = data.model == record["model"]
    typical_count = data[same_model]["Number of cameras"].mode()[0]
    data.loc[idx, "Number of cameras"] = typical_count

#### Format Internal Memory

In [407]:
# Fill gaps with the most frequent value. The result is assigned back instead
# of calling fillna(inplace=True) on data[...]: that form works on an
# intermediate object and does not update `data` under pandas Copy-on-Write.
most_common_memory = data['Internal Memory'].mode()[0]
data['Internal Memory'] = data['Internal Memory'].fillna(most_common_memory)
# Drop the "gb" suffix (case-insensitive) and convert to int.
data['Internal Memory'] = data["Internal Memory"].apply(lambda x: int(x.lower().replace("gb", "")))

#### Format Battery Capacity

In [408]:
# Convert every battery capacity cell to int.
data['Battery Capacity'] = data['Battery Capacity'].apply(int)

#### Format RAM

In [409]:
def format_ram(ram: str):
    """Return the RAM size in gigabytes as a float.

    The text is handled case-insensitively. When it contains "mb", the number
    before the first "mb" is divided by 1024; otherwise "gb" is removed and
    the remainder is parsed directly.
    """
    text = ram.lower()
    if 'mb' not in text:
        return float(text.replace('gb', '').strip())
    megabytes = text.partition('mb')[0].strip()
    return float(megabytes) / 1024



# Fill gaps with the most frequent value. The result is assigned back instead
# of calling fillna(inplace=True) on data[...]: that form works on an
# intermediate object and does not update `data` under pandas Copy-on-Write.
most_common_ram = data['RAM'].mode()[0]
data['RAM'] = data['RAM'].fillna(most_common_ram)
data['RAM'] = data["RAM"].apply(format_ram)

#### Format Primary and Selfie cameras

In [410]:
def _fill_missing_from_model_mode(frame, column):
    """Fill missing `column` cells with the most frequent value for the same model.

    For each row where `column` is missing, the rows sharing its `model` are
    looked up and the first mode of `column` among them is written in. Rows
    whose model has no known value for `column` are left missing.
    """
    for idx, row in frame[frame[column].isna()].iterrows():
        candidates = frame.loc[frame["model"] == row["model"], column].mode()
        # A single unambiguous mode must be accepted as well, so test for a
        # non-empty result rather than for more than one candidate.
        if not candidates.empty:
            frame.loc[idx, column] = candidates.iloc[0]


for camera_column in ('Primary Camera', 'Selfie Camera'):
    _fill_missing_from_model_mode(data, camera_column)

In [412]:
def _megapixels(value):
    """Parse a camera resolution: remove "mp" (case-insensitive) and convert to int."""
    return int(value.lower().replace('mp', '').strip())


# Drop rows still missing a camera value, first by primary then by selfie.
for camera_col in ('Primary Camera', 'Selfie Camera'):
    missing_camera = data[camera_col].isna()
    data.drop(index=data.index[missing_camera], inplace=True)

# Then convert both columns to integers.
for camera_col in ('Primary Camera', 'Selfie Camera'):
    data[camera_col] = data[camera_col].apply(_megapixels)

#### Format Screen Size

In [422]:
# Remove each unit/qualifier text in turn (lower-casing and trimming on every
# pass), then convert what is left to float.
for unwanted_text in ('inch', 'ორ less'):
    data['Screen size'] = data['Screen size'].apply(
        lambda cell, unwanted_text=unwanted_text: cell.lower().replace(unwanted_text, '').strip()
    )
data['Screen size'] = data['Screen size'].apply(float)

In [426]:
# Write the cleaned frame to disk (the index is written as well).
cleaned_csv_path = "datasets/cleaned_data.csv"
data.to_csv(cleaned_csv_path)