# Import libraries


In [22]:
import pandas as pd
import os
import numpy as np
import re

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# Define extract functions


# DISPLAY SECTION


## Type


In [23]:
def handleNull_Type(data: pd.DataFrame):
    """Fill missing ``DISPLAY_Type`` values for a handful of named phones.

    A row is updated only when its ``DISPLAY_Type`` is null and its ``Name``
    equals one of the models listed below. ``data`` is modified in place and
    nothing is returned.
    """
    # Phone name -> display type to write when the value is missing.
    types_by_phone = {
        'Motorola V560': 'TFT',
        'Sendo S1': 'TFT',
        'Vertu Diamond': 'TFT',
        'Vertu Ascent': 'Graphical, TFD',
    }

    for phone_name, display_type in types_by_phone.items():
        missing_for_phone = data['DISPLAY_Type'].isnull() & (
            data['Name'] == phone_name)
        data.loc[missing_for_phone, 'DISPLAY_Type'] = display_type


def unifyType(data: pd.DataFrame):
    """Collapse free-text ``DISPLAY_Type`` values into a fixed label set.

    The column is lower-cased, then the rules below are applied in order.
    Each rule is a regular expression searched in the current cell text;
    a match overwrites the cell with the rule's label. Labels contain
    upper-case letters while patterns are lower-case, so a cell that has
    already been labelled is not matched again by a later rule (the first
    matching rule wins). Anything left unlabelled, including missing
    values, becomes ``'Unknown'``.

    ``data`` is modified in place; nothing is returned.
    """
    # (pattern searched in the lower-cased text, label), in priority order.
    rules = [
        ('tft', 'TFT'),
        ('tfd', 'TFD'),
        ('ips', 'IPS'),
        ('oled', 'OLED'),
        ('cstn', 'CSTN'),
        ('fstn', 'FSTN'),
        ('stn', 'STN'),
        ('tn', 'TN'),
        ('pls', 'PLS'),
        ('alphanumeric', 'Alphanumeric'),
        ('monochrome', 'Monochrome'),
        ('grayscale|greyscale', 'Grayscale'),
        ('backlit', 'Backlit'),
        ('s-lcd|super lcd|super-lcd|superlcd', 'S-LCD'),
        ('color', 'Color'),
        ('lcd|crystal|mva|pureled', 'LCD'),
    ]

    data['DISPLAY_Type'] = data['DISPLAY_Type'].str.lower()

    for pattern, label in rules:
        # na=False keeps the mask strictly boolean, so rows with a missing
        # display type never break the .loc assignment.
        matches = data['DISPLAY_Type'].str.contains(pattern, na=False)
        data.loc[matches, 'DISPLAY_Type'] = label

    # Everything that did not receive a label (missing values included)
    # is 'Unknown'. The column is assigned back instead of using a chained
    # in-place replace, which does not reliably update the frame.
    known_labels = [label for _, label in rules]
    data['DISPLAY_Type'] = data['DISPLAY_Type'].where(
        data['DISPLAY_Type'].isin(known_labels), 'Unknown')


def extract_display_type(data: pd.DataFrame, inplace: bool = False):
    """Apply ``unifyType`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy and the caller's frame is left alone; otherwise ``data`` itself is
    passed to ``unifyType`` and returned.
    """
    frame = data.copy() if inplace == False else data

    # Null filling stays disabled: handleNull_Type(frame)
    unifyType(frame)
    return frame

## Size


In [24]:
def handleNull_Size(data: pd.DataFrame):
    """Fill unusable ``DISPLAY_Size`` entries with the column's mode.

    An entry whose text does not contain ``'inch'`` is treated as missing.
    Missing entries are then replaced by the most frequent remaining value.
    Only the ``DISPLAY_Size`` column is touched; ``data`` is modified in
    place and nothing is returned.
    """
    sizes = data['DISPLAY_Size']

    # Restrict the "no 'inch' -> null" rule to this column. Replacing on the
    # whole frame would also blank equal values in unrelated columns.
    mentions_inch = sizes.map(lambda value: 'inch' in str(value)).astype(bool)
    sizes = sizes.where(mentions_inch)

    # mode() is empty when no usable value exists; leave the nulls alone then.
    modes = sizes.mode()
    if not modes.empty:
        sizes = sizes.fillna(modes.iloc[0])

    # Assign back rather than calling fillna(inplace=True) on a selection.
    data['DISPLAY_Size'] = sizes


def unifySize(data: pd.DataFrame):
    """Convert ``DISPLAY_Size`` text into a float number of inches.

    For every entry the text before the first ``' inch'`` is parsed as a
    number. Entries that do not mention ``'inch'`` at all, or whose leading
    text is not numeric, become NaN. The resulting column is float.

    Only the ``DISPLAY_Size`` column is touched; ``data`` is modified in
    place and nothing is returned.
    """
    # Work on a text view of the column so the row index plays no role
    # (the frame does not need a 0..n-1 index).
    text = data['DISPLAY_Size'].astype(str)

    # An entry without 'inch' is considered null. This is limited to this
    # column; a frame-wide replace would blank equal values elsewhere.
    mentions_inch = text.str.contains('inch', regex=False, na=False)

    # Keep what precedes ' inch' and parse it; unparseable text becomes NaN
    # instead of leaving a mixed string/float column behind.
    leading_text = text.str.split(' inch').str[0]
    inches = pd.to_numeric(leading_text, errors='coerce')

    data['DISPLAY_Size'] = inches.where(mentions_inch).astype(float)


def extract_display_size(data: pd.DataFrame, inplace: bool = False):
    """Apply ``unifySize`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy; otherwise ``data`` itself is passed to ``unifySize`` and returned.
    """
    frame = data.copy() if inplace == False else data

    # Null filling stays disabled: handleNull_Size(frame)
    unifySize(frame)
    return frame

## Resolution


In [25]:
# Dien null bang gia tri mode
def handleNull_Res(data: pd.DataFrame):
    """Fill missing ``DISPLAY_Resolution`` values with the column's mode.

    ``data`` is modified in place; nothing is returned. When the column has
    no non-null value there is no mode and the column is left unchanged.
    """
    modes = data['DISPLAY_Resolution'].mode()
    if modes.empty:
        return

    # Assign the filled column back (same pattern as handleNull_OS) instead
    # of calling fillna(inplace=True) on a column selection.
    data['DISPLAY_Resolution'] = data['DISPLAY_Resolution'].fillna(
        modes.iloc[0])

# Tach ra cac cot DISPLAY_Resolution_Width va DISPLAY_Resolution_Height


def _resolution_to_int(value):
    """Return ``int(value)``, or NaN when the value cannot be converted."""
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        return np.nan


def _parse_resolution(raw):
    """Return ``(width, height)`` parsed from one raw resolution value."""
    if pd.isnull(raw):
        return np.nan, np.nan

    if not isinstance(raw, str):
        # A non-text value has no "W x H" structure: the width is unknown
        # and the value itself is the only candidate for the height.
        return np.nan, _resolution_to_int(raw)

    # Width: whatever precedes the first 'x'.
    width = _resolution_to_int(raw.split('x')[0].strip())

    # Height: the text after the first 'x' (only when an 'x ' is present),
    # cut at the first space, then at 'pixels', then at a comma.
    candidate = raw
    if 'x ' in candidate:
        candidate = candidate.split('x', maxsplit=1)[1].strip()
    for separator in (' ', 'pixels', ','):
        candidate = candidate.split(separator, maxsplit=1)[0].strip()
    height = _resolution_to_int(candidate)

    # Fallback: the third space-separated token of the raw text.
    if not isinstance(height, int):
        tokens = raw.split(' ')
        if len(tokens) > 2:
            height = _resolution_to_int(tokens[2].strip())

    return width, height


def splitRes(data: pd.DataFrame) -> bool:
    """Split ``DISPLAY_Resolution`` into separate width and height columns.

    ``DISPLAY_Resolution_Width`` and ``DISPLAY_Resolution_Height`` are
    inserted directly after ``DISPLAY_Resolution``, which is then dropped.
    Values that cannot be parsed become NaN. ``data`` is modified in place.

    Returns:
        ``True`` when the columns were created. ``False`` when either target
        column already exists; the frame is then left untouched.
    """
    width_col = 'DISPLAY_Resolution_Width'
    height_col = 'DISPLAY_Resolution_Height'

    # Iterate over the values themselves, so the frame may carry any index.
    parsed = [_parse_resolution(raw)
              for raw in data['DISPLAY_Resolution'].tolist()]
    widths = [width for width, _ in parsed]
    heights = [height for _, height in parsed]

    # Check up front so a clash cannot leave the frame half-modified.
    if width_col in data.columns or height_col in data.columns:
        return False

    position = data.columns.get_loc('DISPLAY_Resolution') + 1
    data.insert(position, width_col, widths)
    data.insert(position + 1, height_col, heights)

    # The raw column is no longer needed.
    data.drop(columns='DISPLAY_Resolution', inplace=True)
    return True


def handleNull_ResHeight(data: pd.DataFrame):
    """Fill missing ``DISPLAY_Resolution_Height`` values with the mode.

    ``data`` is modified in place; nothing is returned. When the column has
    no non-null value there is no mode and the column is left unchanged.
    """
    modes = data['DISPLAY_Resolution_Height'].mode()
    if modes.empty:
        return

    # Assign back instead of fillna(inplace=True) on a column selection.
    data['DISPLAY_Resolution_Height'] = data['DISPLAY_Resolution_Height'].fillna(
        modes.iloc[0])


def handleNull_ResWidth(data: pd.DataFrame):
    """Fill missing widths with the height of the same row.

    Rows whose ``DISPLAY_Resolution_Width`` is null receive that row's
    ``DISPLAY_Resolution_Height``; a row missing both stays null.
    ``data`` is modified in place; nothing is returned.
    """
    # Assign back instead of fillna(inplace=True) on a column selection.
    data['DISPLAY_Resolution_Width'] = data['DISPLAY_Resolution_Width'].fillna(
        data['DISPLAY_Resolution_Height'])


def extract_display_resolution(data: pd.DataFrame, inplace: bool = False):
    """Apply ``splitRes`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy; otherwise ``data`` itself is passed to ``splitRes`` and returned.
    The boolean returned by ``splitRes`` is not inspected.
    """
    frame = data.copy() if inplace == False else data

    # Null filling stays disabled, before and after the split:
    # handleNull_Res(frame)
    splitRes(frame)
    # handleNull_ResHeight(frame)
    # handleNull_ResWidth(frame)
    return frame

## Protection


In [26]:
def extract_display_protection(data: pd.DataFrame, inplace: bool = False):
    """Remove the ``DISPLAY_Protection`` column and return the frame.

    When ``inplace`` equals ``False`` (the default) the column is removed
    from a copy; otherwise it is removed from ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # This column is not used further, so it is simply discarded.
    frame.drop(columns=['DISPLAY_Protection'], inplace=True)
    return frame

## Combine


In [27]:
def extract_display(data: pd.DataFrame, inplace: bool = False):
    """Run every DISPLAY extraction step on the frame and return it.

    When ``inplace`` equals ``False`` (the default) the steps run on a copy;
    otherwise they run on ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # Each step mutates `frame` directly, in this order.
    display_steps = (
        extract_display_type,
        extract_display_size,
        extract_display_resolution,
        extract_display_protection,
    )
    for step in display_steps:
        step(frame, inplace=True)

    return frame

# PLATFORM SECTION


## OS


In [28]:
def handleNull_OS(data: pd.DataFrame):
    """Replace missing ``PLATFORM_OS`` values with the text ``'Other'``.

    ``data`` is modified in place; nothing is returned.
    """
    os_column = data['PLATFORM_OS']
    data['PLATFORM_OS'] = os_column.fillna(value='Other')


def _normalize_os(raw):
    """Return the normalised OS label for one raw ``PLATFORM_OS`` value."""
    # Missing and other non-text values are passed through untouched.
    if not isinstance(raw, str):
        return raw

    lowered = raw.lower()
    label = raw

    if 'android' in lowered:
        # "android" plus an optional major version, e.g. "android 6".
        # strip() removes the trailing space the pattern captures when no
        # digits follow, so "android " and "android" are one category.
        label = re.search(r'android\s?\d*', lowered).group(0).strip()

    # These families are checked after android and in this order; when
    # several keywords occur in one value, the last one listed wins.
    for family in ('tizen', 'kaios', 'proprietary', 'firefox', 'bada', 'palm'):
        if family in lowered:
            label = family

    return label


def extract_OS(data: pd.DataFrame):
    """Normalise ``PLATFORM_OS`` into a small set of lower-case labels.

    Values mentioning android become ``'android'`` plus an optional major
    version. Values mentioning tizen, kaios, proprietary, firefox, bada or
    palm become that keyword. Anything else, including missing values, is
    left exactly as it was.

    ``data`` is modified in place; nothing is returned.
    """
    # map() visits the values directly, so the frame may carry any index.
    data['PLATFORM_OS'] = data['PLATFORM_OS'].map(_normalize_os)


def extract_platform_os(data: pd.DataFrame, inplace=False):
    """Apply ``extract_OS`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy; otherwise ``data`` itself is passed to ``extract_OS`` and returned.
    """
    frame = data.copy() if inplace == False else data

    # Null filling stays disabled: handleNull_OS(frame)
    extract_OS(frame)

    return frame

## CHIPSET


In [29]:
def extract_platform_chipset(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_Chipset`` column and return the frame.

    When ``inplace`` equals ``False`` (the default) the column is removed
    from a copy; otherwise it is removed from ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # The chipset column is discarded rather than parsed.
    frame.drop(columns=['PLATFORM_Chipset'], inplace=True)

    return frame

## CPU


In [30]:
def extract_platform_cpu(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_CPU`` column and return the frame.

    When ``inplace`` equals ``False`` (the default) the column is removed
    from a copy; otherwise it is removed from ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # The CPU column is discarded rather than parsed.
    frame.drop(columns=['PLATFORM_CPU'], inplace=True)

    return frame

## GPU


In [31]:
def extract_platform_gpu(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_GPU`` column and return the frame.

    When ``inplace`` equals ``False`` (the default) the column is removed
    from a copy; otherwise it is removed from ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # The GPU column is discarded rather than parsed.
    frame.drop(columns=['PLATFORM_GPU'], inplace=True)

    return frame

## COMBINE


In [32]:
def extract_platform(data: pd.DataFrame, inplace=False):
    """Run every PLATFORM extraction step on the frame and return it.

    When ``inplace`` equals ``False`` (the default) the steps run on a copy;
    otherwise they run on ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # Each step mutates `frame` directly, in this order.
    platform_steps = (
        extract_platform_os,
        extract_platform_chipset,
        extract_platform_cpu,
        extract_platform_gpu,
    )
    for step in platform_steps:
        step(frame, inplace=True)

    return frame

# MEMORY SECTION


## Card Slot


In [33]:
def unifyCardSlot(data: pd.DataFrame):
    """Reduce ``MEMORY_Card_slot`` text to one lower-case category per row.

    The column is lower-cased and each value is given the label of the
    first rule whose text it contains; values matching no rule become
    ``'unspecified'``. Missing values stay missing.

    ``data`` is modified in place; nothing is returned.
    """
    # (substring to look for, label), in priority order. More specific
    # names come before the shorter names they contain (e.g. 'microsdhc'
    # before 'microsd' before 'sd').
    # NOTE(review): 'no' is a plain substring test, so any text containing
    # the letters "no" (e.g. "unknown") is labelled 'no' - confirm intended.
    rules = (
        ('microsdhc', 'microsdhc'),
        ('microsdxc', 'microsdxc'),
        ('microsd', 'microsd'),
        ('mmc', 'mmc'),
        ('minisd', 'minisd'),
        ('memory stick micro', 'memory stick micro'),
        ('memory stick duo', 'memory stick duo'),
        ('nm (nano memory)', 'nano memory'),
        ('sdio', 'sdio'),
        ('sd', 'sd'),
        ('no', 'no'),
    )

    def classify(text):
        for needle, label in rules:
            if needle in text:
                return label
        return 'unspecified'

    lowered = data['MEMORY_Card_slot'].str.lower()
    # na_action='ignore' skips missing values instead of failing on them,
    # and map() works on values, so the frame may carry any index.
    data['MEMORY_Card_slot'] = lowered.map(classify, na_action='ignore')


def extract_memory_cardslot(data: pd.DataFrame, inplace=False):
    """Apply ``unifyCardSlot`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy; otherwise ``data`` itself is passed to ``unifyCardSlot``.
    """
    frame = data.copy() if inplace == False else data

    unifyCardSlot(frame)

    return frame

## Internal


In [34]:
def split_Internal(data: pd.DataFrame) -> pd.DataFrame:
    """Split ``MEMORY_Internal`` into ROM and RAM size columns.

    For each value, text inside parentheses is removed, the rest is
    lower-cased, and every number directly followed by a run of the letters
    g/m/b is collected. Numbers whose unit contains 'g' are multiplied by
    1024 * 1024, those whose unit contains 'm' by 1024, and the rest are
    kept as-is (the original comment describes the target unit as KB).

    * no sizes found, or a missing value: ROM and RAM are both ``None``
    * one size: it is the ROM; RAM is ``None``
    * several sizes: ROM is the largest; RAM is the largest size that does
      not exceed the mean of all sizes

    ``MEMORY_Internal_rom`` and ``MEMORY_Internal_ram`` are inserted right
    after ``MEMORY_Internal``, which is then dropped. ``data`` is modified
    in place and also returned.
    """
    rom_values = []
    ram_values = []

    # Iterate over the values themselves, so the frame may carry any index.
    for raw in data['MEMORY_Internal'].tolist():
        if pd.isnull(raw):
            rom_values.append(None)
            ram_values.append(None)
            continue

        # Drop parenthesised remarks, then normalise the case.
        spec = re.sub(r'\([^)]*\)', '', raw).lower()

        sizes = []
        for number, _, unit in re.findall(r'(\d+(\.\d+)?)([gmb]+)', spec):
            if 'g' in unit:
                factor = 1024 * 1024
            elif 'm' in unit:
                factor = 1024
            else:
                factor = 1
            sizes.append(float(number) * factor)

        if not sizes:
            rom = None
            ram = None
        elif len(sizes) == 1:
            rom = sizes[0]
            ram = None
        else:
            rom = max(sizes)
            # The mean does not change inside the filter; compute it once.
            average = np.mean(sizes)
            ram = max((size for size in sizes if size <= average),
                      default=min(sizes))

        rom_values.append(rom)
        ram_values.append(ram)

    # New columns go directly after MEMORY_Internal.
    rom_position = data.columns.get_loc('MEMORY_Internal') + 1
    data.insert(rom_position, 'MEMORY_Internal_rom', rom_values)
    data.insert(rom_position + 1, 'MEMORY_Internal_ram', ram_values)

    # The raw column is no longer needed.
    data.drop(columns='MEMORY_Internal', inplace=True)

    return data


def extract_memory_internal(data: pd.DataFrame, inplace=False):
    """Apply ``split_Internal`` to the frame and return the frame.

    When ``inplace`` equals ``False`` (the default) the work is done on a
    copy; otherwise ``data`` itself is passed to ``split_Internal``.
    """
    frame = data.copy() if inplace == False else data

    split_Internal(frame)

    return frame

## Combine


In [35]:
def extract_memory(data: pd.DataFrame, inplace=False):
    """Run every MEMORY extraction step on the frame and return it.

    When ``inplace`` equals ``False`` (the default) the steps run on a copy;
    otherwise they run on ``data`` itself.
    """
    frame = data.copy() if inplace == False else data

    # Each step mutates `frame` directly, in this order.
    for step in (extract_memory_cardslot, extract_memory_internal):
        step(frame, inplace=True)

    return frame

# Extract data

## Load data

In [36]:
# Load the raw device-spec table that the extract_* functions operate on.
data = pd.read_csv(
    filepath_or_buffer=os.path.join('..', 'data', 'Devices_Specs.csv'),
)


def inspect_data(data):
    """Build a one-row-per-column summary of ``data``.

    The returned frame has, for every column of ``data``: its name, the
    number and percentage of nulls, the number of distinct values, the
    dtype, and up to five distinct values joined by ``', '``. Rows are
    numbered from 1.
    """
    sample_limit = 5

    # Up to `sample_limit` distinct values per column, in order of appearance.
    unique_samples = []
    for col in data.columns:
        distinct = data[col].unique()
        unique_samples.append(
            ', '.join(str(value) for value in distinct[:sample_limit]))

    null_counts = data.isnull().sum()

    report = pd.DataFrame({
        'Column': data.columns,
        'NullSum': null_counts,
        'NullPercent%': null_counts / len(data) * 100,
        'UniqueSum': data.nunique(),
        'Type': data.dtypes,
        'Unique Sample': unique_samples,
    })
    report.index = list(range(1, len(data.columns) + 1))
    return report

# Summarise the freshly loaded raw table (rendered as this cell's output).
inspect_data(data)

Unnamed: 0,Column,NullSum,NullPercent%,UniqueSum,Type,Unique Sample
1,Brand,0,0.0,121,object,"Acer, alcatel, Allview, Amazon, Amoi"
2,url,0,0.0,12513,object,"https://www.gsmarena.com/acer_chromebook_tab_10-9139.php, https://www.gsmarena.com/acer_iconia_talk_s-8306.php, https://www.gsmarena.com/acer_liquid_z6_plus-8305.php, https://www.gsmarena.com/acer_liquid_z6-8304.php, https://www.gsmarena.com/acer_iconia_tab_10_a3_a40-8080.php"
3,imgUrl,0,0.0,11973,object,"https://fdn2.gsmarena.com/vv/bigpic/acer-chromebook-tab-10.jpg, https://fdn2.gsmarena.com/vv/bigpic/acer-iconia-talk-s.jpg, https://fdn2.gsmarena.com/vv/bigpic/acer-liquid-z6-plus.jpg, https://fdn2.gsmarena.com/vv/bigpic/acer-liquid-z6.jpg, https://fdn2.gsmarena.com/vv/bigpic/acer-iconia-tab-10-a3-a40.jpg"
4,Name,0,0.0,12513,object,"Acer Chromebook Tab 10, Acer Iconia Talk S, Acer Liquid Z6 Plus, Acer Liquid Z6, Acer Iconia Tab 10 A3-A40"
5,NETWORK_Technology,0,0.0,45,object,"No cellular connectivity, GSM / HSPA / LTE, GSM / HSPA, GSM, GSM / UMTS"
6,NETWORK_2G_bands,0,0.0,223,object,"N/A, GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2, GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only), GSM 850 / 900 / 1800 / 1900 , GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 & SIM 3"
7,NETWORK_3G_bands,3835,30.648126,743,object,"nan, HSDPA 850 / 1900 / 2100 , HSDPA, HSDPA 900 / 1900 / 2100 - Europe, Taiwan, HSDPA 900 / 2100"
8,NETWORK_4G_bands,7228,57.763926,1550,object,"nan, 1, 3, 7, 8, 20, LTE (unspecified), LTE 800 / 1800 / 2100 / 2600 - Europe, 1, 3, 7, 20"
9,NETWORK_5G_bands,11426,91.313034,405,object,"nan, 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 30, 38, 40, 41, 48, 53, 66, 70, 77, 78, 79 SA/NSA/Sub6 - A3106, 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 30, 38, 40, 41, 48, 53, 66, 70, 77, 78, 79 SA/NSA/Sub6 - A3102, 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 30, 38, 40, 41, 48, 53, 66, 70, 77, 78, 79 SA/NSA/Sub6 - A3094, 1, 2, 3, 5, 7, 8, 12, 20, 25, 26, 28, 30, 38, 40, 41, 48, 53, 66, 70, 77, 78, 79 SA/NSA/Sub6 - A3090"
10,NETWORK_GPRS,8335,66.610725,32,object,"No, nan, Yes, Class 10, Class 12"


## Extract data

In [37]:
# Run the three extraction stages in order; each stage returns a new frame.
data = (
    data
    .pipe(extract_display)
    .pipe(extract_platform)
    .pipe(extract_memory)
)

In [38]:
# Columns produced by the extract_* steps, in the order they are kept.
extracted_cols = [
    'DISPLAY_Type', 'DISPLAY_Size',
    'DISPLAY_Resolution_Width', 'DISPLAY_Resolution_Height',
    'PLATFORM_OS',
    'MEMORY_Card_slot',
    'MEMORY_Internal_rom', 'MEMORY_Internal_ram',
]

# Keep only those columns.
data = data.loc[:, extracted_cols]

In [39]:
# Summarise the extracted columns (rendered as this cell's output).
inspect_data(data)

Unnamed: 0,Column,NullSum,NullPercent%,UniqueSum,Type,Unique Sample
1,DISPLAY_Type,0,0.0,17,object,"IPS, OLED, TFT, LCD, Color"
2,DISPLAY_Size,1225,9.789819,303,float64,"9.7, 7.0, 5.5, 5.0, 10.1"
3,DISPLAY_Resolution_Width,132,1.054903,143,float64,"1536.0, 720.0, 1080.0, 1920.0, 480.0"
4,DISPLAY_Resolution_Height,54,0.431551,224,float64,"2048.0, 1280.0, 1920.0, 1200.0, 854.0"
5,PLATFORM_OS,3772,30.14465,297,object,"Chrome OS, android 6, android 5, Microsoft Windows 10, android 4"
6,MEMORY_Card_slot,0,0.0,12,object,"microsdxc, microsdhc, no, microsd, minisd"
7,MEMORY_Internal_rom,1877,15.0004,221,float64,"33554432.0, 8388608.0, 67108864.0, 16777216.0, 4194304.0"
8,MEMORY_Internal_ram,4152,33.181491,80,float64,"4194304.0, 2097152.0, 3145728.0, 1048576.0, 16777216.0"


-> TODO: for the two columns `MEMORY_Internal_rom` and `MEMORY_Internal_ram`, decide whether their nulls are better filled with the column minimum or with 0.

## Save data

In [40]:
# Write the extracted columns to the tidy-data folder, without the row index.
data.to_csv(
    path_or_buf=os.path.join('..', 'tidy_data', 'Devices_Specs_Extracted.csv'),
    index=False,
)