# Import Libraries

In [None]:
import pandas as pd
import os
import numpy as np
import re

# Define extraction functions

# DISPLAY SECTION

## Type

In [None]:
def handleNull_Type(data: pd.DataFrame):
    """Fill missing ``DISPLAY_Type`` values for a few known phone models.

    Rows are located by an exact match on ``Name``; only rows whose
    ``DISPLAY_Type`` is null are touched. The frame is modified in place
    and nothing is returned.
    """
    # Phone name -> display type to use when the value is missing.
    known_types = {
        'Motorola V560': 'TFT',
        'Sendo S1': 'TFT',
        'Vertu Diamond': 'TFT',
        'Vertu Ascent': 'Graphical, TFD',
    }
    for phone_name, display_type in known_types.items():
        missing_for_phone = data['DISPLAY_Type'].isnull() & (data['Name'] == phone_name)
        data.loc[missing_for_phone, 'DISPLAY_Type'] = display_type

# Ordered (regex pattern, canonical label) rules. Patterns are matched against
# the lower-cased value and the FIRST matching rule wins, so more specific
# names must come before the substrings they contain (e.g. 'cstn' before 'stn'
# before 'tn', 's-lcd' before 'lcd').
_DISPLAY_TYPE_RULES = (
    ('tft', 'TFT'),
    ('tfd', 'TFD'),
    ('ips', 'IPS'),
    ('oled', 'OLED'),
    ('cstn', 'CSTN'),
    ('fstn', 'FSTN'),
    ('stn', 'STN'),
    ('tn', 'TN'),
    ('pls', 'PLS'),
    ('alphanumeric', 'Alphanumeric'),
    ('monochrome', 'Monochrome'),
    ('grayscale|greyscale', 'Grayscale'),
    ('backlit', 'Backlit'),
    ('s-lcd|super lcd|super-lcd|superlcd', 'S-LCD'),
    ('color', 'Color'),
    ('lcd|crystal|mva|pureled', 'LCD'),
)


def unifyType(data: pd.DataFrame):
    """Collapse free-text ``DISPLAY_Type`` values into a fixed set of labels.

    Each value is lower-cased and tested against ``_DISPLAY_TYPE_RULES`` in
    order; the first matching rule supplies the label. Values that match no
    rule, as well as missing values, become ``'Unknown'``.

    The column is replaced by plain assignment (no chained ``inplace`` call),
    so the update also takes effect when pandas Copy-on-Write is active.
    The frame is modified in place and nothing is returned.
    """
    lowered = data['DISPLAY_Type'].str.lower()

    unified = pd.Series('Unknown', index=data.index, dtype=object)
    unresolved = pd.Series(True, index=data.index)
    for pattern, label in _DISPLAY_TYPE_RULES:
        # na=False keeps missing values out of the mask instead of raising.
        hit = unresolved & lowered.str.contains(pattern, na=False)
        unified = unified.mask(hit, label)
        unresolved = unresolved & ~hit

    data['DISPLAY_Type'] = unified
        
def extract_display_type(data: pd.DataFrame, inplace: bool = False):
    """Fill known gaps in ``DISPLAY_Type`` and normalise its values.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame that was processed.
    """
    frame = data if inplace else data.copy()

    handleNull_Type(frame)
    unifyType(frame)
    return frame

## Size

In [None]:
def handleNull_Size(data: pd.DataFrame):
    """Replace unusable ``DISPLAY_Size`` values with the most common valid one.

    A value is treated as missing when its string form does not contain
    ``'inch'`` (this includes real nulls). Missing entries are then filled
    with the mode of the remaining values. If no valid value exists at all,
    the column is left entirely null.

    Only the ``DISPLAY_Size`` column is touched; the frame is modified in
    place and nothing is returned.
    """
    sizes = data['DISPLAY_Size']

    # Restrict the clean-up to this column: a frame-wide replace would also
    # blank out equal values (e.g. 'No') in unrelated columns.
    mentions_inch = sizes.map(lambda value: 'inch' in str(value)).astype(bool)
    cleaned = sizes.where(mentions_inch)

    modes = cleaned.mode()
    if not modes.empty:
        cleaned = cleaned.fillna(modes.iloc[0])

    # Assign back rather than calling fillna(inplace=True) on a column
    # selection, which does not update the frame under Copy-on-Write.
    data['DISPLAY_Size'] = cleaned

def unifySize(data: pd.DataFrame):
    """Convert ``DISPLAY_Size`` from text such as ``'5.0 inches, ...'`` to float.

    The text before the first ``' inch'`` is taken as the number. Values that
    cannot be parsed are reported with ``print`` (index label and offending
    text) and stored as NaN, so the resulting column is always float.

    The conversion is vectorised and does not depend on the frame having a
    default 0..n-1 index. The frame is modified in place and nothing is
    returned.
    """
    # Keep only the part in front of ' inch'.
    leading = data['DISPLAY_Size'].str.split(' inch').str[0].str.strip()

    numeric = pd.to_numeric(leading, errors='coerce')

    # Report entries that had text but did not parse as a number.
    unparsed = numeric.isna() & leading.notna()
    for label, raw in leading[unparsed].items():
        print(label, raw)

    data['DISPLAY_Size'] = numeric.astype(float)

def extract_display_size(data: pd.DataFrame, inplace: bool = False):
    """Clean the ``DISPLAY_Size`` column and convert it to numbers.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame that was processed.
    """
    frame = data if inplace else data.copy()

    handleNull_Size(frame)
    unifySize(frame)
    return frame

## Resolution

In [None]:
def handleNull_Res(data: pd.DataFrame):
    """Fill null ``DISPLAY_Resolution`` values with the column's mode.

    If the column has no non-null value there is no mode to use, and the
    column is left unchanged. The frame is modified in place and nothing is
    returned.
    """
    resolution = data['DISPLAY_Resolution']
    modes = resolution.mode()
    if modes.empty:
        return

    # Assign back rather than calling fillna(inplace=True) on a column
    # selection, which does not update the frame under Copy-on-Write.
    data['DISPLAY_Resolution'] = resolution.fillna(modes.iloc[0])

def _parse_res_width(raw):
    """Return the integer before the first 'x' in ``raw``, or NaN if absent."""
    try:
        return int(raw.split('x')[0].strip())
    except (AttributeError, ValueError):
        # Non-string cell, or the leading part is not an integer.
        return np.nan


def _parse_res_height(raw):
    """Return the height parsed from a resolution cell, or NaN if absent."""
    if not isinstance(raw, str):
        # Numeric cells are accepted as-is; anything else is missing.
        try:
            return int(raw)
        except (TypeError, ValueError, OverflowError):
            return np.nan

    text = raw
    # Drop everything up to the first 'x' when the 'W x H' form is used.
    if 'x ' in text:
        text = text.split('x', maxsplit=1)[1].strip()
    # Keep only the first token, then cut any 'pixels' / ',' suffix.
    text = text.split(' ', maxsplit=1)[0].strip()
    text = text.split('pixels', maxsplit=1)[0].strip()
    text = text.split(',', maxsplit=1)[0].strip()
    try:
        return int(text)
    except ValueError:
        pass

    # Fallback: use the third space-separated token of the original text.
    try:
        return int(raw.split(' ')[2].strip())
    except (IndexError, ValueError):
        return np.nan


def splitRes(data: pd.DataFrame) -> bool:
    """Split ``DISPLAY_Resolution`` into separate width and height columns.

    ``DISPLAY_Resolution_Width`` and ``DISPLAY_Resolution_Height`` are
    inserted at the position of ``DISPLAY_Resolution``, which is then
    dropped. Cells that cannot be parsed yield NaN in the affected column.

    Cells are read by iterating the column's values, so the result does not
    depend on the frame having a default 0..n-1 index.

    Returns:
        True when the columns were created. False when they could not be
        inserted (for example because one of them already exists); in that
        case the frame is left untouched.
    """
    raw_values = data['DISPLAY_Resolution']
    width = [_parse_res_width(value) for value in raw_values]
    height = [_parse_res_height(value) for value in raw_values]

    new_columns = ('DISPLAY_Resolution_Width', 'DISPLAY_Resolution_Height')
    # Check up front so a failure never leaves only one column inserted.
    if any(column in data.columns for column in new_columns):
        return False

    position = data.columns.get_loc('DISPLAY_Resolution')
    try:
        data.insert(position + 1, new_columns[0], width)
        data.insert(position + 2, new_columns[1], height)
    except (TypeError, ValueError):
        return False

    # The raw text column is no longer needed.
    data.drop('DISPLAY_Resolution', axis=1, inplace=True)
    return True

def handleNull_ResHeight(data: pd.DataFrame):
    """Fill null ``DISPLAY_Resolution_Height`` values with the column's mode.

    If the column has no non-null value there is no mode to use, and the
    column is left unchanged. The frame is modified in place and nothing is
    returned.
    """
    heights = data['DISPLAY_Resolution_Height']
    modes = heights.mode()
    if modes.empty:
        return

    # Assign back rather than calling fillna(inplace=True) on a column
    # selection, which does not update the frame under Copy-on-Write.
    data['DISPLAY_Resolution_Height'] = heights.fillna(modes.iloc[0])

def handleNull_ResWidth(data: pd.DataFrame):
    """Fill null ``DISPLAY_Resolution_Width`` values from the height column.

    Each missing width takes the ``DISPLAY_Resolution_Height`` value of the
    same row. The frame is modified in place and nothing is returned.
    """
    # Assign back rather than calling fillna(inplace=True) on a column
    # selection, which does not update the frame under Copy-on-Write.
    data['DISPLAY_Resolution_Width'] = data['DISPLAY_Resolution_Width'].fillna(
        data['DISPLAY_Resolution_Height']
    )

def extract_display_resolution(data: pd.DataFrame, inplace: bool = False):
    """Turn ``DISPLAY_Resolution`` into numeric width and height columns.

    Nulls in the raw column are filled first. The width/height null handling
    runs only when the split reports success. Works on a copy of ``data``
    unless ``inplace`` is true. Returns the frame that was processed.
    """
    frame = data if inplace else data.copy()

    handleNull_Res(frame)
    split_succeeded = splitRes(frame)
    if not split_succeeded:
        return frame

    handleNull_ResHeight(frame)
    handleNull_ResWidth(frame)
    return frame

## Protection

In [None]:
def extract_display_protection(data: pd.DataFrame, inplace: bool = False):
    """Remove the ``DISPLAY_Protection`` column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame the column was removed from.
    """
    frame = data if inplace else data.copy()
    frame.drop(columns='DISPLAY_Protection', inplace=True)
    return frame

## Combine

In [None]:
def extract_display(data: pd.DataFrame, inplace: bool = False):
    """Run every DISPLAY_* extraction step on the frame.

    The steps run in order: type, size, resolution, protection. Works on a
    copy of ``data`` unless ``inplace`` is true. Returns the frame that was
    processed.
    """
    frame = data if inplace else data.copy()

    steps = (
        extract_display_type,
        extract_display_size,
        extract_display_resolution,
        extract_display_protection,
    )
    for step in steps:
        # Each step mutates the shared frame directly.
        step(frame, inplace=True)

    return frame

# PLATFORM SECTION

## OS

In [None]:
def handleNull_OS(data: pd.DataFrame):
    """Replace null ``PLATFORM_OS`` values with the string ``'Other'``.

    The frame is modified in place and nothing is returned.
    """
    platform_os = data['PLATFORM_OS']
    data['PLATFORM_OS'] = platform_os.mask(platform_os.isna(), 'Other')


# Matches 'android' optionally followed by one whitespace and a major version.
_ANDROID_PATTERN = re.compile(r'android\s?\d*')

# Keywords checked after Android, in this order; a later keyword overrides an
# earlier match (including an Android match).
_OS_KEYWORDS = ('tizen', 'kaios', 'proprietary', 'firefox', 'bada', 'palm')


def _normalize_os(value):
    """Return the normalised OS label for one ``PLATFORM_OS`` cell."""
    if not isinstance(value, str):
        # Leave non-text cells (e.g. nulls) untouched instead of crashing.
        return value

    lowered = value.lower()
    label = value
    if 'android' in lowered:
        # Strip so 'android os' yields 'android', not 'android ' with a
        # trailing space that would form a separate category.
        label = _ANDROID_PATTERN.search(lowered).group(0).strip()
    for keyword in _OS_KEYWORDS:
        if keyword in lowered:
            label = keyword
    return label


def extract_OS(data: pd.DataFrame):
    """Reduce ``PLATFORM_OS`` to a short lower-case OS family label.

    Android values become ``'android'`` plus the major version when one
    follows (e.g. ``'android 10'``). Values mentioning tizen, kaios,
    proprietary, firefox, bada or palm become that keyword. Any other value
    is kept exactly as it was.

    Cells are mapped value by value, so the result does not depend on the
    frame having a default 0..n-1 index. The frame is modified in place and
    nothing is returned.
    """
    data['PLATFORM_OS'] = data['PLATFORM_OS'].map(_normalize_os)


def extract_platform_os(data: pd.DataFrame, inplace=False):
    """Fill null ``PLATFORM_OS`` values and normalise the column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame that was processed.
    """
    frame = data if inplace else data.copy()

    handleNull_OS(frame)
    extract_OS(frame)
    return frame

## CHIPSET

In [None]:
def extract_platform_chipset(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_Chipset`` column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame the column was removed from.
    """
    frame = data if inplace else data.copy()
    frame.drop(columns='PLATFORM_Chipset', inplace=True)
    return frame

## CPU

In [None]:
def extract_platform_cpu(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_CPU`` column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame the column was removed from.
    """
    frame = data if inplace else data.copy()
    frame.drop(columns='PLATFORM_CPU', inplace=True)
    return frame

## GPU

In [None]:
def extract_platform_gpu(data: pd.DataFrame, inplace=False):
    """Remove the ``PLATFORM_GPU`` column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame the column was removed from.
    """
    frame = data if inplace else data.copy()
    frame.drop(columns='PLATFORM_GPU', inplace=True)
    return frame

## COMBINE

In [None]:
def extract_platform(data: pd.DataFrame, inplace=False):
    """Run every PLATFORM_* extraction step on the frame.

    The steps run in order: OS, chipset, CPU, GPU. Works on a copy of
    ``data`` unless ``inplace`` is true. Returns the frame that was
    processed.
    """
    frame = data if inplace else data.copy()

    steps = (
        extract_platform_os,
        extract_platform_chipset,
        extract_platform_cpu,
        extract_platform_gpu,
    )
    for step in steps:
        # Each step mutates the shared frame directly.
        step(frame, inplace=True)

    return frame

# MEMORY SECTION

## Card Slot

In [None]:
# Ordered (substring, label) rules matched against the lower-cased value; the
# FIRST match wins, so longer names must come before the substrings they
# contain (e.g. 'microsdhc' before 'microsd' before 'sd').
_CARD_SLOT_RULES = (
    ('microsdhc', 'microsdhc'),
    ('microsdxc', 'microsdxc'),
    ('microsd', 'microsd'),
    ('mmc', 'mmc'),
    ('minisd', 'minisd'),
    ('memory stick micro', 'memory stick micro'),
    ('memory stick duo', 'memory stick duo'),
    ('nm (nano memory)', 'nano memory'),
    ('sdio', 'sdio'),
    ('sd', 'sd'),
    ('no', 'no'),
)


def _classify_card_slot(value):
    """Return the card-slot category for one ``MEMORY_Card_slot`` cell."""
    if not isinstance(value, str):
        # Null or non-text cells carry no slot information.
        return 'unspecified'

    lowered = value.lower()
    for needle, label in _CARD_SLOT_RULES:
        if needle in lowered:
            return label
    return 'unspecified'


def unifyCardSlot(data: pd.DataFrame):
    """Collapse ``MEMORY_Card_slot`` descriptions into a fixed set of labels.

    Each value is lower-cased and tested against ``_CARD_SLOT_RULES`` in
    order. Values matching no rule, and null or non-text values, become
    ``'unspecified'``.

    Cells are mapped value by value, so the result does not depend on the
    frame having a default 0..n-1 index. The frame is modified in place and
    nothing is returned.
    """
    data['MEMORY_Card_slot'] = data['MEMORY_Card_slot'].map(_classify_card_slot)


def extract_memory_cardslot(data: pd.DataFrame, inplace=False):
    """Normalise the ``MEMORY_Card_slot`` column.

    Works on a copy of ``data`` unless ``inplace`` is true. Returns the
    frame that was processed.
    """
    frame = data if inplace else data.copy()
    unifyCardSlot(frame)
    return frame

## Internal

## Combine