In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('smartphones.csv')

In [None]:
data = df.copy()

In [None]:
data.head()

### column(model)

In [None]:
## column(model): unnecessary information in model like (5gb,128gb)
data['model'] = data['model'].str.replace('5G','')
data ['model'] = data['model'].str.split('(').str[0].str.strip()

In [None]:
## extract brand name
data.insert(1,'brand_name',data['model'].str.split(' ').str[0])

## column(price)

In [None]:
## remove comma and ruppes symbol
data['price'] = data['price'].str.replace(',','').str.replace('₹','')

## column(rating)

In [None]:
## The skewness is -0.699, which indicates that the distribution of the ratings is slightly negatively skewed. it is
## better to fill null values with median

In [None]:
data['rating'].isnull().sum()

In [None]:
data['rating'].describe()

In [None]:
sns.distplot(data['rating'], kde=True)

In [None]:
data['rating'] = data['rating'].fillna(data['rating'].median())

## column(sim)

In [None]:
## create three more columns -- 5g,nfc,ir_blaster
data['has_5g'] = data['sim'].str.contains('5G', case=False)
data['has_ir_blaster'] = data['sim'].str.contains('IR Blaster',case=False)

## column(processor)

In [None]:
## create two new columns [processor_name , num_of_cores]
data['num_of_core'] = data['processor'].str.extract(r'(\b\w+\s+Core\b)')[0]

In [None]:
processor_names = data['processor_name'].unique()
unique_processor_names = []

for i in processor_names:
    new = i.split(' ')[0] 
    if new not in unique_processor_names: 
        unique_processor_names.append(new)  
unique_processor_names
processor_brands = ['Snapdragon', 'Exynos', 'Dimensity', 'Bionic', 'Helio', 'Kirin', 'Unisoc', 'Apple', 'MediaTek', 'Qualcomm','Google']

In [None]:
import numpy as np
def process_brand(processor_name):
    for brand in processor_brands:
        if brand in processor_name:
            return brand
    return np.nan
data['processor_brand'] = data['processor'].apply(process_brand)
data[['processor', 'processor_brand']].head()

In [None]:
data['processor_brand'].unique()

In [None]:
data['num_of_core'].unique()

## column(ram)

In [None]:
## create two more columns(ram_capacity,space)

In [None]:
unique_rams = []
for i in data.ram:
    ram = i.replace('\u2009', ' ')
    result = ram.split(' ')[0]
    try:
        ram_value = int(result)
        if ram_value < 15 and ram_value not in unique_rams:
            unique_rams.append(ram_value)
    except ValueError:
        continue 
unique_rams

In [None]:
def ram_cap(ram):
    for i in unique_rams:
        if str(i) in ram:
            return i
    return np.nan
data['ram_capacity'] = data['ram'].apply(ram_cap)

In [None]:
data['ram_capacity'].head()

In [None]:
unique_storage = []
for i in data.ram:
    storage = i.replace('\u2009', ' ')
    result = storage.split(',')
    for item in result:
        item = item.strip()
        try:
            storage_value = int(item.split(' ')[0]) 
            if storage_value < 600:  
                unique_storage.append(storage_value)
        except ValueError:
            continue  
unique_storage = list(set(unique_storage)) 
unique_storage


In [None]:
def uni_storage(storage):
    for i in unique_storage:
        if str(i) in storage:
            return i
    else:
        return np.nan
data['storage_capacity'] = data['ram'].apply(uni_storage)

In [None]:
data['storage_capacity'].unique()

### column(battery)

In [None]:
battery = []
for i in data.battery:
    mah = i.replace('\u2009', ' ')
    try:
        res = int(mah.split(' ')[0])
        if res > 1000:
            battery.append(res)
    except ValueError:
            continue  
battery = list(set(battery))
battery

In [194]:
def mah_cap(bat):
    for i in battery:
        if str(i) in bat:
            return i
    else:
        return np.nan
data['mah_capacity'] = data['battery'].apply(mah_cap)

## column(display)

In [None]:
## add new columns screen_size and resolution
li = []
for i in data.display:
    res = i.split(' ')[0]
    try :
        if float(res) < 10:
            li.append(res)
    except ValueError:
            continue 

In [207]:
def screen(screen):
    for i in li:
        if str(i) in screen:
            return i
    else:
        return np.nan
data['screen_size'] = data['display'].apply(screen)

In [307]:
# resolution column
resolution =  []
for i in data.display:
    i = i.replace('\u2009', ' ').strip()
    try:
        split = i.split(',')[1].strip()
        if 'x' in split:
            px_pos = split.find('px')
            resolution.append(split[:px_pos].strip())
        else:
            continue
    except IndexError:
        continue
resolution = list(set(resolution))

In [None]:
def screen_res(res):
    res = res.replace('\u2009', ' ').strip()  
    for i in resolution:
        if i in res:
            return i
    return np.nan
data['screen_reosolution'] = data['display'].apply(lambda x: screen_res(x))
data[['display', 'screen_reosolution']]

## camera

In [None]:
def extract_rear_camera(camera_str):
    if isinstance(camera_str, str):
        cleaned_str = camera_str.replace('\u2009', ' ')
        pos = cleaned_str.find('&')
        if pos != -1:
            return cleaned_str[:pos].strip()
        else:
            return cleaned_str.strip()
    else:
        return None

data['rear_camera'] = data['camera'].apply(extract_rear_camera)
data.head()

In [345]:
def front_camera(front_str):
    if isinstance(front_str,str):
        cleaned_str = front_str.replace('\u2009', ' ')
        pos = cleaned_str.find('&')
        if pos != -1:
            return cleaned_str[pos:]
        else:
            return None
    else:
        pass
data['front_camera'] = data['camera'].apply(front_camera)
data['front_camera'] = data['front_camera'].str.replace('& ','')
data['front_camera'] = data['front_camera'].str.replace('MP Front Camera','')

## card column

In [359]:
def card_support(card):
    if isinstance(card, str):
        if 'Memory Card Supported' in card:
            return True
        else:
            return False
    else:
        return False
data['memory_card_supported'] = data['card'].apply(card_support)

##  dropping columns

In [None]:
data = data.drop(columns=['battery', 'camera', 'ram', 'display', 'processor', 'sim'])

In [369]:
data = data.drop(columns='card')

## camera column

In [372]:
data['rear_camera'] = data['rear_camera'].str.replace('MP Triple Rear','').str.replace('MP ','')

In [374]:
data['rear_camera'] = data['rear_camera'].str.replace('Quad Rear','').str.replace('Dual Rear','').str.replace('Rear','')

In [None]:
data.head(10)

### rearanging the columns

In [380]:
columns = data.columns.to_list()
order = ['model','brand_name','price','rating','has_5g','has_ir_blaster','os','processor_name','processor_brand'
        ,'num_of_core','ram_capacity','storage_capacity','mah_capacity','screen_size','screen_reosolution','rear_camera',
        'front_camera','memory_card_supported']
data = data[order]

## os column

In [387]:
def os_name(name):
    if isinstance(name, str): 
        if 'Android' in name:
            return 'android'
        elif 'iOS' in name:
            return 'ios'
        elif 'Not Specified' in name:
            return name
        elif 'Harmony' in name or 'Hongmeng' in name:
            return 'other'
        else:
            return np.nan
    else:
        return np.nan  
data['os'] = data['os'].apply(os_name)

In [389]:
data.head(20)

Unnamed: 0,model,brand_name,price,rating,has_5g,has_ir_blaster,os,processor_name,processor_brand,num_of_core,ram_capacity,storage_capacity,mah_capacity,screen_size,screen_reosolution,rear_camera,front_camera,memory_card_supported
0,OnePlus 11,OnePlus,54999,89.0,True,False,android,Snapdragon 8 Gen2,Snapdragon,Octa Core,12.0,256.0,5000.0,6.7,1440 x 3216,50 + 48 + 32,16,False
1,OnePlus Nord CE 2 Lite,OnePlus,19989,81.0,True,False,android,Snapdragon 695,Snapdragon,Octa Core,12.0,128.0,5000.0,6.59,1080 x 2412,64 + 2 + 2,16,False
2,Samsung Galaxy A14,Samsung,16499,75.0,True,False,android,Exynos 1330,Exynos,Octa Core,6.0,4.0,5000.0,6.6,1080 x 2408,50 + 2 + 2,13,True
3,Motorola Moto G62,Motorola,14999,81.0,True,False,android,Snapdragon 695,Snapdragon,Octa Core,12.0,128.0,5000.0,6.55,1080 x 2400,50 + 8 + 2,16,False
4,Realme 10 Pro Plus,Realme,24999,82.0,True,False,android,Dimensity 1080,Dimensity,Octa Core,12.0,128.0,5000.0,6.7,1080 x 2412,108 + 8 + 2,16,False
5,Samsung Galaxy F23,Samsung,16999,80.0,True,False,android,Snapdragon 750G,Snapdragon,Octa Core,12.0,128.0,5000.0,6.6,1080 x 2408,50 + 8 + 2,8,True
6,Apple iPhone 14,Apple,65999,81.0,True,False,ios,Bionic A15,Bionic,Hexa Core,12.0,128.0,3279.0,6.1,1170 x 2532,12 + 12,12,False
7,Xiaomi Redmi Note 12 Pro Plus,Xiaomi,29999,86.0,True,True,android,Dimensity 1080,Dimensity,Octa Core,6.0,256.0,4980.0,6.6,1080 x 2400,200 + 8 + 2,16,False
8,Nothing Phone 1,Nothing,26749,85.0,True,False,,Snapdragon 778G Plus,Snapdragon,Octa Core,12.0,128.0,4500.0,6.55,1080 x 2400,50 + 50,16,False
9,OnePlus Nord 2T,OnePlus,28999,84.0,True,False,,Dimensity 1300,Dimensity,Octa Core,12.0,128.0,4500.0,6.43,1080 x 2400,50 + 8 + 2,32,False


In [391]:
data.shape

(1020, 18)