# Problem statement:-
- Recently you entered in the mobile manufacturing market. Build a machine learning model that would help you to know the estimated price for your manufactured mobile phones on the basis of various key features and specifications.

In [1]:
# Importing libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('mobile_price_data.csv')
data.head()

Unnamed: 0,mobile_name,mobile_price,mobile_color,dual_sim,disp_size,resolution,os,num_cores,mp_speed,int_memory,ram,p_cam,f_cam,network,bluetooth,battery_power,mob_width,mob_height,mob_depth,mob_weight
0,"Realme Narzo 10A (So White, 32 GB)","₹8,999",So White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
1,"Realme Narzo 10A (So Blue, 32 GB)","₹8,999",So Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
2,"Realme Narzo 10A (So Blue, 64 GB)","₹9,999",So Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,64 GB,4 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
3,"Realme Narzo 10A (So White, 64 GB)","₹9,999",So White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,64 GB,4 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
4,"Realme Narzo 10 (That Green, 128 GB)","₹11,999",That Green,Dual Sim,16.51 cm (6.5 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,128 GB,4 GB,48MP + 8MP + 2MP + 2MP,16MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75.4 mm,164.4 mm,9 mm,199 g


In [3]:
data.shape

(459, 20)

# Observation : 
- There are 459 rows and 20 columns in dataset.
- Each row represent the information about different mobile.

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 459 entries, 0 to 458
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   mobile_name    459 non-null    object
 1   mobile_price   459 non-null    object
 2   mobile_color   459 non-null    object
 3   dual_sim       459 non-null    object
 4   disp_size      459 non-null    object
 5   resolution     459 non-null    object
 6   os             459 non-null    object
 7   num_cores      459 non-null    object
 8   mp_speed       459 non-null    object
 9   int_memory     459 non-null    object
 10  ram            459 non-null    object
 11  p_cam          459 non-null    object
 12  f_cam          459 non-null    object
 13  network        459 non-null    object
 14  bluetooth      459 non-null    object
 15  battery_power  459 non-null    object
 16  mob_width      459 non-null    object
 17  mob_height     459 non-null    object
 18  mob_depth      459 non-null   

In [5]:
# Changing column names including units in there column
col_name = {
    'mobile_price':'mobile_price_inr',
    'disp_size':'disp_size(cm/inch)',
    'resolution':'resolution(pixels)',
    'mp_speed':'mp_speed(GHz)',
    'int_memory':'int_memory(GB)',
    'ram':'ram(GB)',
    'p_cam':'p_cam(MP)',
    'f_cam':'f_cam(MP)',
    'battery_power':'battery_power(mAh)',
    'mob_width':'mob_width(mm)',
    'mob_height':'mob_height(mm)',
    'mob_depth':'mob_depth(mm)',
    'mob_weight':'mob_weight(g)'
    
}
data.rename(columns=col_name,inplace=True)

In [6]:
data.head(2)

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),ram(GB),p_cam(MP),f_cam(MP),network,bluetooth,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g)
0,"Realme Narzo 10A (So White, 32 GB)","₹8,999",So White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
1,"Realme Narzo 10A (So Blue, 32 GB)","₹8,999",So Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g


In [7]:
data['mobile_price_inr']=data['mobile_price_inr'].replace(['₹',','],'',regex=True)

In [8]:
data['mobile_price_inr'] = data['mobile_price_inr'].astype('int64')

In [9]:
data.head()

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),ram(GB),p_cam(MP),f_cam(MP),network,bluetooth,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g)
0,"Realme Narzo 10A (So White, 32 GB)",8999,So White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
1,"Realme Narzo 10A (So Blue, 32 GB)",8999,So Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
2,"Realme Narzo 10A (So Blue, 64 GB)",9999,So Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,64 GB,4 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
3,"Realme Narzo 10A (So White, 64 GB)",9999,So White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,64 GB,4 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
4,"Realme Narzo 10 (That Green, 128 GB)",11999,That Green,Dual Sim,16.51 cm (6.5 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,128 GB,4 GB,48MP + 8MP + 2MP + 2MP,16MP Front Camera,"4G VOLTE, 4G, 3G, 2G",Yes,5000 mAh,75.4 mm,164.4 mm,9 mm,199 g


In [10]:
data['bluetooth'].value_counts()

Yes    459
Name: bluetooth, dtype: int64

#### droping bluetooth column because it in not adding any value to dataset

In [11]:
data.drop('bluetooth',1,inplace=True)

In [12]:
# Now we have to fix the mobile_color 
data.mobile_color.nunique()

165

In [13]:
data[['dummy1','dummy2','mobile_color']] = data['mobile_color'].str.rpartition(' ')
data.drop(['dummy1','dummy2'],1,inplace=True)
data['mobile_color'].value_counts()


Blue         119
Black        115
Red           53
White         40
Gold          34
Purple        24
Green         21
Grey          14
Violet         5
Gray           4
Mint           3
Silver         3
Greener        3
Mist           2
Cyan           2
Sapphire       2
Brick          2
Concrete       2
Wave           2
Edition        2
Champagne      1
Orange         1
Brown          1
Sea            1
white          1
Ruby           1
gold           1
Name: mobile_color, dtype: int64

In 'mobile_name' column, only brand name should be kept because it doesn't have any role in pricing remaining feature is for that.

In [14]:
data[['mobile_name','dummy1','dummy2']] = data['mobile_name'].str.partition(' ')
data.drop(['dummy1','dummy2'],1,inplace=True)


In [15]:
data['mobile_name'].value_counts()

Vivo       96
Redmi      93
OPPO       86
Realme     79
Infinix    36
POCO       31
Mi         20
Samsung    17
MI3         1
Name: mobile_name, dtype: int64

In [16]:
data.head(2)

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),ram(GB),p_cam(MP),f_cam(MP),network,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g)
0,Realme,8999,White,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g
1,Realme,8999,Blue,Dual Sim,16.56 cm (6.52 inch),1600 x 720 Pixels,Android 10,Octa Core,2 GHz,32 GB,3 GB,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g


In [17]:
data['disp_size(cm/inch)'] = data['disp_size(cm/inch)'].str.replace('\scm\s\(.+\)','',regex=True)

In [18]:
data['disp_size(cm/inch)'] = data['disp_size(cm/inch)'].astype('float')

In [19]:
data['disp_size(cm/inch)'].unique()

array([16.56, 16.51, 16.  , 16.26, 16.36, 16.76, 15.49, 16.69, 15.8 ,
       16.23, 16.94, 15.44, 15.21, 15.9 , 12.7 , 15.46, 13.97, 13.84,
       16.13, 16.21, 16.59, 16.28, 16.74, 14.48, 15.24, 13.21, 17.02,
       11.94, 15.06, 15.75, 15.72, 14.35, 15.19, 15.77, 15.7 , 15.82])

In [20]:
data['resolution(pixels)'] = data['resolution(pixels)'].str.replace('\s?[pP]ixel.*','',regex=True).replace('\s?[x*×]\s?','X',regex=True).replace('\$','',regex=True)

In [21]:
data['resolution(pixels)'].unique()

array(['1600X720', '2340X1080', '2400X1080', '1080X2340', '1080X2400',
       '720X1600', '1560X720', '1520X720', '2160X1080', '1440X720',
       '2280X1080', '1920X1080', '1280X720', '720X1544', '2316X1080',
       '960X540', '540X960', '1080X2160', '1500X720', '720X1440',
       '1080X1920', '720X1520', '720X1560', '720X1280'], dtype=object)

So, we have remove the units from resolution column.

In [22]:
data[['res1','sep','res2']] = data['resolution(pixels)'].str.partition('X')

Now sep is the column which contain X so we have to drop that column.

In [23]:
data.drop('sep',1,inplace=True)

In [24]:
# Changing the type of res1 and res2 
data[['res1','res2']].dtypes

res1    object
res2    object
dtype: object

In [25]:
data['res1'].unique(),data['res2'].unique()

(array(['1600', '2340', '2400', '1080', '720', '1560', '1520', '2160',
        '1440', '2280', '1920', '1280', '2316', '960', '540', '1500'],
       dtype=object),
 array(['720', '1080', '2340', '2400', '1600', '1544', '540', '960',
        '2160', '1440', '1920', '1520', '1560', '1280'], dtype=object))

In [26]:
data['res1'] = data['res1'].astype('int64')
data['res2'] = data['res2'].astype('int64')


In [27]:
# Changing the value of resolution column to res1*res2
data['resolution(pixels)'] = data['res1']*data['res2']

In [28]:
data.head(2)

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),...,p_cam(MP),f_cam(MP),network,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g),res1,res2
0,Realme,8999,White,Dual Sim,16.56,1152000,Android 10,Octa Core,2 GHz,32 GB,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720
1,Realme,8999,Blue,Dual Sim,16.56,1152000,Android 10,Octa Core,2 GHz,32 GB,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720


Now we have to change mp_speed(GHz). We have to remove the GHz from the data and convert its type to int.

In [29]:
data['mp_speed(GHz)'].unique()

array(['2 GHz', '2.3 GHz', '2.96 GHz', '1.95 GHz', '2.8 GHz', '2.05 GHz',
       '2.2 GHz', '2.1 GHz', '2.84 GHz', '2.5 GHz', '1.8 GHz', '1.4 GHz',
       '1.5 GHz', '1.3 GHz', '1.6 GHz', '2.39 GHz', '2.6 GHz',
       '2.649 GHz'], dtype=object)

In [30]:
data[['mp_speed(GHz)','dummy1','dummy2']] = data['mp_speed(GHz)'].str.partition(" ")

We have sepearted mp_speed column into mp_speed,dummy1,and dummy2. dummy1 and dummy2 are of no use so we have to drop it.

In [31]:
data.drop(['dummy1','dummy2'],1,inplace=True)

In [32]:
# Now we have to change its datatype to int
data['mp_speed(GHz)']=data['mp_speed(GHz)'].astype('float')

In [33]:
data.head(2)

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),...,p_cam(MP),f_cam(MP),network,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g),res1,res2
0,Realme,8999,White,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32 GB,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720
1,Realme,8999,Blue,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32 GB,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720


In [34]:
data['int_memory(GB)'].unique()

array(['32 GB', '64 GB', '128 GB', '256 GB', '16 GB', '512 GB', '8 GB'],
      dtype=object)

We have to remove GB from the data and change its type to int.

In [35]:
data[['int_memory(GB)','dummy1','dummy2']] = data['int_memory(GB)'].str.partition(" ")

Since we have no use of dummy1 and dummy2 so remove it from data.

In [36]:
data.drop(['dummy1','dummy2'],1,inplace=True)

In [37]:
# Changing data types to int 
data['int_memory(GB)']=data['int_memory(GB)'].astype('int')

In [38]:
data.head(2)

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),...,p_cam(MP),f_cam(MP),network,battery_power(mAh),mob_width(mm),mob_height(mm),mob_depth(mm),mob_weight(g),res1,res2
0,Realme,8999,White,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720
1,Realme,8999,Blue,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32,...,12MP + 2MP + 2MP,5MP Front Camera,"4G VOLTE, 4G, 3G, 2G",5000 mAh,75 mm,164.4 mm,8.95 mm,195 g,1600,720


In [39]:
data['battery_power(mAh)'].unique()

array(['5000 mAh', '4035 mAh', '4000 mAh', '3500 mAh', '4200 mAh',
       '4300 mAh', '4230 mAh', '5020 mAh', '4030 mAh', '3400 mAh',
       '3080 mAh', '3010 mAh', '4780 mAh', '3050 mAh', '4100 mAh',
       '3000 mAh', '4500 mAh', '3700 mAh', '3260 mAh', '3360 mAh',
       '3225 mAh', '2730 mAh', '6000 mAh', '2600 mAh', '2000 mAh',
       '3600 mAh', '4350 mAh', '4025 mAh', '4065 mAh', '3730 mAh',
       '3180 mAh', '4020 mAh', '3075 mAh', '3200 mAh'], dtype=object)

In [40]:
def cleanig_n_converting(data,col):
    data[[col,'dummy1','dummy2']] = data[col].str.partition(" ")
    data.drop(['dummy1','dummy2'],1,inplace=True)
    data[col]=data[col].astype(float)
        

In [41]:
# Cleaning and converting datatype of column 'battery_power(mAh)'
cleanig_n_converting(data,'battery_power(mAh)')

In [42]:
# Cleaning and converting datatype of column 'mob_width(mm)'
cleanig_n_converting(data,'mob_width(mm)')


In [43]:
# Cleaning and converting datatype of column 'mob_height(mm)'
cleanig_n_converting(data,'mob_height(mm)')

In [44]:
# Cleaning and converting datatype of column 'mob_depth(mm)'
cleanig_n_converting(data,'mob_depth(mm)')

In [45]:
# Cleaning and converting datatype of column 'mob_weight(g)'
cleanig_n_converting(data,'mob_weight(g)')

In [46]:
# Cleaning and converting datatype of column 'mob_weight(g)'
cleanig_n_converting(data,'ram(GB)')

In [47]:
data.columns

Index(['mobile_name', 'mobile_price_inr', 'mobile_color', 'dual_sim',
       'disp_size(cm/inch)', 'resolution(pixels)', 'os', 'num_cores',
       'mp_speed(GHz)', 'int_memory(GB)', 'ram(GB)', 'p_cam(MP)', 'f_cam(MP)',
       'network', 'battery_power(mAh)', 'mob_width(mm)', 'mob_height(mm)',
       'mob_depth(mm)', 'mob_weight(g)', 'res1', 'res2'],
      dtype='object')

In [48]:
data['p_cam(MP)'].head()

0          12MP + 2MP + 2MP
1          12MP + 2MP + 2MP
2          12MP + 2MP + 2MP
3          12MP + 2MP + 2MP
4    48MP + 8MP + 2MP + 2MP
Name: p_cam(MP), dtype: object

In [49]:
data['f_cam(MP)'].unique()

array(['5MP Front Camera', '16MP Front Camera',
       '32MP + 8MP Dual Front Camera', '16MP + 8MP Dual Front Camera',
       '13MP Front Camera', '32MP Front Camera', '8MP Front Camera',
       '20MP Front Camera', '20MP + 2MP Dual Front Camera',
       '2MP Front Camera', '8MP Dual Front Camera', '25MP Front Camera',
       '24MP Front Camera', '13MP + 2MP Dual Front Camera',
       '44MP + 2MP Dual Front Camera'], dtype=object)

In [50]:
data['network'].head()

0    4G VOLTE, 4G, 3G, 2G
1    4G VOLTE, 4G, 3G, 2G
2    4G VOLTE, 4G, 3G, 2G
3    4G VOLTE, 4G, 3G, 2G
4    4G VOLTE, 4G, 3G, 2G
Name: network, dtype: object

In 'network' feature, the network bandwidth mobile work on so it is also cost determining factor. Hence we will try to use One Hot Encode on it.
1. First we are removing extra spaces "", If not removed it might create duplicate column.

In [51]:
data['network'] = data['network'].str.replace(" ","")

- Since the values in the str formate are jumbled, we will convert it into list by spliting and then sorting it.

In [52]:
data['network'].head()

0    4GVOLTE,4G,3G,2G
1    4GVOLTE,4G,3G,2G
2    4GVOLTE,4G,3G,2G
3    4GVOLTE,4G,3G,2G
4    4GVOLTE,4G,3G,2G
Name: network, dtype: object

In [53]:
data['network'] = data['network'].apply(lambda x: sorted(x.split(',')))


In [54]:
data['network'].value_counts()

[2G, 3G, 4G, 4GVOLTE]        238
[2G, 3G, 4GVOLTE]             99
[2G, 3G, 4G]                  68
[3G, 4GVOLTE]                 18
[3G, 4G, 4GVOLTE]             13
[4GVOLTE]                      8
[4GVOLTE, 5G]                  4
[2G, 3G, 4G, 4GVOLTE, 5G]      3
[4G, 4GVOLTE]                  3
[2G]                           2
[3G, 4G]                       2
[3G]                           1
Name: network, dtype: int64

- Now we use MultiLabelBinarizer to convert data['network'] column

In [55]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb

In [56]:
data_network = mlb.fit_transform(data['network'])
data_network

array([[1, 1, 1, 1, 0],
       [1, 1, 1, 1, 0],
       [1, 1, 1, 1, 0],
       ...,
       [1, 1, 1, 0, 0],
       [1, 1, 1, 1, 0],
       [1, 1, 1, 1, 0]])

In [57]:
data_network = pd.DataFrame(data_network,columns = mlb.classes_,index=data.index)

In [58]:
data_network

Unnamed: 0,2G,3G,4G,4GVOLTE,5G
0,1,1,1,1,0
1,1,1,1,1,0
2,1,1,1,1,0
3,1,1,1,1,0
4,1,1,1,1,0
...,...,...,...,...,...
454,1,1,0,1,0
455,1,1,1,1,0
456,1,1,1,0,0
457,1,1,1,1,0


#### Merging newly formed data_network with existing data

In [59]:
data = pd.concat([data,data_network],axis=1)

In [60]:
# Now droping the network column from the dataset
data.drop('network',1,inplace=True)

In [61]:
data.head()

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,dual_sim,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),...,mob_height(mm),mob_depth(mm),mob_weight(g),res1,res2,2G,3G,4G,4GVOLTE,5G
0,Realme,8999,White,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32,...,164.4,8.95,195.0,1600,720,1,1,1,1,0
1,Realme,8999,Blue,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,32,...,164.4,8.95,195.0,1600,720,1,1,1,1,0
2,Realme,9999,Blue,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,64,...,164.4,8.95,195.0,1600,720,1,1,1,1,0
3,Realme,9999,White,Dual Sim,16.56,1152000,Android 10,Octa Core,2.0,64,...,164.4,8.95,195.0,1600,720,1,1,1,1,0
4,Realme,11999,Green,Dual Sim,16.51,1152000,Android 10,Octa Core,2.0,128,...,164.4,9.0,199.0,1600,720,1,1,1,1,0


In [62]:
data['dual_sim'].value_counts()

Dual Sim      458
Single Sim      1
Name: dual_sim, dtype: int64

#### Droping dual_sim column from the dataset because it doesn't add any value to our analysis.

In [63]:
data.drop('dual_sim',1,inplace=True)

In [64]:
data.columns

Index(['mobile_name', 'mobile_price_inr', 'mobile_color', 'disp_size(cm/inch)',
       'resolution(pixels)', 'os', 'num_cores', 'mp_speed(GHz)',
       'int_memory(GB)', 'ram(GB)', 'p_cam(MP)', 'f_cam(MP)',
       'battery_power(mAh)', 'mob_width(mm)', 'mob_height(mm)',
       'mob_depth(mm)', 'mob_weight(g)', 'res1', 'res2', '2G', '3G', '4G',
       '4GVOLTE', '5G'],
      dtype='object')

In [65]:
data['p_cam(MP)']

0            12MP + 2MP + 2MP
1            12MP + 2MP + 2MP
2            12MP + 2MP + 2MP
3            12MP + 2MP + 2MP
4      48MP + 8MP + 2MP + 2MP
                ...          
454          13MP Rear Camera
455          13MP Rear Camera
456          16MP Rear Camera
457                48MP + 5MP
458                48MP + 5MP
Name: p_cam(MP), Length: 459, dtype: object

In [66]:
data['p_cam_max'] = [x[0:2].replace("M","") for x in data['p_cam(MP)']]
data['p_cam_count'] = [x.count('MP') for x in data['p_cam(MP)']]

In [67]:
data['f_cam(MP)']

0       5MP Front Camera
1       5MP Front Camera
2       5MP Front Camera
3       5MP Front Camera
4      16MP Front Camera
             ...        
454     8MP Front Camera
455     8MP Front Camera
456    25MP Front Camera
457    16MP Front Camera
458    16MP Front Camera
Name: f_cam(MP), Length: 459, dtype: object

In [68]:
data['f_cam_max'] = [x[0:2].replace("M","") for x in data['f_cam(MP)']]
data['f_cam_count'] = [x.count('MP') for x in data['f_cam(MP)']]

#### Changing datatype fo f_cam_max and p_cam_max 

In [69]:
data[['f_cam_max','p_cam_max']] = data[['f_cam_max','p_cam_max']].astype('int')

#### droping f_cam(MP) and p_cam(MP) from dataset.

In [70]:
data.drop(['f_cam(MP)','p_cam(MP)','res1','res2'],1,inplace=True)

In [71]:
data['os']=data['os'].str.replace('^Android\s[a-zA-Z]*\s?',"",regex=True)

In [72]:
data.os.unique()

array(['10', '9.0', '9', '8.1', '7.1.1', '8.0', '7.1.2', '4.4.2', '6.0.1',
       '7.1', '6', '7', '5.4.1', '8.1.0', '8', '5.1', '5.1.1'],
      dtype=object)

Now, we need to remove expected element from the os column

In [73]:
data[['os','hoax1','hoax2']]=data['os'].str.partition('.')

Above we created hoax1 and hoax2 which is unwanted columns so we need to drop them.

In [74]:
data.drop(['hoax1','hoax2'],axis=1,inplace=True)

In [75]:
data['os'] = data['os'].astype('int')

In [76]:
data.os.value_counts()

9     208
8     122
10     84
7      34
6       5
5       5
4       1
Name: os, dtype: int64

In [77]:
data['num_cores'].unique()

array(['Octa Core', 'Quad Core', 'Single Core'], dtype=object)

In [78]:
dic = {
    'Octa Core':8,
    'Quad Core':4,
    'Single Core':1
}
dic

{'Octa Core': 8, 'Quad Core': 4, 'Single Core': 1}

In [79]:
data['num_cores']=data['num_cores'].replace(dic)

In [80]:
data.head()

Unnamed: 0,mobile_name,mobile_price_inr,mobile_color,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),ram(GB),...,mob_weight(g),2G,3G,4G,4GVOLTE,5G,p_cam_max,p_cam_count,f_cam_max,f_cam_count
0,Realme,8999,White,16.56,1152000,10,8,2.0,32,3.0,...,195.0,1,1,1,1,0,12,3,5,1
1,Realme,8999,Blue,16.56,1152000,10,8,2.0,32,3.0,...,195.0,1,1,1,1,0,12,3,5,1
2,Realme,9999,Blue,16.56,1152000,10,8,2.0,64,4.0,...,195.0,1,1,1,1,0,12,3,5,1
3,Realme,9999,White,16.56,1152000,10,8,2.0,64,4.0,...,195.0,1,1,1,1,0,12,3,5,1
4,Realme,11999,Green,16.51,1152000,10,8,2.0,128,4.0,...,199.0,1,1,1,1,0,48,4,16,1


#### Now we are done with the datacleaning part our dataset is cleanded.

In [81]:
data['mobile_name'].value_counts()

Vivo       96
Redmi      93
OPPO       86
Realme     79
Infinix    36
POCO       31
Mi         20
Samsung    17
MI3         1
Name: mobile_name, dtype: int64

From above we can observe that the most no. of saled mobile is of Vivo company followed by Redmi and OPPO.

### Performing One Hot Encoding in mobile_name, and mobile_color column

In [82]:
data[['mobile_name','mobile_color']].nunique()

mobile_name      9
mobile_color    27
dtype: int64

In [83]:
# ohe for mobile_name
data_mobile_name = pd.get_dummies(data['mobile_name'])
data_mobile_name

Unnamed: 0,Infinix,MI3,Mi,OPPO,POCO,Realme,Redmi,Samsung,Vivo
0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...
454,0,0,0,1,0,0,0,0,0
455,0,0,0,1,0,0,0,0,0
456,0,0,0,1,0,0,0,0,0
457,0,0,0,1,0,0,0,0,0


In [84]:
# ohe for mobile_color 
data_color = pd.get_dummies(data['mobile_color'])
data_color

Unnamed: 0,Black,Blue,Brick,Brown,Champagne,Concrete,Cyan,Edition,Gold,Gray,...,Red,Ruby,Sapphire,Sea,Silver,Violet,Wave,White,gold,white
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
454,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
455,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
456,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
457,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Now merging both into orignal data 

In [85]:
data = pd.concat([data,data_mobile_name,data_color],axis=1)

#### droping mobile_name and mobile_color from the dataset

In [86]:
data_cat = ['mobile_name', 'mobile_color']
data.drop(data_cat,1,inplace=True)

### We have cleaned and preprocessed the dataset now we are ready to go.

# Sepearting x and y from dataset

In [87]:
x = data.drop('mobile_price_inr',1)
y = data['mobile_price_inr']

# Spliting x and y into train and test set

In [88]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.25,random_state=42)

# Applying linear Regression to x_train and y_train

In [89]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [90]:
lr.fit(x_train,y_train)

# Predicting value of x_test

In [91]:
y_pred = lr.predict(x_test)
y_pred

array([15121.07516696,  7499.00000002, 23719.49201789, 16647.39972956,
       12195.72395092, 18994.32000619, 13685.53189924, 17280.83516531,
       17933.34797053, 10405.42654304, 11450.6446476 , 21184.61906853,
       11245.43325255, 11073.43536605, 14792.13738155, 13906.53238472,
       17018.95205186, 12006.36984249, 34022.88024407,  6577.85919369,
       36180.10127413, 17492.30941809, 17838.61099738, 22120.41939367,
       19455.19904789, 11440.02809815,  9035.3520687 , 14835.71295837,
       14070.64234211, 16907.60433276, 21117.69285861, 26696.3159681 ,
       31590.00000009, 18676.59282032,  9240.56346374,  9508.17794485,
       19040.66938089,  9943.12128733, 11244.59674658,  8081.28899446,
       17904.57178602, 23445.03802976, 17592.38225113,  8488.47050517,
       12006.36984249, 20191.76011016,  9256.35255417, 13345.83266396,
       21093.32019685, 11466.43373802,  4969.12772997, 17371.38176566,
       15685.97624418, 16491.28208364, 11249.32952873, 21439.11441956,
      

In [92]:
x_test

Unnamed: 0,disp_size(cm/inch),resolution(pixels),os,num_cores,mp_speed(GHz),int_memory(GB),ram(GB),battery_power(mAh),mob_width(mm),mob_height(mm),...,Red,Ruby,Sapphire,Sea,Silver,Violet,Wave,White,gold,white
124,16.00,2462400,9,8,2.0,64,4.0,4000.0,8.35,158.30,...,0,0,0,0,0,0,0,0,0,0
30,15.49,1123200,9,8,2.0,32,3.0,4000.0,73.70,154.30,...,0,0,1,0,0,0,0,0,0,0
199,16.21,2527200,9,8,2.3,128,8.0,4500.0,75.23,159.53,...,0,0,0,0,0,0,0,0,0,0
438,13.97,2073600,6,8,1.5,64,4.0,3200.0,75.20,153.30,...,0,0,0,0,0,0,0,0,0,0
154,16.00,2527200,9,8,2.2,64,4.0,4000.0,75.21,159.21,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,15.80,1094400,9,8,2.0,32,3.0,5000.0,9.00,175.00,...,0,0,0,0,0,0,0,0,0,0
234,16.23,2527200,9,8,2.0,128,8.0,3700.0,74.71,157.25,...,0,0,0,0,0,0,0,0,0,0
238,16.00,2462400,8,8,1.8,32,4.0,3260.0,75.03,154.81,...,0,0,0,0,0,0,0,0,0,0
104,16.23,2527200,9,8,2.2,128,6.0,4000.0,74.30,156.70,...,0,0,0,0,0,0,0,0,0,0


# Checking accuracy of our model

In [93]:
from sklearn.metrics import r2_score
r2_score(y_pred,y_test)

0.759231635915598

In [94]:
# value of m
lr.coef_

array([ 1.45070570e+03,  1.72434138e-04, -1.21640458e+03, -4.13850428e+02,
        5.48730900e+03,  4.28148273e+01,  8.40006710e+02, -2.83744438e+00,
       -1.41241656e+02, -9.84133141e+01, -1.13130276e+02,  5.35847318e+01,
       -1.58804825e+03,  9.91385939e+02, -1.29236677e+03, -5.33298428e+02,
        2.72743726e+04,  3.91165867e-01,  6.91091873e+02,  8.77490924e+01,
        1.11478115e+03, -6.68957835e+01, -3.70239186e+03, -2.34379099e+03,
        2.62605037e+03, -2.90796672e+03, -4.30223456e+02, -1.59324839e+03,
        6.20803393e+03,  2.21043289e+03,  8.45418785e+02,  8.44582279e+02,
        2.58873612e+03, -4.05521304e+03, -4.03588274e-12,  2.58873612e+03,
       -4.17148625e+03, -2.92119245e+03,  1.91110420e+03,  1.03334584e+03,
       -6.69090919e+02,  2.91289676e+03,  1.10579836e+03, -1.84343558e+03,
        5.79671864e+03, -9.09494702e-13, -1.61442074e+01,  1.06641927e+03,
        4.11316341e+02,  4.11316341e+02, -5.65955308e+03,  2.75882561e+03,
       -1.35755788e+03,  

In [95]:
# value of c
lr.intercept_

12739.065680859621

In [96]:
# Checking error in our model
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_pred,y_test))

4088.1564095761787