In [107]:
import regex as re
from matplotlib import pyplot as plt
from sklearn import datasets
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [108]:
data = pd.read_csv('/content/Mobile phone price - Mobile phone price.csv')
print(data)

       Brand              Model Storage    RAM Screen Size (inches)  \
0      Apple      iPhone 13 Pro  128 GB   6 GB                  6.1   
1    Samsung   Galaxy S21 Ultra  256 GB  12 GB                  6.8   
2    OnePlus              9 Pro  128 GB   8 GB                  6.7   
3     Xiaomi  Redmi Note 10 Pro  128 GB   6 GB                 6.67   
4     Google            Pixel 6  128 GB   8 GB                  6.4   
..       ...                ...     ...    ...                  ...   
402  Samsung   Galaxy Note20 5G     128      8                  6.7   
403   Xiaomi      Mi 10 Lite 5G     128      6                 6.57   
404    Apple  iPhone 12 Pro Max     128      6                  6.7   
405     Oppo              Reno3     128      8                  6.4   
406  Samsung    Galaxy S10 Lite     128      6                  6.7   

            Camera (MP)  Battery Capacity (mAh) Price ($)  
0          12 + 12 + 12                    3095       999  
1    108 + 10 + 10 + 12    

In [109]:
# renaming columns to more usable names
data = data.rename({
    'Brand': 'brand',
    'Model':'model',
    'Storage':'storage',
    'RAM':'ram',
    'Screen Size (inches)':'screen_size',
    'Camera (MP)':'camera',
    'Battery Capacity (mAh)':'battery',
    'Price ($)':'price'
}, axis=1)

data.head()

Unnamed: 0,brand,model,storage,ram,screen_size,camera,battery,price
0,Apple,iPhone 13 Pro,128 GB,6 GB,6.1,12 + 12 + 12,3095,999
1,Samsung,Galaxy S21 Ultra,256 GB,12 GB,6.8,108 + 10 + 10 + 12,5000,1199
2,OnePlus,9 Pro,128 GB,8 GB,6.7,48 + 50 + 8 + 2,4500,899
3,Xiaomi,Redmi Note 10 Pro,128 GB,6 GB,6.67,64 + 8 + 5 + 2,5020,279
4,Google,Pixel 6,128 GB,8 GB,6.4,50 + 12.2,4614,799


In [110]:
# cleaning brand and model names to more general format

data['brand'] = data['brand'].str.lower()
data['model'] = data['model'].str.lower()

In [111]:
# removing string information from storage and ram sizes

data['storage'] = data['storage'].str.replace('GB','')
data['ram'] = data['ram'].str.replace('GB','')

In [112]:
# new feature showing number of cameras

data['n_cameras'] = data['camera'].str.count('\\+') + 1

In [113]:
# four new features each column holding camera resolution

res1 = []
res2 = []
res3 = []
res4 = []

for x in data['camera']:
  resolutions = x.split('+')
  tam = len(resolutions)

  if tam == 1:
    res1.append(resolutions[0])
    res2.append('0')
    res3.append('0')
    res4.append('0')
    
  if tam == 2:
    res1.append(resolutions[0])
    res2.append(resolutions[1])
    res3.append('0')
    res4.append('0')
    
  if tam == 3:
    res1.append(resolutions[0])
    res2.append(resolutions[1])
    res3.append(resolutions[2])
    res4.append('0')
    
  if tam == 4:
    res1.append(resolutions[0])
    res2.append(resolutions[1])
    res3.append(resolutions[2])
    res4.append(resolutions[3])

data['res1'] = res1
data['res2'] = res2
data['res3'] = res3
data['res4'] = res4

data = data.drop(columns='camera')

In [114]:
data.head()

Unnamed: 0,brand,model,storage,ram,screen_size,battery,price,n_cameras,res1,res2,res3,res4
0,apple,iphone 13 pro,128,6,6.1,3095,999,3,12,12.0,12,0
1,samsung,galaxy s21 ultra,256,12,6.8,5000,1199,4,108,10.0,10,12
2,oneplus,9 pro,128,8,6.7,4500,899,4,48,50.0,8,2
3,xiaomi,redmi note 10 pro,128,6,6.67,5020,279,4,64,8.0,5,2
4,google,pixel 6,128,8,6.4,4614,799,2,50,12.2,0,0


In [115]:
# data cleaning

data['res1'] = data['res1'].str.replace('MP','')
data['res2'] = data['res2'].str.replace('MP','')
data['res3'] = data['res3'].str.replace('MP','')
data['res4'] = data['res4'].str.replace('MP','')
data['price'] = data['price'].str.replace('$','')
data['price'] = data['price'].str.replace(',','.')
data.loc[88,'screen_size'] = '6.8'
data.loc[373,'screen_size'] = '7.6'
data.loc[342,'res4'] = 0
data.loc[342,'n_cameras'] = 3
data.loc[292,'res4'] = 0
data.loc[292,'n_cameras'] = 3
data.loc[312,'res4'] = 2
data.loc[312,'n_cameras'] = 4
data.loc[330,'res4'] = 0
data.loc[330,'n_cameras'] = 3
data.loc[361,'res4'] = 8
data.loc[361,'res3'] = 8
data.loc[361,'n_cameras'] = 4
data.loc[367,'res4'] = 0
data.loc[367,'n_cameras'] = 3
data.loc[376,'res4'] = 2
data.loc[376,'n_cameras'] = 4

  data['price'] = data['price'].str.replace('$','')


In [116]:
# converting all columns to proper dtype

data['storage'] = pd.to_numeric(data['storage'])

data['ram'] = pd.to_numeric(data['ram'])

data['screen_size'] = pd.to_numeric(data['screen_size'])

data['price'] = pd.to_numeric(data['price'])

data['res1'] = pd.to_numeric(data['res1'])

data['res2'] = pd.to_numeric(data['res2'])

data['res3'] = pd.to_numeric(data['res3'])

data['res4'] = pd.to_numeric(data['res4'])

In [117]:
for feature in data:
  print(f'Type of {feature}: {data[feature].dtype}')

Type of brand: object
Type of model: object
Type of storage: int64
Type of ram: int64
Type of screen_size: float64
Type of battery: int64
Type of price: float64
Type of n_cameras: int64
Type of res1: float64
Type of res2: float64
Type of res3: int64
Type of res4: float64


In [118]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407 entries, 0 to 406
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   brand        407 non-null    object 
 1   model        407 non-null    object 
 2   storage      407 non-null    int64  
 3   ram          407 non-null    int64  
 4   screen_size  407 non-null    float64
 5   battery      407 non-null    int64  
 6   price        407 non-null    float64
 7   n_cameras    407 non-null    int64  
 8   res1         407 non-null    float64
 9   res2         407 non-null    float64
 10  res3         407 non-null    int64  
 11  res4         407 non-null    float64
dtypes: float64(5), int64(5), object(2)
memory usage: 38.3+ KB


In [120]:
# checking for duplicated data in the dataset

absolute = data.duplicated().sum() 

relative = data.duplicated().mean()*100

print(f'Duplicated:\n\ndata(#): {absolute}\n\nRelative: {round(relative,2)} %')

Duplicated:

data(#): 49

Relative: 12.04 %


In [121]:
# removing duplicated data from the dataset

dup = data[data.duplicated()]

data_without_dup = data.drop_duplicates()

data_without_dup.reset_index()

Unnamed: 0,index,brand,model,storage,ram,screen_size,battery,price,n_cameras,res1,res2,res3,res4
0,0,apple,iphone 13 pro,128,6,6.10,3095,999.0,3,12.0,12.0,12,0.0
1,1,samsung,galaxy s21 ultra,256,12,6.80,5000,1199.0,4,108.0,10.0,10,12.0
2,2,oneplus,9 pro,128,8,6.70,4500,899.0,4,48.0,50.0,8,2.0
3,3,xiaomi,redmi note 10 pro,128,6,6.67,5020,279.0,4,64.0,8.0,5,2.0
4,4,google,pixel 6,128,8,6.40,4614,799.0,2,50.0,12.2,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,401,huawei,p30 lite,128,4,6.15,3340,329.0,3,48.0,8.0,2,0.0
354,402,samsung,galaxy note20 5g,128,8,6.70,4300,1049.0,3,12.0,64.0,12,0.0
355,403,xiaomi,mi 10 lite 5g,128,6,6.57,4160,349.0,4,48.0,8.0,2,2.0
356,404,apple,iphone 12 pro max,128,6,6.70,3687,1099.0,3,12.0,12.0,12,0.0


In [122]:
le = LabelEncoder()
data['brand']= le.fit_transform(data['brand'])

data.head()

Unnamed: 0,brand,model,storage,ram,screen_size,battery,price,n_cameras,res1,res2,res3,res4
0,0,iphone 13 pro,128,6,6.1,3095,999.0,3,12.0,12.0,12,0.0
1,12,galaxy s21 ultra,256,12,6.8,5000,1199.0,4,108.0,10.0,10,12.0
2,9,9 pro,128,8,6.7,4500,899.0,4,48.0,50.0,8,2.0
3,15,redmi note 10 pro,128,6,6.67,5020,279.0,4,64.0,8.0,5,2.0
4,4,pixel 6,128,8,6.4,4614,799.0,2,50.0,12.2,0,0.0


In [123]:
data['model'] = le.fit_transform(data['model'])

data.head()

Unnamed: 0,brand,model,storage,ram,screen_size,battery,price,n_cameras,res1,res2,res3,res4
0,0,113,128,6,6.1,3095,999.0,3,12.0,12.0,12,0.0
1,12,98,256,12,6.8,5000,1199.0,4,108.0,10.0,10,12.0
2,9,19,128,8,6.7,4500,899.0,4,48.0,50.0,8,2.0
3,15,191,128,6,6.67,5020,279.0,4,64.0,8.0,5,2.0
4,4,176,128,8,6.4,4614,799.0,2,50.0,12.2,0,0.0


In [124]:
x = data.drop(columns='price')
y = data['price']

In [130]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=True)

model = RandomForestRegressor()
model.fit(x_train, y_train)

y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error: ", mse)
print("Mean Absolute Error: ", mae)

Mean Squared Error:  18231.234219737747
Mean Absolute Error:  60.299643606623725


In [131]:
df_compare = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_compare

Unnamed: 0,Actual,Predicted
70,169.0,155.288810
218,139.0,157.939394
258,349.0,399.741667
33,259.0,233.300000
42,189.0,196.215000
...,...,...
90,399.0,460.300000
379,449.0,391.400000
341,399.0,276.566667
376,999.0,1036.500000
