# Importing Libraries and data

In [49]:
import re

import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OrdinalEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import xgboost as xgb

In [2]:
! unzip 'samsung_mobile_data.zip'

Archive:  samsung_mobile_data.zip
replace samsungMobilesData.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: samsungMobilesData.csv  
replace samsung_mobile_new_data.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: samsung_mobile_new_data.csv  


In [3]:
df = pd.read_csv('samsungMobilesData.csv')

# Taking a peek at the data

In [4]:
df.sample(10)

Unnamed: 0,name,ratings,price,imgURL,camera,display,battery,storage,ram,processor,android_version
236,"SAMSUNG Galaxy On6 (Black, 64 GB)",4.4,17600,https://rukminim2.flixcart.com/image/312/312/j...,13,AMOLED,4000,64,6,not mentioned,9
400,"SAMSUNG Galaxy A10s (Black, 32 GB)",4.3,10000,https://rukminim2.flixcart.com/image/312/312/k...,13,other_types,4000,32,2,MediaTek,9
348,"SAMSUNG Galaxy A70s (Prism Crush White, 128 GB)",4.2,31000,https://rukminim2.flixcart.com/image/312/312/k...,64,other_types,4500,128,6,Snapdragon(SDM450-F01),10
246,"SAMSUNG Galaxy M32 Prime Edition (Black, 64 GB)",4.2,12185,https://rukminim2.flixcart.com/image/312/312/x...,64,LCD,5000,32,3,not mentioned,11
67,"SAMSUNG Galaxy A52 (Awesome Blue, 128 GB)",4.3,25980,https://rukminim2.flixcart.com/image/312/312/k...,64,HD+ AMOLED,4500,128,6,not mentioned,12
386,"SAMSUNG Galaxy Note 8 (Midnight Black, 64 GB)",4.1,8464,https://rukminim2.flixcart.com/image/312/312/j...,12,other_types,3300,64,6,not mentioned,10
273,"SAMSUNG Galaxy A51 (Prism Crush Blue, 128 GB)",4.2,27999,https://rukminim2.flixcart.com/image/312/312/k...,13,AMOLED,4000,128,8,Snapdragon450,5
133,"SAMSUNG Galaxy A13 (Black, 128 GB)",4.1,14999,https://rukminim2.flixcart.com/image/312/312/l...,50,other_types,5000,128,4,not mentioned,12
277,"SAMSUNG M21 2021 Edition (Charcoal Black, 64 GB)",4.2,11530,https://rukminim2.flixcart.com/image/312/312/k...,48,other_types,6000,64,4,not mentioned,10
12,"SAMSUNG Galaxy F13 (Waterfall Blue, 64 GB)",4.3,9499,https://rukminim2.flixcart.com/image/312/312/x...,50,other_types,6000,64,4,not mentioned,12


In [5]:
df.shape

(407, 11)

In [6]:
df.isna().sum()

name               0
ratings            0
price              0
imgURL             0
camera             0
display            0
battery            0
storage            0
ram                0
processor          0
android_version    0
dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 407 entries, 0 to 406
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   name             407 non-null    object 
 1   ratings          407 non-null    float64
 2   price            407 non-null    int64  
 3   imgURL           407 non-null    object 
 4   camera           407 non-null    int64  
 5   display          407 non-null    object 
 6   battery          407 non-null    int64  
 7   storage          407 non-null    int64  
 8   ram              407 non-null    int64  
 9   processor        407 non-null    object 
 10  android_version  407 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 35.1+ KB


# Final Cleaning of the data

In [8]:
df.drop('imgURL', inplace = True, axis = 1)

In [9]:
def get_color(row):
  matches = re.search(r'\((.*?)\)', row)  # Extracts text within parentheses
  if matches:
    x = matches.group(1).split(',')[0]
    return x.lower().split(' ')[-1]
  return None

df['color'] = df['name'].apply(get_color)

In [10]:
df.color.unique()

array(['copper', 'blue', 'green', 'purple', 'blush', 'black', 'brown',
       'mint', 'graphite', 'lavender', 'violet', 'white', 'navy', 'peach',
       'onyx', 'red', 'cream', 'orange', 'silver', 'bronze', 'lime',
       'aqua', 'teal', 'gold', 'burgundy', 'graygreen', 'gray', 'grey',
       'sand'], dtype=object)

In [11]:
df.sample(10)

Unnamed: 0,name,ratings,price,camera,display,battery,storage,ram,processor,android_version,color
397,"SAMSUNG Galaxy M31s (Mirage Blue, 128 GB)",4.1,18999,64,other_types,6000,128,6,not mentioned,9,blue
54,"SAMSUNG Galaxy A14 5G (Black, 64 GB)",4.1,15999,50,LCD,5000,64,4,not mentioned,12,black
380,"SAMSUNG Galaxy A70 (Black, 128 GB)",4.2,30900,32,other_types,4500,128,6,not mentioned,13,black
17,"SAMSUNG Galaxy M33 5G (Mystique Green, 128 GB)",4.2,16479,50,other_types,6000,128,6,not mentioned,13,green
22,"SAMSUNG Galaxy M33 5G (Mystique Green, 128 GB)",4.2,16479,50,other_types,6000,128,6,not mentioned,1,green
327,"SAMSUNG Galaxy A71 (Haze Crush Silver, 128 GB)",4.3,29499,48,AMOLED,4500,128,8,not mentioned,12,silver
158,"SAMSUNG Galaxy A23 (Black, 128 GB)",4.3,18790,50,other_types,5000,128,8,not mentioned,12,black
9,"SAMSUNG Galaxy F23 5G (Copper Blush, 128 GB)",4.3,15499,50,other_types,5000,128,6,Snapdragon750GProcessor,12,blush
293,"SAMSUNG Galaxy A7-2017 (Gold Sand, 32 GB)",4.3,25900,64,other_types,3600,32,3,not mentioned,12,sand
398,"SAMSUNG Galaxy A32 (Awesome Black, 128 GB)",4.5,24999,64,other_types,5000,128,6,not mentioned,9,black


In [12]:
def get_series(row):
  pattern = r'^(?=[a-zA-Z\d]{1,5}$)(?=[a-zA-Z])(?=.*\d).*$'

  for string in row.split(' '):
    if re.match(pattern, string):
      return string

df['series'] = df['name'].apply(get_series)

In [14]:
df.dropna(axis = 0, inplace =True)

In [15]:
df.shape

(393, 12)

In [16]:
df.sample(3)

Unnamed: 0,name,ratings,price,camera,display,battery,storage,ram,processor,android_version,color,series
100,"SAMSUNG Galaxy A53 (Awesome White, 128 GB)",4.0,31999,64,HD+ AMOLED,5000,128,6,not mentioned,12,white,A53
154,"SAMSUNG Galaxy A22 (Black, 128 GB)",4.2,20499,48,AMOLED,5000,128,6,Heilo806769,11,black,A22
246,"SAMSUNG Galaxy M32 Prime Edition (Black, 64 GB)",4.2,12185,64,LCD,5000,32,3,not mentioned,11,black,M32


In [17]:
df.camera.unique()

array([ 50,  13,   0,  48,  32,  12,   8,  64, 108,  25, 200,  24,   5,
        16,   3])

In [18]:
print(df.display.value_counts())

other_types    239
LCD             69
HD+ AMOLED      42
AMOLED          38
TFT              5
Name: display, dtype: int64


In [19]:
for i in df.columns[4:]:
  print(f'{i.title()} value_counts = \n{df[i].value_counts()}')
  print('------------------------------------------------------')

Display value_counts = 
other_types    239
LCD             69
HD+ AMOLED      42
AMOLED          38
TFT              5
Name: display, dtype: int64
------------------------------------------------------
Battery value_counts = 
5000    192
6000     95
4000     27
4500     25
3700     11
3900      8
7000      7
3000      7
3300      7
4400      4
2600      4
3500      4
4700      2
Name: battery, dtype: int64
------------------------------------------------------
Storage value_counts = 
128    216
64     101
32      40
256     25
16       6
1        3
8        1
512      1
Name: storage, dtype: int64
------------------------------------------------------
Ram value_counts = 
6     126
4     122
8      82
3      28
2      18
12     13
1       4
Name: ram, dtype: int64
------------------------------------------------------
Processor value_counts = 
not mentioned              262
MediaTek                    14
Gen8                        14
Dimensity1080,              12
Dimensity700         

In [20]:
df.drop(['name','ratings'], inplace = True, axis = 1)

In [21]:
df.sample(3)

Unnamed: 0,price,camera,display,battery,storage,ram,processor,android_version,color,series
284,19000,64,other_types,6000,128,8,not mentioned,11,black,M32
73,21999,50,LCD,5000,128,6,Snapdragon695,12,silver,A23
260,14499,5,other_types,5000,128,4,not mentioned,11,white,A13


# First Approach

## Encoding

In [22]:
X = df.drop('price', axis = 1)
y = df['price'].copy()

In [23]:
encoder = OrdinalEncoder()

In [55]:
cat_pipeline = Pipeline([
    ('encoder', OrdinalEncoder()),
    ('std_scaler', StandardScaler())
])

num_pipeline = Pipeline([
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('cat_pipeline', cat_pipeline, [1,5,7,8]),
    ('num_pipeline', num_pipeline, [0,2,3,4,6])
])

In [57]:
X_prepared = full_pipeline.fit_transform(X)

## Splitting the data

In [58]:
X_train, X_test, y_train, y_test = train_test_split(X_prepared ,y ,test_size = 0.2, random_state = 42)

In [59]:
X_train.shape, X_test.shape

((314, 9), (79, 9))

## Defining the model and getting predictions

In [60]:
r_forest = RandomForestRegressor(random_state=42)
d_tree = DecisionTreeRegressor(random_state=42)
xgb_regressor = xgb.XGBRegressor(random_state = 42)

In [61]:
def get_metrics(y_true, y_pred):
  mse = mean_squared_error(y_true, y_pred)
  mae = mean_absolute_error(y_true, y_pred)
  acc_score = r2_score(y_true, y_pred)

  return {
      'mse': mse,
      'mae': mae,
      'rmse': np.sqrt(mse),
      'r2_score': acc_score
  }

In [62]:
r_forest.fit(X_train, y_train)

In [63]:
r_forest_pred = r_forest.predict(X_test)
get_metrics(y_test, r_forest_pred)

{'mse': 9379291.268285716,
 'mae': 2184.9984409282697,
 'rmse': 3062.5628594831674,
 'r2_score': 0.8113503375986281}

In [64]:
d_tree.fit(X_train, y_train)

In [65]:
d_tree_pred = d_tree.predict(X_test)
get_metrics(y_test, d_tree_pred)

{'mse': 16618557.215189874,
 'mae': 2278.0506329113923,
 'rmse': 4076.5864660509624,
 'r2_score': 0.665743911926038}

In [66]:
xgb_regressor.fit(X_train, y_train)

In [69]:
boost_pred = xgb_regressor.predict(X_test)
get_metrics(y_test, boost_pred)

{'mse': 7931565.800042563,
 'mae': 1885.5711283623418,
 'rmse': 2816.30357029255,
 'r2_score': 0.840469053823746}

In [68]:
import pickle

pickle.dump(xgb_regressor, open('best_samsung_price_prediction_model.pkl', 'wb'))