In [1]:
import numpy as np
import pandas as pd
import os
        
TRAIN_DATASET_PATH = "/kaggle/input/mobile-prices-2023/mobile_prices_2023.csv"
dataset = pd.read_csv(TRAIN_DATASET_PATH)

print("dataset size: ", dataset.shape)
dataset.head()

dataset size:  (1836, 11)


Unnamed: 0,Phone Name,Rating ?/5,Number of Ratings,RAM,ROM/Storage,Back/Rare Camera,Front Camera,Battery,Processor,Price in INR,Date of Scraping
0,"POCO C50 (Royal Blue, 32 GB)",4.2,33561,2 GB RAM,32 GB ROM,8MP Dual Camera,5MP Front Camera,5000 mAh,"Mediatek Helio A22 Processor, Upto 2.0 GHz Pro...","₹5,649",2023-06-17
1,"POCO M4 5G (Cool Blue, 64 GB)",4.2,77128,4 GB RAM,64 GB ROM,50MP + 2MP,8MP Front Camera,5000 mAh,Mediatek Dimensity 700 Processor,"₹11,999",2023-06-17
2,"POCO C51 (Royal Blue, 64 GB)",4.3,15175,4 GB RAM,64 GB ROM,8MP Dual Rear Camera,5MP Front Camera,5000 mAh,Helio G36 Processor,"₹6,999",2023-06-17
3,"POCO C55 (Cool Blue, 64 GB)",4.2,22621,4 GB RAM,64 GB ROM,50MP Dual Rear Camera,5MP Front Camera,5000 mAh,Mediatek Helio G85 Processor,"₹7,749",2023-06-17
4,"POCO C51 (Power Black, 64 GB)",4.3,15175,4 GB RAM,64 GB ROM,8MP Dual Rear Camera,5MP Front Camera,5000 mAh,Helio G36 Processor,"₹6,999",2023-06-17


In [2]:
dataset.dtypes

Phone Name            object
Rating ?/5           float64
Number of Ratings     object
RAM                   object
ROM/Storage           object
Back/Rare Camera      object
Front Camera          object
Battery               object
Processor             object
Price in INR          object
Date of Scraping      object
dtype: object

In [3]:
print('Date of Scraping unqiue values:', dataset['Date of Scraping'].unique())
print('Battery unqiue units of measurement:', dataset['Battery'].str.slice(-3).unique())

Date of Scraping unqiue values: ['2023-06-17']
Battery unqiue units of measurement: ['mAh' nan]


In [4]:
print(len(dataset['Back/Rare Camera'].unique()))
print(len(dataset['Front Camera'].unique()))
print(len(dataset['Processor'].unique()))

175
34
333


In [5]:
dataset['Front Camera'] = dataset['Front Camera'].str.extract('(\d+)')

In [6]:
# Перевод ROM/Storage в KB
dataset['ROM/Storage'] = dataset['ROM/Storage'].str.replace('ROM', '')
dataset['ROM/Storage'] = dataset['ROM/Storage'].where(dataset['ROM/Storage'].str.match('\d+\s(GB|MB|KB)'), np.nan).str.strip()
dataset['ROM/Storage value'] = dataset['ROM/Storage'].str.extract('(\d+)')
dataset['ROM/Storage union'] = dataset['ROM/Storage'].str.extract('([A-Z]+)')
dataset.loc[dataset['ROM/Storage union'] == 'GB', 'ROM/Storage value'] = dataset.loc[dataset['ROM/Storage union'] == 'GB', 'ROM/Storage value'].astype(int) * 1024 * 1024
dataset.loc[dataset['ROM/Storage union'] == 'MB', 'ROM/Storage value'] = dataset.loc[dataset['ROM/Storage union'] == 'MB', 'ROM/Storage value'].astype(int) * 1024
dataset.loc[dataset['ROM/Storage union'] == 'KB', 'ROM/Storage value'] = dataset.loc[dataset['ROM/Storage union'] == 'KB', 'ROM/Storage value'].astype(int)
dataset['ROM/Storage'] = dataset['ROM/Storage value']
dataset = dataset.drop(['ROM/Storage union', 'ROM/Storage value'], axis=1)

In [7]:
# Перевод RAM в MB
dataset['RAM'] = dataset['RAM'].str.replace('ROM', '')
dataset['RAM'] = dataset['RAM'].str.replace('RAM', '')
dataset['RAM'] = dataset['RAM'].where(dataset['RAM'].str.match('\d+\s(GB|MB|TB)'), np.nan).str.strip()
dataset['RAM value'] = dataset['RAM'].str.extract('(\d+)')
dataset['RAM union'] = dataset['RAM'].str.extract('([A-Z]+)')
dataset.loc[dataset['RAM union'] == 'TB', 'RAM value'] = dataset.loc[dataset['RAM union'] == 'TB', 'RAM value'].astype(int) * 1024 * 1024
dataset.loc[dataset['RAM union'] == 'GB', 'RAM value'] = dataset.loc[dataset['RAM union'] == 'GB', 'RAM value'].astype(int) * 1024
dataset.loc[dataset['RAM union'] == 'MB', 'RAM value'] = dataset.loc[dataset['RAM union'] == 'MB', 'RAM value'].astype(int)
dataset['RAM'] = dataset['RAM value']
dataset = dataset.drop(['RAM union', 'RAM value'], axis=1)

In [8]:
dataset['Back/Rare Camera'] = dataset['Back/Rare Camera'].str.extract('(\d+)MP')

In [9]:
dataset['Battery'] = dataset['Battery'].str.extract('(\d+)')
dataset['Price in INR'] = dataset['Price in INR'].str.slice(1).str.replace(',', '').astype(int)
dataset['Number of Ratings'] = dataset['Number of Ratings'].str.replace(',', '').astype(int)

In [10]:
drop_cols = [
    'Phone Name',
    'Date of Scraping',
    'Processor',
]

dataset = dataset.drop(drop_cols, axis=1)

In [11]:
nan_cols = dataset.columns[dataset.isna().any()].tolist()
print("Nan cols:", nan_cols)

Nan cols: ['RAM', 'ROM/Storage', 'Back/Rare Camera', 'Front Camera', 'Battery']


In [12]:
from sklearn.impute import SimpleImputer

mod_cols = [
    'Battery',
    'ROM/Storage',
    'RAM'
]
mode_imputer = SimpleImputer(strategy='most_frequent')
dataset[mod_cols] = mode_imputer.fit_transform(dataset[mod_cols])

zero_cols = [
    'Back/Rare Camera',
    'Front Camera'
]
dataset[zero_cols] = dataset[zero_cols].fillna(0)

dataset['Battery'] = dataset['Battery'].astype(int)
dataset['Front Camera'] = dataset['Front Camera'].astype(int)
dataset['Back/Rare Camera'] = dataset['Back/Rare Camera'].astype(int)
dataset['ROM/Storage'] = dataset['ROM/Storage'].astype(int)
dataset['RAM'] = dataset['RAM'].astype(int)
dataset['Number of Ratings'] = dataset['Number of Ratings'].astype('category').cat.codes

In [13]:
dataset.head()

Unnamed: 0,Rating ?/5,Number of Ratings,RAM,ROM/Storage,Back/Rare Camera,Front Camera,Battery,Price in INR
0,4.2,535,2048,33554432,8,5,5000,5649
1,4.2,614,4096,67108864,50,8,5000,11999
2,4.3,442,4096,67108864,8,5,5000,6999
3,4.2,491,4096,67108864,50,5,5000,7749
4,4.3,442,4096,67108864,8,5,5000,6999


In [14]:
dataset.dtypes

Rating ?/5           float64
Number of Ratings      int16
RAM                    int64
ROM/Storage            int64
Back/Rare Camera       int64
Front Camera           int64
Battery                int64
Price in INR           int64
dtype: object

In [15]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

Y = dataset['Price in INR']
X = dataset.drop('Price in INR', axis=1)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.5)

cls = make_pipeline(
    MinMaxScaler(),
    LinearRegression()
)

cls = cls.fit(X_train.to_numpy(), Y_train)
y_train_preds = cls.predict(X_train.to_numpy())

y_test_preds = cls.predict(X_test.to_numpy())

In [16]:
train_predict_result = pd.DataFrame({
    'pred': Y_train.to_numpy(),
    'real': y_train_preds.astype(int)
})

train_predict_result.head()

Unnamed: 0,pred,real
0,8999,6021
1,15999,20932
2,11999,16037
3,10499,7320
4,7999,16035


In [17]:
test_predict_result = pd.DataFrame({
    'pred': Y_test.to_numpy(),
    'real': y_test_preds.astype(int)
})

test_predict_result.head()

Unnamed: 0,pred,real
0,44990,29150
1,1549,10182
2,2499,22119
3,64990,44803
4,33999,37281


In [18]:
from sklearn.metrics import mean_absolute_error, r2_score

print('Train dataset:')

print(f'MAE: {mean_absolute_error(y_train_preds, Y_train)}')
print(f'R^2: {r2_score(y_train_preds, Y_train)}')

Train dataset:
MAE: 7239.4354479013555
R^2: 0.6612963186905347


In [19]:
print('Test dataset:')

print(f'MAE: {mean_absolute_error(y_test_preds, Y_test)}')
print(f'R^2: {r2_score(y_test_preds, Y_test)}')

Test dataset:
MAE: 8320.259301722697
R^2: 0.5642745858620253
