## LIBS

In [1]:
import pandas as pd
import os
import numpy as np
import yaml
import re
import random
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
random.seed(42) 
#42 is the answer to everything !!!

In [2]:
config = yaml.safe_load(open("config.yaml"))

In [3]:
dfs = []

for file in config['files']:
    df_sub = pd.read_csv(os.path.join(config['SavePath'], file + ".csv"))
    dfs.append(df_sub)

df = pd.concat(dfs)

In [4]:
df.shape

(5999, 52)

## TARGET VARIABLE 

In [5]:
def convert_to_euro(price):
    price = price.replace("\u2009", "").replace("\t", "").replace(",", "")
    if pd.isnull(price):
        return np.nan
    if price.startswith("About"):
        price = float(price.split(" ")[1]) + round(random.uniform(0, 9), 2)
    else:
        euro_tag = [p for p in price.split("/") if "€" in p]
        if len(euro_tag) > 0:
            price = float(euro_tag[0].split("€")[1])
        else:
            currency_tag = []
            for c in config['conversion_rates'].keys():
                if c in price:
                    currency_tag.append(c)
            if currency_tag:
                number = float(re.findall("\d+", price.split(currency_tag[0])[1])[0])
                price = config['conversion_rates'][currency_tag[0]] * number 
    return price

In [6]:
df = df[~df['MISC_Price'].str.contains('BTC', na=False)]
df = df.dropna(subset=['MISC_Price'])
df = df.reset_index(drop=True)

In [7]:
df['MISC_Price_Euro'] = df['MISC_Price'].apply(convert_to_euro)

In [8]:
df[['Name','MISC_Price', 'MISC_Price_Euro']].head(10)

Unnamed: 0,Name,MISC_Price,MISC_Price_Euro
0,Acer Chromebook Tab 10,About 330 EUR,335.75
1,Acer Iconia Talk S,About 170 EUR,170.23
2,Acer Liquid Z6 Plus,About 250 EUR,252.48
3,Acer Liquid Z6,About 120 EUR,122.01
4,Acer Iconia Tab 10 A3-A40,About 230 EUR,236.63
5,Acer Liquid X2,About 230 EUR,236.09
6,Acer Liquid Zest Plus,About 200 EUR,208.03
7,Acer Liquid Zest,About 110 EUR,110.78
8,Acer Predator 8,About 350 EUR,353.8
9,Acer Liquid Jade Primo,About 220 EUR,220.27


## Other variables

In [9]:
df.info(
    null_counts=True
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4157 entries, 0 to 4156
Data columns (total 53 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Brand                  4157 non-null   object 
 1   url                    4157 non-null   object 
 2   imgUrl                 4157 non-null   object 
 3   Name                   4157 non-null   object 
 4   NETWORK_Technology     4157 non-null   object 
 5   NETWORK_2G_bands       4157 non-null   object 
 6   NETWORK_3G_bands       3369 non-null   object 
 7   NETWORK_4G_bands       2157 non-null   object 
 8   NETWORK_5G_bands       303 non-null    object 
 9   NETWORK_GPRS           960 non-null    object 
 10  NETWORK_EDGE           954 non-null    object 
 11  NETWORK_Speed          3353 non-null   object 
 12  LAUNCH_Announced       4156 non-null   object 
 13  LAUNCH_Status          4157 non-null   object 
 14  BODY_Dimensions        4154 non-null   object 
 15  BODY

  df.info(


In [10]:
df.drop(['MISC_Price'], axis=1, inplace=True)

### Brand, url and Name

In [11]:
# #label encoding for Brand, drop url, imgUrl, Name
# from sklearn.preprocessing import LabelEncoder
# enc_brand = LabelEncoder()
# df['Brand'] = enc_brand.fit_transform(df['Brand'])
# df = df.drop(['url', 'imgUrl', 'Name'], axis=1)

#### Người ta bảo brand hay bị giá ảo -> test liền target encoding


In [12]:
from sklearn.preprocessing import TargetEncoder
enc_brand = TargetEncoder()
df['Brand'] = enc_brand.fit_transform(np.array(df['Brand']).reshape(-1, 1), df['MISC_Price_Euro'])
df = df.drop(['url', 'imgUrl', 'Name'], axis=1)

### Network_2345G

In [13]:
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']].head(10)

Unnamed: 0,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands
0,,,,
1,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 850 / 1900 / 2100,"1, 3, 7, 8, 20",
2,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
3,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
4,,,,
5,GSM 850 / 900 / 1800 / 1900,"HSDPA 900 / 1900 / 2100 - Europe, Taiwan",LTE 800 / 1800 / 2100 / 2600 - Europe,
6,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA,LTE (unspecified),
7,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2 (dual-SIM model only),HSDPA 850 / 900 / 1900 / 2100,LTE (unspecified),
8,,,,
9,GSM 850 / 900 / 1800 / 1900 - SIM 1 & SIM 2,HSDPA 850 / 900 / 1900 / 2100 - Europe/ Taiwan,LTE 800 / 1800 / 2100 / 2600 - Europe,


In [14]:
print("2G number of unique values: ", df['NETWORK_2G_bands'].nunique())
print("3G number of unique values: ", df['NETWORK_3G_bands'].nunique())
print("4G number of unique values: ", df['NETWORK_4G_bands'].nunique())
print("5G number of unique values: ", df['NETWORK_5G_bands'].nunique())

2G number of unique values:  100
3G number of unique values:  337
4G number of unique values:  795
5G number of unique values:  171


#### Target Encoding 

In [15]:
from category_encoders import TargetEncoder
enc_network = TargetEncoder()
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']] = enc_network.fit_transform(df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']], df['MISC_Price_Euro'])

In [16]:
df[['NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands']].head(10)

Unnamed: 0,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands
0,337.6165,143.225901,148.766615,283.438847
1,420.931529,198.865696,170.390531,283.438847
2,263.010912,577.429776,615.013681,283.438847
3,263.010912,577.429776,615.013681,283.438847
4,337.6165,143.225901,148.766615,283.438847
5,211.937451,293.919541,292.016863,283.438847
6,263.010912,577.429776,615.013681,283.438847
7,263.010912,369.351015,615.013681,283.438847
8,337.6165,143.225901,148.766615,283.438847
9,420.931529,291.861225,292.016863,283.438847


### Network GPRS EDGE SPEED

In [17]:
df[['NETWORK_GPRS','NETWORK_EDGE']].head(10)

Unnamed: 0,NETWORK_GPRS,NETWORK_EDGE
0,No,No
1,,
2,Yes,Yes
3,Yes,Yes
4,No,No
5,,
6,Yes,Yes
7,,
8,No,No
9,,


In [18]:
df[['NETWORK_GPRS','NETWORK_EDGE']].nunique()

NETWORK_GPRS    6
NETWORK_EDGE    4
dtype: int64

In [19]:
#using same label encoding for GPRS and EDGE
from sklearn.preprocessing import LabelEncoder
enc_gprs = LabelEncoder()
df['NETWORK_GPRS'] = enc_gprs.fit_transform(df['NETWORK_GPRS'])
df['NETWORK_EDGE'] = enc_gprs.fit_transform(df['NETWORK_EDGE'])

In [20]:
df[['NETWORK_Speed', 'NETWORK_Technology']].head(10)

Unnamed: 0,NETWORK_Speed,NETWORK_Technology
0,,No cellular connectivity
1,"HSPA 42.2/11.5 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
2,"HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
3,"HSPA, LTE",GSM / HSPA / LTE
4,,No cellular connectivity
5,"HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
6,"HSPA, LTE",GSM / HSPA / LTE
7,"HSPA, LTE Cat4 150/50 Mbps",GSM / HSPA / LTE
8,,No cellular connectivity
9,"HSPA 42.2/5.76 Mbps, LTE-A (2CA) Cat6 300/50 Mbps",GSM / HSPA / LTE


In [21]:
df['NETWORK_Technology'].unique()

array(['No cellular connectivity', 'GSM / HSPA / LTE', 'GSM / HSPA',
       'GSM', 'GSM / UMTS', 'GSM / CDMA / HSPA / LTE', 'LTE',
       'CDMA / EVDO', 'GSM / HSPA / EVDO / LTE',
       'GSM / CDMA / HSPA / EVDO / LTE / 5G', 'GSM / HSPA / LTE / 5G',
       'GSM / CDMA / HSPA / EVDO / LTE', 'GSM / CDMA / HSPA / EVDO',
       'GSM / CDMA / HSPA / LTE / 5G', 'GSM / LTE', 'HSPA / LTE',
       'GSM / CDMA / EVDO', 'GSM / CDMA / HSPA / CDMA2000 / LTE / 5G',
       'GSM / CDMA / HSPA / CDMA2000 / LTE', 'GSM / CDMA',
       'GSM / HSPA / EVDO / LTE / 5G', 'LTE / 5G',
       'GSM / CDMA / EVDO / LTE', 'GSM / CDMA / HSPA',
       'CDMA / EVDO / LTE', 'CDMA / HSPA / EVDO / LTE',
       'GSM / UMTS / HSPA', 'GSM / HSPA / EVDO', 'CDMA / LTE',
       'CDMA / CDMA2000', 'CDMA', 'HSPA / EVDO', 'CDMA / HSPA',
       'CDMA / HSPA / EVDO', 'GSM / CDMA2000'], dtype=object)

In [22]:
df['NETWORK_Speed'].unique()

array([nan, 'HSPA 42.2/11.5 Mbps, LTE Cat4 150/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps', 'HSPA, LTE',
       'HSPA, LTE Cat4 150/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 42.2/5.76 Mbps', 'HSPA 21.1/5.76 Mbps, LTE Cat4 150/50 Mbps',
       'HSPA', 'HSPA 21.1/5.76 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE-A Cat4 150/50 Mbps',
       'HSPA 7.2/5.76 Mbps', 'HSPA 7.2/0.384 Mbps', 'HSPA 14.4/5.76 Mbps',
       'HSPA 7.2/2 Mbps', 'TD-SCDMA', 'HSPA 14.4/2 Mbps',
       'HSPA 3.6/0.384 Mbps', 'HSPA 42.2/11.1 Mbps, LTE Cat4 150/50 Mbps',
       'LTE', 'No', 'HSPA 42.2/11.5 Mbps, LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat4 150/50 Mbps or LTE-A (2CA) Cat6 300/50 Mbps',
       'HSPA 21.1/5.76 Mbps, LTE', 'HSPA 42.2/11.5 Mbps',
       'HSPA 42.2/5.76 Mbps, LTE Cat3 100/50 Mbps',
       'HSPA 21.1/5.76 Mbps, LTE Cat3 100/50 Mbps',
       'EV-DO Rev.A 3.1 Mbps',
       'HSPA 42.2/11.5 Mbps, LTE Cat7 300/100 Mbps',
   

2 thằng sussy này có vẻ ảnh hưởng tới giá nhìu đó mấy ní => target encoding thui

In [23]:
#target encoding for NETWORK_Technology and NETWORK_Speed
enc_network = TargetEncoder()
df[['NETWORK_Technology']] = enc_network.fit_transform(df[['NETWORK_Technology']], df['MISC_Price_Euro'])
df[['NETWORK_Speed']] = enc_network.fit_transform(df[['NETWORK_Speed']], df['MISC_Price_Euro'])

### Launch annouced với status ko liên quan lắm (có liên quan thì có lẽ là kiểu dựa vào đó rồi train riêng theo từng thời kì, nhưng mà mình chưa biết làm thế nào để xử lý nên thôi)

### Body dim, weight, build

In [24]:
df[['BODY_Dimensions','BODY_Weight', 'BODY_Build',]].head(10)

Unnamed: 0,BODY_Dimensions,BODY_Weight,BODY_Build
0,238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in),544.3 g (1.20 lb),
1,191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in),260 g (9.17 oz),
2,153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in),169 g (5.96 oz),
3,145.5 x 72.5 x 8.5 mm (5.73 x 2.85 x 0.33 in),126 g (4.44 oz),
4,259 x 167 x 8.9 mm (10.20 x 6.57 x 0.35 in),-,
5,153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in),166 g (5.86 oz),
6,154 x 77 x 10 mm (6.06 x 3.03 x 0.39 in),-,
7,145.7 x 71.2 x 8.4 mm (5.74 x 2.80 x 0.33 in),125 g (4.41 oz),
8,217.9 x 127 x 8.6 mm (8.58 x 5.0 x 0.34 in),353.8 g (12.49 oz),
9,156.5 x 75.9 x 8.4 mm (6.16 x 2.99 x 0.33 in),150 g (5.29 oz),


In [25]:
df['BODY_Build'].unique()

array([nan, 'Glass front, plastic back, plastic frame',
       'Glass front, plastic frame, plastic back',
       'Glass front (DragonTrail Pro glass), plastic back, plastic frame',
       'Glass front (Asahi Dragontrail), glass back (Asahi Dragontrail), plastic frame',
       'Glass front (Gorilla Glass 4), glass back (Gorilla Glass 4), aluminum frame',
       'Glass front (Corning-made glass), glass back (Corning-made glass), titanium frame (grade 5)',
       'Glass front (Corning-made glass), glass back (Corning-made glass), aluminum frame',
       'Sapphire crystal front, ceramic/sapphire crystal back, titanium frame',
       'Glass front, ceramic/sapphire crystal back, stainless steel frame',
       'Glass front, ceramic/sapphire crystal back, aluminum frame',
       'Glass front, aluminum back, aluminum frame',
       'Glass front (Corning-made glass), glass back (Corning-made glass), stainless steel frame',
       'Glass front, plastic/sapphire crystal back, aluminum frame',
   

In [26]:
# #return indexes of rows with foldable body
# contains_folded = df['BODY_Build'].str.contains("Unfolded")
# #IF value is True, return index of row
# folded_indices = df[contains_folded].index.tolist()
# #calculate mean of BODY_Dimensions and BODY_Weight for foldable phones 
# mean_body_dimensions = df.iloc[folded_indices]['BODY_Dimensions'].mean()
# #replace foldable phones BODY_Dimensions with mean
# df.loc[folded_indices, 'BODY_Dimensions'] = mean_body_dimensions



In [27]:
def body_dim_volume_calc(dimensions):
    # Extract the numerical values from the string
    if pd.isnull(dimensions) or dimensions == '-' or ("thickness" in dimensions):
        return np.nan
    dims = re.findall(r'(\d+\.?\d*)', dimensions)
    # Convert the strings to floats and calculate the volume (2 decimal places)
    volume = float(dims[0]) * float(dims[1]) * float(dims[2])
    volume = round(volume, 2)
    return volume

dim_test = "238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in)"
print(body_dim_volume_calc(dim_test))

406249.07


In [28]:
df['BODY_Dimensions']

0                                           238.3 x 172.2 x 9.9 mm (9.38 x 6.78 x 0.39 in)
1                                             191.7 x 101 x 9.4 mm (7.55 x 3.98 x 0.37 in)
2                                            153.8 x 75.6 x 8.5 mm (6.06 x 2.98 x 0.33 in)
3                                            145.5 x 72.5 x 8.5 mm (5.73 x 2.85 x 0.33 in)
4                                              259 x 167 x 8.9 mm (10.20 x 6.57 x 0.35 in)
5                                            153.3 x 78.8 x 8.5 mm (6.04 x 3.10 x 0.33 in)
6                                                 154 x 77 x 10 mm (6.06 x 3.03 x 0.39 in)
7                                            145.7 x 71.2 x 8.4 mm (5.74 x 2.80 x 0.33 in)
8                                              217.9 x 127 x 8.6 mm (8.58 x 5.0 x 0.34 in)
9                                            156.5 x 75.9 x 8.4 mm (6.16 x 2.99 x 0.33 in)
10                                             136 x 66.5 x 9.6 mm (5.35 x 2.62 x 0.38 in)

## testing w/ models

In [29]:
#print corr of each feature with price
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

  corr = df.corr()


Unnamed: 0,Brand,NETWORK_Technology,NETWORK_2G_bands,NETWORK_3G_bands,NETWORK_4G_bands,NETWORK_5G_bands,NETWORK_GPRS,NETWORK_EDGE,NETWORK_Speed,MAIN_CAM_1_Module,SELFIE_CAM_2_Module,MISC_Price_Euro
Brand,1.0,0.103441,0.24791,0.080666,0.31738,0.046973,0.035706,0.066239,0.111682,-0.05641,0.08733,0.228486
NETWORK_Technology,0.103441,1.0,0.305557,0.446449,0.408463,0.325187,0.443051,0.440853,0.650622,0.057994,0.572038,0.171156
NETWORK_2G_bands,0.24791,0.305557,1.0,0.171482,0.237639,0.120422,0.229076,0.179766,0.22297,0.122573,0.298953,0.134723
NETWORK_3G_bands,0.080666,0.446449,0.171482,1.0,0.284751,0.238735,0.458639,0.458851,0.313984,0.154817,0.42235,0.153665
NETWORK_4G_bands,0.31738,0.408463,0.237639,0.284751,1.0,0.106517,0.172184,0.172522,0.396095,0.043035,0.240703,0.486711
NETWORK_5G_bands,0.046973,0.325187,0.120422,0.238735,0.106517,1.0,0.095388,0.101508,0.17764,0.045191,0.125875,0.118006
NETWORK_GPRS,0.035706,0.443051,0.229076,0.458639,0.172184,0.095388,1.0,0.90678,0.314166,0.215551,0.556152,0.06789
NETWORK_EDGE,0.066239,0.440853,0.179766,0.458851,0.172522,0.101508,0.90678,1.0,0.323234,0.236773,0.556973,0.066358
NETWORK_Speed,0.111682,0.650622,0.22297,0.313984,0.396095,0.17764,0.314166,0.323234,1.0,0.10016,0.395264,0.216352
MAIN_CAM_1_Module,-0.05641,0.057994,0.122573,0.154817,0.043035,0.045191,0.215551,0.236773,0.10016,1.0,0.345859,-0.015909


NETWORK_GPRS + NETWORK_EDGE khá cùi pắp

In [30]:
#testing with model
from sklearn.model_selection import train_test_split
from sklearn.utils import all_estimators
from sklearn.base import RegressorMixin
import lazypredict
from lazypredict.Supervised import LazyRegressor
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error, r2_score

In [31]:
regressors = config['regressors']
#removed regressors are those not in regressors 
removed_regressors = [est[0] for est in all_estimators() if (est[0] not in regressors)]

regressor_list = [
    est
    for est in all_estimators()
    if (issubclass(est[1], RegressorMixin) and (est[0] not in removed_regressors))
]

In [59]:
X = df[['Brand', 'NETWORK_2G_bands', 'NETWORK_3G_bands', 'NETWORK_4G_bands', 'NETWORK_5G_bands', 'NETWORK_Speed','NETWORK_Technology']]
y = df['MISC_Price_Euro']
X, y = shuffle(X, y, random_state=222)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None, regressors=regressor_list)
models, predictions = reg.fit(X_train, X_test, y_train, y_test)

'tuple' object has no attribute '__name__'
Invalid Regressor(s)


  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:02<00:00,  3.48it/s]


In [60]:
models 

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.61,0.62,535.37,0.03
GradientBoostingRegressor,0.51,0.52,599.85,0.34
LinearRegression,0.35,0.35,693.92,0.01
Ridge,0.35,0.35,693.95,0.01
Lasso,0.35,0.35,694.41,0.01
SGDRegressor,0.35,0.35,694.73,0.01
RandomForestRegressor,0.33,0.34,702.87,1.5
ElasticNet,0.27,0.27,736.01,0.01
ExtraTreesRegressor,0.18,0.19,778.04,0.9
DecisionTreeRegressor,0.14,0.15,797.52,0.03


In [62]:
predictions

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
KNeighborsRegressor,0.61,0.62,535.37,0.03
GradientBoostingRegressor,0.51,0.52,599.85,0.34
LinearRegression,0.35,0.35,693.92,0.01
Ridge,0.35,0.35,693.95,0.01
Lasso,0.35,0.35,694.41,0.01
SGDRegressor,0.35,0.35,694.73,0.01
RandomForestRegressor,0.33,0.34,702.87,1.5
ElasticNet,0.27,0.27,736.01,0.01
ExtraTreesRegressor,0.18,0.19,778.04,0.9
DecisionTreeRegressor,0.14,0.15,797.52,0.03
