In [1269]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import category_encoders as ce
import random
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import RandomizedSearchCV
import joblib
import numpy as np


In [1270]:
products_df = pd.read_csv('data/products_full.csv')  
labels_df = pd.read_csv('data/product_labels.csv')  
products_df['y'] = labels_df['label']

# check product dataframe head
#products_df.head()

In [1300]:
products_df.columns

Index(['id', 'sold', 'url', 'name', 'current_price', 'original_price',
       'discount_price', 'average_rating', 'rating_count', 'five_star',
       'four_star', 'three_star', 'two_star', 'one_star', 'seller_rating',
       'chat_response', 'ship_time_rate', 'specs', 'y', 'rom', 'brand',
       'num_camera', 'ppi', 'vid_resolution', 'ram', 'screen_size_inch',
       'phone_type', 'battery_capacity', 'warranty', 'resolution', 'condition',
       'warranty_type', 'operating_system'],
      dtype='object')

In [1272]:
# remove non-numeric characters sold column
# convert sold column to int
pattern = r'[^\d]'
sold_column = products_df['sold'].str.replace(pattern, '', regex=True)
products_df['sold'] = sold_column.astype(int)

In [1273]:
# remove non-numeric characters current price column
# convert current price to float
pattern = r'[^\d.]'
current_price_column = products_df['current_price'].str.replace(pattern, '', regex=True).astype(float)
products_df.current_price = current_price_column

In [1274]:
# remove non-numeric characters discount_price column
# convert discount price to float
pattern = r'[^\d]'
discount_price_column = products_df['discount_price'].str.replace(pattern, '', regex=True).astype(float) / 100
products_df['discount_price'] = discount_price_column

In [1275]:
# remove non-numeric characters ratings count column
# convert ratings count to int
pattern = r'[^\d]'
products_df['rating_count'] = products_df['rating_count'].str.replace(pattern, '', regex=True).astype(int)

In [1276]:
# remove non-numeric characters seller_rating, chat_response, and ship_time_rate
# convert seller_rating, chat_response, and ship_time_rate
pattern = r'[^\d]'
products_df['seller_rating'] = products_df['seller_rating'].str.replace(pattern, '', regex=True).astype(str)
products_df['chat_response'] = products_df['chat_response'].str.replace(pattern, '', regex=True).astype(str)
products_df['ship_time_rate'] = products_df['ship_time_rate'].str.replace(pattern, '', regex=True).astype(str)

In [1277]:
# set empty values of seller_rating, chat_response, and ship_time_rate to '0'
products_df.loc[products_df['seller_rating'] == '', 'seller_rating'] = '0'
products_df.loc[products_df['chat_response'] == '', 'chat_response'] = '0'
products_df.loc[products_df['ship_time_rate'] == '', 'ship_time_rate'] = '0'

In [1278]:
# convert to seller_rating, chat_response, and ship_time_rate to decimal
products_df['seller_rating'] = products_df['seller_rating'].astype(float)  
products_df['chat_response'] = products_df['chat_response'].astype(float)
products_df['ship_time_rate'] = products_df['ship_time_rate'].astype(float) 
#products_df.head()

In [1279]:
# set NaN values to 'no value'

products_df.loc[products_df['specs'].isnull(), 'specs'] = '0'
products_df.loc[products_df['name'].isnull(), 'name'] = '0'
products_df['specs'] = products_df['specs'].astype(str)

In [1280]:
# remove non-ascii characters from specs
pattern = r'[^\x00-\x7F]'
products_df['specs'] = products_df['specs'].str.replace(pattern, '', regex=True).astype(str)

In [1281]:
# remove non-ascii characters from product name
pattern = r'[^\x00-\x7F]'
products_df['name'] = products_df['name'].str.replace(pattern, '', regex=True).astype(str)

# lowercase name feature
products_df['name'] = products_df['name'].str.lower()

# remove whitespace name feature
products_df['name'] = products_df['name'].str.replace(' ', '')

#products_df['name']


In [1282]:
# engineer rom feature from name feature

products_df['rom'] = 'unknown'
products_df.loc[products_df['name'].str.contains('512gb'), 'rom'] = '512gb'
products_df.loc[products_df['name'].str.contains('256gb'), 'rom'] = '256gb'
products_df.loc[products_df['name'].str.contains('128gb'), 'rom'] = '64gb-128gb'
products_df.loc[products_df['name'].str.contains('64gb'), 'rom'] = '64gb-128gb'
products_df.loc[products_df['name'].str.contains('32gb'), 'rom'] = '<64gb'

In [1283]:
# generate new features from specs feature
columns = ['brand', 'num_camera', 'ppi', 'vid_resolution', 'ram', 'screen_size_inch', 'phone_type', 
           'battery_capacity', 'warranty', 'resolution', 'condition', 'warranty_type', 'operating_system']

patterns = [r'Brand\t[^\t]+', r'Number_of_Camera\t[^\t]+', r'PPI\t[^\t]+', r'Video Resolution\t[^\t]+', 
            r'RAM Memory [(]Gb[)]\t[^\t]+', r'Screen Size [(]inches[)]\t[^\t]+', r'Phone Type\t[^\t]+',
            r'Battery Capacity\t[^\t]+', r'warranty\t[^\t]+', r'Resolution\t[^\t]+', r'Condition\t[^\t]+',
            r'Warranty Type\t[^\t]+', r'Operating System\t[^\t]+']

specs_prefix = ['Brand\t', 'Number_of_Camera\t', 'PPI\t', 'Video Resolution\t', 
                'RAM Memory (Gb)\t', 'Screen Size (inches)\t', 'Phone Type\t', 'Battery Capacity\t',
                'warranty\t', 'Resolution\t', 'Condition\t', 'Warranty Type\t', 'Operating System\t']

for i in range(len(columns)):
    col_series = products_df['specs'].str.findall(patterns[i]).str.join('').str.replace(specs_prefix[i], '')
    products_df[columns[i]] = col_series

In [1284]:
# strip and lower feature values
columns = ['brand', 'num_camera', 'ppi', 'vid_resolution', 'ram', 'screen_size_inch', 'phone_type', 
           'battery_capacity', 'warranty', 'resolution', 'condition', 'warranty_type', 'operating_system']

for col in columns:
    products_df[col] = products_df[col].str.lower().str.strip().replace('', '0')

In [1285]:
# pre-processing brand feature

brands = ['huawei', 'philips', 'no brand', 'vivo', 'nokia',
       'samsung', 'oppo', 'cherry', 'telego', '0', 'lenovo', 'xiaomi', 'hyundai', 'realme',
        'motorola', 'tecno','infinix', 'poco', 'meizu', 'oneplus', 'myphone', 'lg', 'apple']

# change not popular and intriguing brands to 'others'
products_df.loc[~products_df['brand'].isin(brands), 'brand'] = 'others'

# change '0' brands to 'no brand' 
products_df.loc[products_df['brand'].eq('0'), 'brand'] = 'no brand'

#print('brand classes:')
#print(products_df['brand'].unique())


In [1286]:
# pre-processing num_camera feature 

cameras = ['0', 'single', 'dual', 'triple', 'quad', 
       'zero', 'none']

no_cameras = ['0', 'zero', 'none']

# set zero or no cameras to 'zero'
products_df.loc[products_df['num_camera'].isin(no_cameras), 'num_camera'] = 'zero'

# set other values of num_cameras to 'others' 
products_df.loc[~products_df['num_camera'].isin(cameras), 'num_camera'] = 'unknown'

#print('number of camera classes:')
#print(products_df['num_camera'].unique())


In [1287]:
# pre-processing RAM feature

products_df['ram'].unique()
ram_classes = ['<2gb', '2gb-4gb', '5gb-8gb', '>8gb', 'unknown']

# <2gb
products_df.loc[products_df['ram'].isin(['1gb', '512mb & under', '512']), 'ram'] = ram_classes[0]

# '2gb-4gb'
products_df.loc[products_df['ram'].isin(['2gb', '4gb', '4/64', '3gb', '3', '3gb or 4gb']), 'ram'] = ram_classes[1]

'5gb-8gb'
products_df.loc[products_df['ram'].isin(['8gb', '6gb', '8']), 'ram'] = ram_classes[2]

'>8gb'
products_df.loc[products_df['ram'].isin(['8gb+5gb', '16gb', '12gb', '12g']), 'ram'] = ram_classes[3]

'0'
products_df.loc[products_df['ram'].isin([ '0', 'other',]), 'ram'] = ram_classes[4]

#print('RAM classes:')
#print(products_df['ram'].unique())

In [1288]:
# pre-processing warranty feature

products_df.loc[products_df['warranty'].str.contains('year'), 'warranty'] = ">= 1year"
products_df.loc[products_df['warranty'].str.contains('month'), 'warranty'] = ">= 1month < 1year"
products_df.loc[products_df['warranty'].str.contains('day'), 'warranty'] = "< 1 month"
products_df.loc[products_df['warranty'] == '0', 'warranty'] = "no warranty"
#products_df['warranty'].unique()


In [1289]:
# pre-processing warranty_type feature

products_df.loc[products_df['warranty_type'].str.contains('local'), 'warranty_type'] = "local warranty"
products_df.loc[products_df['warranty_type'].str.contains('international'), 'warranty_type'] = "international warranty"
products_df.loc[products_df['warranty_type'] == '0', 'warranty_type'] = "no warranty"
#products_df['warranty_type'].unique()

In [1290]:
products_df.loc[products_df['battery_capacity']
                .isin(['under 1000 mah', '1000 - 1999 mah', '1000 mah to 5000 mah', '800 mah', 
                      '1500', '500 mah']), 'battery_capacity'] = "< 2000 mah"

products_df.loc[products_df['battery_capacity']
                .isin(['2000 - 2999 mah', '2800 mah', '3000 - 3999 mah', 'li-po 3400',
                      '3000', '3400', '3020', '3000 mah', '3340', '2500 mah', '3400mah',
                      '3070mah', '3300mah', '3340mah', '3260mah', 'li-ion 3080 mah', 
                      '3000mah', '3315 mah', '3260 mah', '3180 mah', '2630mah','3315mah']), 'battery_capacity'] = ">= 2000 mah < 4000 mah"

products_df.loc[products_df['battery_capacity']
                .isin(['50001 mah and up',  '6000 mah', '4000 mah', '4000 - 4999 mah', '5000 mah & above',
                'li-po 5000 mah',  '4000', '4230-5000 mah', '5800mah','4800 mah & above',
                '4400 mah', '5000mah',  '4500', '5000', '5000 mah', '4100 mah',
                '4230mah', '4230', '4500 mah', '4500mah', '4030 mah', '4020 mah', '4030']), 'battery_capacity'] = ">= 4000 mah < 6000 mah"

products_df.loc[products_df['battery_capacity']
                .isin(['10000mah', '10001 mah to 20000 mah', '6000', '6000mah']), 
                'battery_capacity'] = "6000 mah above"

products_df.loc[products_df['battery_capacity'] == '0', 'battery_capacity'] = "unknown"

In [1291]:
# remove samples that has not_applicable label
products_df = products_df[products_df["y"].str.contains("not applicable") == False]

In [1292]:
# create encoder for categorical data
categorical_features = ['brand', 'warranty', 'warranty_type', 'num_camera', 'battery_capacity', 'ram', 'rom']

features = categorical_features
X = products_df[features]
y = products_df['y']

# create categorical encoder
categorical_encoder = OrdinalEncoder()
X[categorical_features] = categorical_encoder.fit_transform(X[categorical_features])

# export categorical encoder
joblib.dump(categorical_encoder, 'categorical_encoder')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[categorical_features] = categorical_encoder.fit_transform(X[categorical_features])


['categorical_encoder']

In [1293]:
y = products_df['y'] 
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# export categorical encoder
joblib.dump(label_encoder, 'label_encoder')

['label_encoder']

In [1294]:
cat_encoder = joblib.load("categorical_encoder")
lab_encoder = joblib.load("label_encoder")

#result = cat_encoder.transform([['xiaomi', 'no warranty', 'no warranty', 'quad', '>= 4000 mah < 6000 mah', '>8gb', '64gb-128gb']])

In [1295]:
# model creation

numerical_features = ['current_price', 'discount_price', 'sold', 'average_rating', 'rating_count', 'five_star',
                      'four_star', 'three_star', 'two_star', 'one_star']
categorical_features = ['brand', 'warranty', 'warranty_type', 'num_camera', 'battery_capacity', 'ram', 'rom']

features = numerical_features + categorical_features
X = products_df[features]
y = products_df['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random.randint(0, 100))
X_train[categorical_features] = cat_encoder.transform(X_train[categorical_features])
X_test[categorical_features] = cat_encoder.transform(X_test[categorical_features])


y_train = lab_encoder.transform(y_train)
y_test = lab_encoder.transform(y_test)


classifier = RandomForestClassifier(random_state=random.randint(0, 100))

classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# export model 
joblib.dump(classifier, 'inauth_prod_classifier')

# classifier benchmark scores 
print('Precision score: {0:0.2f}'. format(precision_score(y_test, y_pred, average='weighted') * 100))
print('Recall score: {0:0.2f}'. format(recall_score(y_test, y_pred, average='weighted')  * 100))
print('F1 score: {0:0.2f}'. format(f1_score(y_test, y_pred, average='weighted') * 100))
print('Accuracy score: {0:0.2f}'. format(accuracy_score(y_test, y_pred)  * 100))
print()

# get feature scores
feature_scores = pd.Series(classifier.feature_importances_, index=X_train.columns).sort_values(ascending=False)
print('feature scores:')
print(feature_scores)
print()



Precision score: 92.27
Recall score: 92.23
F1 score: 92.25
Accuracy score: 92.23

feature scores:
current_price       0.220641
discount_price      0.104064
sold                0.086334
rom                 0.067252
five_star           0.067181
rating_count        0.064967
brand               0.064030
four_star           0.059241
warranty            0.046391
three_star          0.044128
average_rating      0.033720
num_camera          0.031314
battery_capacity    0.030247
ram                 0.023577
one_star            0.022918
warranty_type       0.017472
two_star            0.016524
dtype: float64



In [1296]:
products_df[categorical_features]

Unnamed: 0,brand,warranty,warranty_type,num_camera,battery_capacity,ram,rom
0,huawei,< 1 month,local warranty,zero,unknown,2gb-4gb,unknown
1,philips,>= 1month < 1year,local warranty,single,< 2000 mah,unknown,unknown
2,philips,>= 1year,local warranty,single,< 2000 mah,5gb-8gb,unknown
3,philips,>= 1year,local warranty,single,< 2000 mah,unknown,unknown
4,no brand,>= 1month < 1year,local warranty,dual,< 2000 mah,unknown,unknown
...,...,...,...,...,...,...,...
1014,poco,>= 1year,local warranty,dual,>= 4000 mah < 6000 mah,>8gb,512gb
1015,oppo,>= 1year,local warranty,dual,>= 4000 mah < 6000 mah,>8gb,512gb
1016,tecno,>= 1year,local warranty,triple,>= 4000 mah < 6000 mah,2gb-4gb,64gb-128gb
1017,others,no warranty,no warranty,dual,>= 4000 mah < 6000 mah,unknown,512gb


In [1297]:
X.iloc[2].tolist()

[1709.0,
 0.19,
 680,
 4.6,
 155,
 127,
 10,
 9,
 4,
 5,
 'philips',
 '>= 1year',
 'local warranty',
 'single',
 '< 2000 mah',
 '5gb-8gb',
 'unknown']

In [1298]:
products_df['battery_capacity'].sort_values().unique()

array(['6000 mah above', '< 2000 mah', '>= 2000 mah < 4000 mah',
       '>= 4000 mah < 6000 mah', 'unknown'], dtype=object)

In [1299]:
"""
features:
numerical_features = ['current_price', 'discount_price', 'sold', 
                    'average_rating', 'rating_count', 'five_star',
                    'four_star', 'three_star', 'two_star', 'one_star']

categorical_features = ['brand', 'warranty', 'warranty_type', 'num_camera', 
                        'battery_capacity', 'ram', 'rom']
"""

"\nfeatures:\nnumerical_features = ['current_price', 'discount_price', 'sold', \n                    'average_rating', 'rating_count', 'five_star',\n                    'four_star', 'three_star', 'two_star', 'one_star']\n\ncategorical_features = ['brand', 'warranty', 'warranty_type', 'num_camera', \n                        'battery_capacity', 'ram', 'rom']\n"