In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Location of the tab-separated mobile phone dataset.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# local copy of the file before running the notebook elsewhere.
DATASET_PATH = 'C:/Users/DELL/Desktop/HK1_2019-2020/DataScience/FinalProject/DS_Final_Project/data/mobile_dataset.csv'

data_df = pd.read_csv(DATASET_PATH, sep='\t')
data_df.info()  # prints column names, non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811 entries, 0 to 2810
Data columns (total 27 columns):
Bluetooth              2811 non-null int64
Width                  2781 non-null float64
Height                 2781 non-null float64
Thick                  2781 non-null float64
Weight                 2646 non-null float64
Memory card            2811 non-null object
OS                     2811 non-null object
SoC                    2811 non-null object
Wi-Fi                  2811 non-null int64
GPU core               2811 non-null object
CPU core               2811 non-null object
SIM type               2811 non-null object
Number of SIM cards    2811 non-null object
USB type               2811 non-null object
USB version            2811 non-null object
Position tracking      2811 non-null int64
Display size           2811 non-null float64
Display resolution     2811 non-null float64
Display color depth    2811 non-null object
Image resolution       2801 non-null float64
Video res

Thay thế các giá trị thiếu đang được ghi dưới dạng chuỗi ('nan cards', 'version nan') bằng 'No info' để xử lý ở bước chuẩn hóa phía sau

In [3]:
# String tokens that stand for a missing value in the raw data; they are all
# mapped to the single placeholder 'No info' in one pass.
missing_tokens = ['nan cards', 'version nan']
data_df = data_df.replace(missing_tokens, 'No info')

## Tách dữ liệu thành 2 phần: đặc trưng (X) và nhãn (y)

In [4]:
# Separate the feature columns (X) from the label column (y).
target_col = "Price range"
X_df = data_df.drop(columns=[target_col])
y_df = data_df[target_col]

Tỉ lệ tập test là 20% của toàn bộ dataset

In [5]:
# Hold out 20% of the rows as the test set; the split is stratified on the
# label and seeded so that it is repeatable.
(train_and_validation_X_df,
 test_X,
 train_and_validation_Y_df,
 test_Y) = train_test_split(
    X_df,
    y_df,
    test_size=0.2,
    stratify=y_df,
    random_state=0,
)

Tỉ lệ tập validation là 20% toàn bộ dataset tương ứng 25% train_and_validation_X_df


In [6]:
# Split the remaining rows into train and validation sets. Taking 25% of the
# train+validation portion corresponds to 20% of the full dataset.
(train_X_df,
 validation_X,
 train_Y_df,
 validation_Y) = train_test_split(
    train_and_validation_X_df,
    train_and_validation_Y_df,
    test_size=0.25,
    stratify=train_and_validation_Y_df,
    random_state=0,
)

## Chuẩn hóa bộ dữ liệu

In [7]:
# Columns handled by the numeric branch of the preprocessor.
num_cols = [
    'Width', 'Height', 'Thick', 'Weight',
    'Display size', 'Display resolution',
    'Image resolution', 'Video resolution', 'FPS',
    'Battery power', 'RAM', 'Storage',
]

# Columns handled by the categorical (impute + one-hot) branch.
cat_cols = [
    'Memory card', 'OS', 'SoC', 'GPU core', 'CPU core',
    'SIM type', 'Number of SIM cards',
    'USB type', 'USB version',
    'Display color depth', 'Battery type',
]

- Đối với các cột dạng numeric, ta sử dụng mean của cột để điền vào những giá trị còn thiếu
- Đối với các cột dạng categorical, ta sử dụng giá trị xuất hiện nhiều nhất trong cột để điền vào những giá trị còn thiếu

In [8]:
# Numeric branch: fill NaN entries with the column mean.
numeric_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
numeric_transformer = Pipeline(steps=[('imputer', numeric_imputer)])

# Categorical branch: the 'No info' placeholder is treated as missing and
# filled with the most frequent value, then the column is one-hot encoded.
# Categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(missing_values='No info', strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each group of columns through its own branch.
column_branches = [
    ('num', numeric_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# Scale the combined feature matrix without centering it.
# NOTE(review): with_mean=False is presumably needed because the one-hot
# output is sparse; confirm.
preprocess_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
])

## Thử nghiệm với các thuật toán khác nhau

### MLP Classifier

In [9]:
# MLP Classifier: sweep the L2 regularisation strength (alpha) and record the
# train / validation accuracy (in percent) for every value.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    # hidden_layer_sizes is meant to be a tuple: `(20)` is just the int 20,
    # whereas `(20,)` spells out "one hidden layer with 20 units".
    ('NeuralNet', MLPClassifier(hidden_layer_sizes=(20,), activation='tanh',
                                solver='lbfgs', random_state=0, max_iter=1000)),
])

train_accuracy = []
validation_accuracy = []
alphas = [0.1, 0.5, 1, 10, 100, 1000]

for alpha in alphas:
    # Only the classifier's alpha changes between iterations; the whole
    # pipeline (preprocessing included) is refitted each time.
    full_pipeline.set_params(NeuralNet__alpha=alpha)
    full_pipeline.fit(train_X_df, train_Y_df)
    train_accuracy.append(full_pipeline.score(train_X_df, train_Y_df) * 100)
    validation_accuracy.append(full_pipeline.score(validation_X, validation_Y) * 100)

# Accuracies are listed in the same order as `alphas`.
print(train_accuracy)
print(validation_accuracy)

[93.3570581257414, 97.80545670225386, 98.75444839857651, 93.95017793594306, 86.1803084223013, 82.62158956109134]
[85.05338078291815, 84.51957295373666, 83.98576512455516, 85.40925266903915, 85.76512455516014, 82.56227758007117]


### Logistic Regression

In [10]:
# Logistic Regression (multinomial, 'sag' solver).
# random_state is pinned because the 'sag' solver shuffles the data while
# fitting; without a seed the reported accuracies can differ between runs,
# unlike the other seeded models in this notebook.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# it is kept here to match the version this notebook was written for.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('LogReg', LogisticRegression(multi_class='multinomial', solver='sag',
                                  max_iter=10000, random_state=0)),
])

full_pipeline.fit(train_X_df, train_Y_df)
print('Train accuracy: ', full_pipeline.score(train_X_df, train_Y_df) * 100)
print('Validation accuracy: ', full_pipeline.score(validation_X, validation_Y) * 100)

Train accuracy:  88.67141162514828
Validation accuracy:  85.76512455516014


### Decision Tree

In [11]:
# Decision Tree
tree_clf = DecisionTreeClassifier(random_state=101)
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('Decision_Tree', tree_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Accuracy in percent on the train and validation splits.
train_acc = full_pipeline.score(train_X_df, train_Y_df) * 100
val_acc = full_pipeline.score(validation_X, validation_Y) * 100
print('Train accuracy: ', train_acc)
print('Validation accuracy: ', val_acc)

Train accuracy:  100.0
Validation accuracy:  79.1814946619217


### Random Forest

In [12]:
# Random Forest (100 trees, entropy criterion, out-of-bag scoring enabled)
forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=101,
    criterion='entropy',
    oob_score=True,
)
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('Random_Forest', forest_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Accuracy in percent on the train and validation splits.
train_acc = full_pipeline.score(train_X_df, train_Y_df) * 100
val_acc = full_pipeline.score(validation_X, validation_Y) * 100
print('Train accuracy: ', train_acc)
print('Validation accuracy: ', val_acc)

Train accuracy:  100.0
Validation accuracy:  85.23131672597864


### KNN

In [13]:
# K-Nearest Neighbours with k = 3
knn_clf = KNeighborsClassifier(n_neighbors=3)
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('KNN', knn_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Accuracy in percent on the train and validation splits.
train_acc = full_pipeline.score(train_X_df, train_Y_df) * 100
val_acc = full_pipeline.score(validation_X, validation_Y) * 100
print('Train accuracy: ', train_acc)
print('Validation accuracy: ', val_acc)

Train accuracy:  90.0355871886121
Validation accuracy:  83.98576512455516


## Đánh giá lựa chọn thuật toán

Ta thấy rằng thuật toán Random Forest có độ chính xác ổn nhất trong các thuật toán ở trên khi xét ở cả 2 tập train và validation (tập test chưa được sử dụng ở bước này).

Do đó, ta sẽ thử nghiệm với các tham số khác nhau của thuật toán Random Forest

In [None]:

# Hyper-parameter sweep for the Random Forest: every combination of split
# criterion, max_features and number of trees is fitted on the train split
# and scored on the validation split.
n_es = list(range(100, 1050, 50))  # 100, 150, ..., 1000 trees
criterions = ['entropy', 'gini']
# 'sqrt' is what the legacy 'auto' alias meant for classifiers; 'auto' has
# been deprecated and removed in newer scikit-learn releases.
max_f = ['sqrt', 'log2', None]

# One record per configuration so the sweep can be compared afterwards
# instead of being read back from the printed log.
rf_results = []

for criterion in criterions:
    print(criterion)
    for f in max_f:
        print(f)
        for n in n_es:
            print(n)
            # A single seed passed directly to the constructor; the original
            # set random_state=40 and then immediately overrode it with 54.
            full_pipeline = Pipeline(steps=[
                ('preprocess_pipeline', preprocess_pipeline),
                ('Random_Forest', RandomForestClassifier(
                    n_estimators=n, criterion=criterion, max_features=f,
                    oob_score=True, random_state=54)),
            ])
            full_pipeline.fit(train_X_df, train_Y_df)

            train_acc = full_pipeline.score(train_X_df, train_Y_df) * 100
            val_acc = full_pipeline.score(validation_X, validation_Y) * 100
            print('Train accuracy: ', train_acc)
            # This score is computed on the validation split, so it is
            # labelled as such (it was previously printed as "Test accuracy").
            print('Validation accuracy: ', val_acc)

            rf_results.append({
                'criterion': criterion,
                'max_features': f,
                'n_estimators': n,
                'train_accuracy': train_acc,
                'validation_accuracy': val_acc,
            })

# Tabular view of the sweep, one row per configuration.
rf_results_df = pd.DataFrame(rf_results)


entropy
auto
100
Train accuracy:  100.0
Test accuracy:  85.76512455516014
150
Train accuracy:  100.0
Test accuracy:  85.40925266903915
200
Train accuracy:  100.0
Test accuracy:  85.94306049822063
250
Train accuracy:  100.0
Test accuracy:  85.76512455516014
300
Train accuracy:  100.0
Test accuracy:  85.94306049822063
350
Train accuracy:  100.0
Test accuracy:  85.76512455516014
400
Train accuracy:  100.0
Test accuracy:  85.58718861209964
450
Train accuracy:  100.0
Test accuracy:  85.76512455516014
500
Train accuracy:  100.0
Test accuracy:  85.76512455516014
550
Train accuracy:  100.0
Test accuracy:  85.76512455516014
600
Train accuracy:  100.0
Test accuracy:  85.76512455516014
650
Train accuracy:  100.0
Test accuracy:  86.12099644128114
700
Train accuracy:  100.0
Test accuracy:  86.12099644128114
750
Train accuracy:  100.0
Test accuracy:  86.12099644128114
800
Train accuracy:  100.0
Test accuracy:  86.12099644128114
850
Train accuracy:  100.0
Test accuracy:  85.76512455516014
900
Train a