In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.neural_network import MLPClassifier

In [2]:
import os

# Location of the tab-separated dataset. Set the MOBILE_DATASET_PATH environment
# variable to run the notebook on another machine; when it is unset, the original
# hard-coded location is used so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'MOBILE_DATASET_PATH',
    'C:/Users/DELL/Desktop/HK1_2019-2020/DataScience/FinalProject/DS_Final_Project/data/mobile_dataset.csv',
)

data_df = pd.read_csv(DATA_PATH, sep='\t')
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2811 entries, 0 to 2810
Data columns (total 27 columns):
Bluetooth              2811 non-null int64
Width                  2781 non-null float64
Height                 2781 non-null float64
Thick                  2781 non-null float64
Weight                 2646 non-null float64
Memory card            2811 non-null object
OS                     2811 non-null object
SoC                    2811 non-null object
Wi-Fi                  2811 non-null int64
GPU core               2811 non-null object
CPU core               2811 non-null object
SIM type               2811 non-null object
Number of SIM cards    2811 non-null object
USB type               2811 non-null object
USB version            2811 non-null object
Position tracking      2811 non-null int64
Display size           2811 non-null float64
Display resolution     2811 non-null float64
Display color depth    2811 non-null object
Image resolution       2801 non-null float64
Video res

In [3]:
# Map both placeholder strings onto the single 'No info' marker in one pass.
data_df = data_df.replace(['nan cards', 'version nan'], 'No info')

In [4]:
# Separate the feature columns (X) from the target column (y, "Price range").
X_df = data_df.drop(columns=["Price range"])
y_df = data_df["Price range"]

In [5]:
# Hold out 10% of the rows as the test set, stratified on the target.
train_X, test_X, train_Y, test_Y = train_test_split(
    X_df,
    y_df,
    test_size=0.1,
    stratify=y_df,
    random_state=0,
)

In [6]:
# Split the remaining training rows again: 5% becomes the validation set,
# stratified on the target; the rest is what the models are fitted on.
train_X_df, validation_X_df, train_Y_df, validation_Y_df = train_test_split(
    train_X,
    train_Y,
    test_size=0.05,
    stratify=train_Y,
    random_state=0,
)

In [7]:
# Columns routed to the numeric branch of the preprocessing.
num_cols = [
    'Width',
    'Height',
    'Thick',
    'Weight',
    'Display size',
    'Display resolution',
    'Image resolution',
    'Video resolution',
    'FPS',
    'Battery power',
    'RAM',
    'Storage',
]

# Columns routed to the categorical branch of the preprocessing.
cat_cols = [
    'Memory card',
    'OS',
    'SoC',
    'GPU core',
    'CPU core',
    'SIM type',
    'Number of SIM cards',
    'USB type',
    'USB version',
    'Display color depth',
    'Battery type',
]

In [8]:
# Numeric branch: fill NaN entries with the column mean.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
])

# Categorical branch: treat the 'No info' placeholder as missing, fill it with the
# most frequent category, then one-hot encode. handle_unknown='ignore' keeps
# categories that were not seen during fit from raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(missing_values='No info', strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# remainder='drop' is ColumnTransformer's default, spelled out here for visibility:
# any column that appears in neither num_cols nor cat_cols is discarded.
# NOTE(review): data_df.info() lists int64 columns such as 'Bluetooth', 'Wi-Fi' and
# 'Position tracking' that are in neither list — confirm dropping them is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    remainder='drop',
)

# Scale every output column to unit variance without centering (with_mean=False);
# presumably centering is disabled because the one-hot output is sparse.
preprocess_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),
])

In [9]:
# MLP with one hidden layer of 20 tanh units on top of the shared preprocessing.
# hidden_layer_sizes is given as the 1-tuple (20,): a tuple needs the trailing
# comma, whereas `(20)` is simply the int 20.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('NeuralNet', MLPClassifier(hidden_layer_sizes=(20,), activation='tanh',
                                solver='lbfgs', random_state=0, max_iter=500)),
])

# Accuracy (in percent) per alpha, in the same order as `alphas`.
train_accuracy = []
val_accuracy = []
test_accuracy = []
# Values of MLPClassifier's `alpha` (regularisation strength) to sweep.
alphas = [0.1, 0.5, 1, 10, 100, 1000]

# Refit the whole pipeline for each alpha and record accuracy on every split.
# NOTE(review): the test split is scored for every candidate alpha here; choose
# alpha from val_accuracy only, so the test numbers remain an unbiased estimate.
for alpha in alphas:
    full_pipeline.set_params(NeuralNet__alpha=alpha)
    full_pipeline.fit(train_X_df, train_Y_df)
    train_accuracy.append(full_pipeline.score(train_X_df, train_Y_df) * 100)
    val_accuracy.append(full_pipeline.score(validation_X_df, validation_Y_df) * 100)
    test_accuracy.append(full_pipeline.score(test_X, test_Y) * 100)

print(train_accuracy)
print(val_accuracy)
print(test_accuracy)

[94.00499583680266, 94.67110741049126, 94.96253122398002, 92.5895087427144, 86.76103247293922, 82.59783513738552]
[81.10236220472441, 81.10236220472441, 78.74015748031496, 82.67716535433071, 83.46456692913385, 82.67716535433071]
[86.87943262411348, 86.52482269503547, 84.75177304964538, 86.87943262411348, 89.00709219858156, 82.62411347517731]


In [10]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# The 'sag' solver is stochastic, so it is seeded (random_state=0) like the other
# estimators in this notebook; without a seed the accuracies can differ from run
# to run. Accuracies recorded from unseeded runs may therefore differ slightly.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases (1.5+);
# it is kept for compatibility with the version this notebook was written for.
full_pipeline = Pipeline(steps=[
    ('preprocess_pipeline', preprocess_pipeline),
    ('LogReg', LogisticRegression(multi_class='multinomial', solver='sag',
                                  max_iter=10000, random_state=0)),
])

# Fit on the training split, then report accuracy (in percent) on each split.
full_pipeline.fit(train_X_df, train_Y_df)
print('Train accuracy: ', full_pipeline.score(train_X_df, train_Y_df) * 100)
print('Validation accuracy: ', full_pipeline.score(validation_X_df, validation_Y_df) * 100)
print('Test accuracy: ', full_pipeline.score(test_X, test_Y) * 100)

Train accuracy:  89.17568692756038
Validation accuracy:  83.46456692913385
Test accuracy:  87.2340425531915


In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Seeded decision tree stacked on the shared preprocessing pipeline.
tree_clf = DecisionTreeClassifier(random_state=101)
full_pipeline = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline),
    ('Decision_Tree', tree_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Report accuracy (in percent) on the train, validation and test splits, in that order.
for split_label, split_X, split_y in [
    ('Train accuracy: ', train_X_df, train_Y_df),
    ('Validation accuracy: ', validation_X_df, validation_Y_df),
    ('Test accuracy: ', test_X, test_Y),
]:
    print(split_label, full_pipeline.score(split_X, split_y) * 100)

Train accuracy:  99.91673605328893
Validation accuracy:  80.31496062992126
Test accuracy:  82.26950354609929


In [12]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

# 100 entropy-criterion trees, seeded. oob_score=True is requested on the
# estimator, but this cell does not read the out-of-bag score afterwards.
forest_clf = RandomForestClassifier(
    n_estimators=100,
    random_state=101,
    criterion='entropy',
    oob_score=True,
)
full_pipeline = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline),
    ('Random_Forest', forest_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Report accuracy (in percent) on the train, validation and test splits, in that order.
for split_label, split_X, split_y in [
    ('Train accuracy: ', train_X_df, train_Y_df),
    ('Validation accuracy: ', validation_X_df, validation_Y_df),
    ('Test accuracy: ', test_X, test_Y),
]:
    print(split_label, full_pipeline.score(split_X, split_y) * 100)

Train accuracy:  99.91673605328893
Validation accuracy:  85.03937007874016
Test accuracy:  86.52482269503547


In [13]:
# KNN
from sklearn.neighbors import KNeighborsClassifier

# 3-nearest-neighbours classifier stacked on the shared preprocessing pipeline.
knn_clf = KNeighborsClassifier(n_neighbors=3)
full_pipeline = Pipeline([
    ('preprocess_pipeline', preprocess_pipeline),
    ('KNN', knn_clf),
])
full_pipeline.fit(train_X_df, train_Y_df)

# Report accuracy (in percent) on the train, validation and test splits, in that order.
for split_label, split_X, split_y in [
    ('Train accuracy: ', train_X_df, train_Y_df),
    ('Validation accuracy: ', validation_X_df, validation_Y_df),
    ('Test accuracy: ', test_X, test_Y),
]:
    print(split_label, full_pipeline.score(split_X, split_y) * 100)

Train accuracy:  90.3413821815154
Validation accuracy:  81.10236220472441
Test accuracy:  85.81560283687944
