In [1]:
# place imports here
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD
from keras.utils.np_utils import to_categorical
from keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score, confusion_matrix, classification_report, accuracy_score

Using TensorFlow backend.


In [2]:
dataset = pd.read_csv('../dataset/AppleStoreWithoutNoise.csv')
dataset.describe()

Unnamed: 0,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver
count,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0
mean,205743000.0,1.821977,14803.81,528.607371,4.049697,3.735801
std,352634100.0,6.128238,80984.68,4196.699812,0.726943,1.39904
min,589824.0,0.0,1.0,0.0,1.0,0.0
25%,51719420.0,0.0,78.0,6.0,4.0,3.5
50%,102129200.0,0.0,512.5,37.0,4.5,4.5
75%,188477400.0,2.99,3963.5,176.0,4.5,4.5
max,4025970000.0,299.99,2974676.0,177050.0,5.0,5.0


In [3]:
dataset['size_100kilo_bytes'] = dataset['size_bytes']/102400
dataset['price_100'] = dataset['price'] / 100;

mms = MinMaxScaler()
dataset['price_mms'] = mms.fit_transform(dataset[['price']])
dataset['rating_count_tot_mms'] = mms.fit_transform(dataset[['rating_count_tot']])
dataset['rating_count_ver_mms'] = mms.fit_transform(dataset[['rating_count_ver']])

In [4]:
dataset.describe()

Unnamed: 0,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,user_rating_ver,size_100kilo_bytes,price_100,price_mms,rating_count_tot_mms,rating_count_ver_mms
count,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0,6268.0
mean,205743000.0,1.821977,14803.81,528.607371,4.049697,3.735801,2009.209099,0.01822,0.006073,0.004976,0.002986
std,352634100.0,6.128238,80984.68,4196.699812,0.726943,1.39904,3443.69286,0.061282,0.020428,0.027225,0.023703
min,589824.0,0.0,1.0,0.0,1.0,0.0,5.76,0.0,0.0,0.0,0.0
25%,51719420.0,0.0,78.0,6.0,4.0,3.5,505.0725,0.0,0.0,2.6e-05,3.4e-05
50%,102129200.0,0.0,512.5,37.0,4.5,4.5,997.355,0.0,0.0,0.000172,0.000209
75%,188477400.0,2.99,3963.5,176.0,4.5,4.5,1840.6,0.0299,0.009967,0.001332,0.000994
max,4025970000.0,299.99,2974676.0,177050.0,5.0,5.0,39316.11,2.9999,1.0,1.0,1.0


In [5]:
X_npy = dataset[['price_mms', 'rating_count_tot_mms', 'rating_count_ver_mms', 'size_100kilo_bytes']]
X_npy.describe()

Unnamed: 0,price_mms,rating_count_tot_mms,rating_count_ver_mms,size_100kilo_bytes
count,6268.0,6268.0,6268.0,6268.0
mean,0.006073,0.004976,0.002986,2009.209099
std,0.020428,0.027225,0.023703,3443.69286
min,0.0,0.0,0.0,5.76
25%,0.0,2.6e-05,3.4e-05,505.0725
50%,0.0,0.000172,0.000209,997.355
75%,0.009967,0.001332,0.000994,1840.6
max,1.0,1.0,1.0,39316.11


In [6]:
y_npy = dataset[['user_rating']]
y_npy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6268 entries, 0 to 6267
Data columns (total 1 columns):
user_rating    6268 non-null float64
dtypes: float64(1)
memory usage: 49.0 KB


In [7]:
pd.options.mode.chained_assignment = None
y_npy['happy'] = np.where(y_npy['user_rating'] >= 4.5, 1, 0)
y_npy['unhappy'] = np.where(y_npy['user_rating'] < 4.5, 1, 0)
y_npy.head()

Unnamed: 0,user_rating,happy,unhappy
0,4.0,0,1
1,4.0,0,1
2,3.5,0,1
3,4.0,0,1
4,4.5,1,0


In [8]:
X = X_npy.values
y = y_npy[['happy', 'unhappy']].values

In [9]:
y.shape

(6268, 2)

In [15]:
def model_builder_sigmoid():
    model = Sequential()
    model.add(Dense(2, input_shape=(4,), activation='sigmoid'))
    model.compile(SGD(lr=0.2), 
                  'binary_crossentropy', 
                  metrics=['accuracy'])
    return model

In [16]:
model = KerasClassifier(build_fn=model_builder_sigmoid, epochs=50, verbose=0)

In [17]:
cv = KFold(3, shuffle=True)
scores = cross_val_score(model, X, y, cv=cv)
print("Cross validation accuracy is {:0.4f} +- {:0.4f}".format(scores.mean(), scores.std()))

Cross validation accuracy is 0.4979 +- 0.0029


In [12]:
X.shape

(6268, 4)

In [13]:
y.shape

(6268, 1)

In [14]:
model = Sequential()
model.add(Dense(1, input_dim=4, activation='sigmoid'))
model.compile(Adam(lr=0.5), 'binary_crossentropy', metrics=['accuracy'])

In [15]:
model.fit(X_train, y_train, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1d8004a0e48>

In [58]:
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print("The Accuracy score on the Train set is:\t{:0.3f}".format(accuracy_score(y_train, y_train_pred.round())))
print("The Accuracy score on the Test set is:\t{:0.3f}".format(accuracy_score(y_test, y_test_pred.round())))

The Accuracy score on the Train set is:	0.925
The Accuracy score on the Test set is:	0.906
