In [1]:
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd

In [2]:
def normalize(X):
    _min = np.min(X, axis=0)
    _max = np.max(X, axis=0)
    _range = _max - _min
    norm_X = 1 - (_max - X)/_range
    return norm_X

In [3]:
df = pd.read_csv('mobile_price/train.csv')
df.shape

(2000, 21)

In [4]:
df.columns

Index(['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g',
       'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g',
       'touch_screen', 'wifi', 'price_range'],
      dtype='object')

In [5]:
logisticReg = LogisticRegression(tol=0.05, max_iter=10000, multi_class='multinomial')

In [6]:
X = df[df.columns[:11]].values

In [7]:
X = normalize(X)
X

array([[0.22778891, 0.        , 0.68      , ..., 0.9       , 0.14285714,
        0.1       ],
       [0.34736139, 1.        , 0.        , ..., 0.46666667, 0.28571429,
        0.3       ],
       [0.04141617, 1.        , 0.        , ..., 0.54166667, 0.57142857,
        0.3       ],
       ...,
       [0.94188377, 0.        , 0.16      , ..., 0.23333333, 1.        ,
        0.15      ],
       [0.6753507 , 0.        , 0.16      , ..., 0.54166667, 0.57142857,
        0.25      ],
       [0.00601202, 1.        , 0.6       , ..., 0.73333333, 0.71428571,
        0.8       ]])

In [8]:
Y = df[df.columns[-1]].values

In [9]:
np.random.seed(1)
p = np.random.permutation(len(X))

In [10]:
x_train = X[p[:int(len(X)*0.65)]].copy()
y_train = Y[p[:int(len(X)*0.65)]].copy()

In [11]:
x_test = X[p[int(len(X)*0.65):]].copy()
y_test = Y[p[int(len(X)*0.65):]].copy()

In [12]:
logisticReg.fit(x_train, y_train)

LogisticRegression(max_iter=10000, multi_class='multinomial', tol=0.05)

In [13]:
y_predict = logisticReg.predict(x_test)

In [14]:
num_correct = len(y_predict[y_predict!=y_test])
num_total = len(y_predict)
print("True predict = {}/{}".format(num_correct, num_total))
print("Logistic Regression model accuracy: {}%".format(round(100*num_correct/num_total, 2)))

True predict = 512/700
Logistic Regression model accuracy: 73.14%


In [15]:
def f1_statistic(Y_truth, Y_predict):
    Y_set = list(set(Y_truth))
    length = len(Y_set)
#     print(length)
    f1 = np.zeros([length, length])
#     print(f1.size)
    for i in range(len(Y_truth)):
        f1[Y_predict[i], Y_truth[i]] += 1
    return f1

In [16]:
def f1_value(f1):
    score = []
    for i in range(f1.shape[0]):
        pt_at = f1[i,i]                             # predict: true - actual: true
        pt_af = np.sum(f1,axis=1)[i] - pt_at        # predict: true - actual: false
        pf_at = np.sum(f1,axis=0)[i] - pt_at        # predict: false - actual: true
        pf_af = np.sum(f1) - (pt_at+pt_af+pf_at)    # predict: false - actual: false
        
        precision = 100*pt_at/(pt_at+pt_af) # %
        recall = 100*pt_at/(pt_at+pf_at)    # %
        f1_score = 2*precision*recall/(precision+recall)    # %
        score.append([precision, recall, f1_score])
    return np.array(score)

In [17]:
f1_value(f1_statistic(y_test, y_predict))

array([[26.03550296, 23.28042328, 24.58100559],
       [26.53061224, 14.20765027, 18.50533808],
       [20.85561497, 25.16129032, 22.80701754],
       [32.11382114, 45.66473988, 37.70883055]])