# Ví dụ B.3 (Bài tập 1)

**a, Phân loại bằng phương pháp hồi quy Logistic**

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.metrics import accuracy_score, mean_squared_error, recall_score, precision_score

In [3]:
file_path = '../../data/Prac3/Admission_Predict.csv'
data = pd.read_csv(file_path)
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [4]:
train_data = data.iloc[:350]
test_data = data.iloc[350:]

In [5]:
def preprocess(data):
    X = data.drop(columns=["Serial No.", "Chance of Admit"]).values
    y = (data["Chance of Admit"]).values  
    X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)  
    Xbar = np.concatenate((np.ones((X.shape[0], 1)), X), axis=1).T  
    return Xbar, y

In [6]:
X_train, y_train = preprocess(train_data)
X_test, y_test = preprocess(test_data)

In [7]:
def sigmoid(s):
    return 1/(1 + np.exp(-s))

def logistic_sigmoid_regression(X, y, w_init, eta, tol = 1e-4, max_count = 10000):
    w = [w_init]
    it = 0
    N = X.shape[1]
    d = X.shape[0]
    count = 0
    check_w_after = 20
    # loop of stochastic gradient descent
    while count < max_count:
        # shuffle the order of data (for stochastic gradient descent).
        # and put into mix_id
        mix_id = np.random.permutation(N)
        for i in mix_id:
            xi = X[:, i].reshape(d, 1)
            yi = y[i]
            zi = sigmoid(np.dot(w[-1].T, xi))
            w_new = w[-1] + eta*(yi - zi)*xi
            count += 1
            # stopping criteria
            if count%check_w_after == 0:
                if np.linalg.norm(w_new - w[-check_w_after]) < tol:
                    return w
            w.append(w_new)
    return w

In [8]:
X_train.shape

(8, 350)

In [9]:
y_train.shape

(350,)

In [10]:
eta = 0.05
d = X_train.shape[0]
w_init = np.random.randn(d, 1) 

In [11]:
w_final = logistic_sigmoid_regression(X_train, y_train, w_init, eta)

print(w_final[-1])

[[ 1.06746067]
 [ 0.09462792]
 [ 0.10450343]
 [ 0.06238173]
 [-0.02633136]
 [ 0.11683711]
 [ 0.34401317]
 [ 0.04964333]]


In [12]:
def predict(weights, X_test):
    y_prob = sigmoid(np.dot(weights.T, X_test))  
    return (y_prob >= 0.75).astype(int)  


In [13]:
y_pred = predict(w_final[-1], X_test)
y_pred

array([[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
        1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0,
        1, 1, 1, 1, 1, 1]])

In [14]:
y_pred_n = pd.Series(y_pred[0])

In [15]:
y_pred_n

0     0
1     1
2     0
3     0
4     0
5     0
6     1
7     0
8     0
9     0
10    1
11    1
12    1
13    0
14    1
15    1
16    1
17    0
18    0
19    0
20    0
21    1
22    1
23    1
24    0
25    0
26    0
27    0
28    0
29    0
30    1
31    1
32    1
33    0
34    1
35    1
36    0
37    0
38    0
39    1
40    0
41    0
42    1
43    0
44    1
45    1
46    1
47    1
48    1
49    1
dtype: int64

In [16]:
y_test_n = pd.Series(y_test).apply(lambda x: 1 if x >= 0.75 else 0)

In [17]:
y_test_n

0     0
1     0
2     0
3     0
4     0
5     0
6     1
7     0
8     0
9     1
10    1
11    1
12    1
13    0
14    1
15    1
16    0
17    0
18    0
19    0
20    0
21    1
22    1
23    1
24    0
25    0
26    0
27    0
28    0
29    0
30    1
31    0
32    1
33    0
34    1
35    1
36    0
37    0
38    0
39    1
40    0
41    0
42    1
43    1
44    1
45    1
46    1
47    1
48    0
49    1
dtype: int64

In [18]:
y_test

array([0.74, 0.73, 0.64, 0.63, 0.59, 0.73, 0.79, 0.68, 0.7 , 0.81, 0.85,
       0.93, 0.91, 0.69, 0.77, 0.86, 0.74, 0.57, 0.51, 0.67, 0.72, 0.89,
       0.95, 0.79, 0.39, 0.38, 0.34, 0.47, 0.56, 0.71, 0.78, 0.73, 0.82,
       0.62, 0.96, 0.96, 0.46, 0.53, 0.49, 0.76, 0.64, 0.71, 0.84, 0.77,
       0.89, 0.82, 0.84, 0.91, 0.67, 0.95])

In [19]:
print(accuracy_score(y_test_n, y_pred_n))

0.88


### Using sklearn

In [20]:
data.head()

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [21]:
X_lib = data.drop(["Serial No.", "Chance of Admit"], axis='columns')
y_lib = data["Chance of Admit"]

In [22]:
y_lib = y_lib.apply(lambda x: 1 if x >= 0.75 else 0)

In [23]:
X_train_lib = X_lib[:350]
y_train_lib = y_lib[:350]

X_test_lib = X_lib[350:]
y_test_lib = y_lib[350:]

In [24]:
logR = LogisticRegression()
logR.fit(X_train_lib, y_train_lib)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
y_pred_logR = logR.predict(X_test_lib)
y_pred_logR

array([0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1])

In [26]:
print(accuracy_score(y_test_lib, y_pred_logR))

0.88


**b, Dự đoán khả năng bằng hồi quy tuyến tính**

In [31]:
from sklearn.linear_model import LinearRegression

linR = LinearRegression()
linR.fit(X_train_lib, y_train_lib)

In [28]:
y_pred_linR = linR.predict(X_test_lib)
y_pred_linR

array([ 0.40210911,  0.65392761,  0.17399705,  0.1024467 , -0.14750441,
        0.11678085,  0.62332641,  0.16766756, -0.07088742,  0.15225096,
        0.62988256,  0.9681033 ,  1.02351026,  0.11240566,  0.54849969,
        0.83547789,  0.45991561, -0.27455378, -0.24474133,  0.03259461,
        0.07763005,  0.70278745,  1.05595776,  0.483483  , -0.06240163,
       -0.13448527, -0.26447662, -0.31843511, -0.22201192,  0.20471689,
        0.58380324,  0.4983564 ,  0.8125356 ,  0.13168636,  1.11227099,
        1.20218854, -0.00793355,  0.06360949, -0.19492968,  0.49885134,
        0.06924584,  0.30258033,  0.79245315,  0.2885419 ,  0.86176859,
        0.67296188,  0.65604642,  0.99855837,  0.37797732,  1.05614619])

In [32]:
linR.score(X_test_lib, y_test_lib)

0.6108155157015693

In [None]:
print(mean_squared_error(y_test_lib, y_pred_linR))

0.09589505693113334


**c, Sử dụng Naive Bayes**

In [None]:
model_nb = GaussianNB()
model_nb.fit(X_train_lib, y_train_lib)

In [None]:
y_pred_nb = model_nb.predict(X_test_lib)


In [None]:
print("Accuracy", accuracy_score(y_test_lib, y_pred_nb))
print("Recall", recall_score(y_test_lib, y_pred_nb))
print("Precision", precision_score(y_test_lib, y_pred_nb))

Accuracy 0.9
Recall 0.8181818181818182
Precision 0.9473684210526315
