# **Bộ dữ liệu**

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix 
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse

In [3]:
data = pd.read_csv('B:\Lập\AI\SMSSpamCollection', sep='\t', names=['label', 'message'])

In [4]:
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [6]:
data['label'] = data.label.map({'ham':0, 'spam':1})
data.head()

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
X_train, X_test, y_train, y_test = train_test_split(data['message'], 
                                                    data['label'], 
                                                    test_size =0.2, 
                                                    random_state=1)


print('Number of rows in the total set: {}'.format(data.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4457
Number of rows in the test set: 1115


In [8]:
count_vector = CountVectorizer()

In [9]:
training_data = count_vector.fit_transform(X_train).toarray()

# Transform testing data and return the matrix. Note we are not fitting the testing data into the CountVectorizer()
vectorizer = feature_extraction.text.CountVectorizer()
testing_data = count_vector.transform(X_test).toarray()

In [10]:
frequency_matrix = pd.DataFrame(training_data, 
                                columns = count_vector.get_feature_names_out())
frequency_matrix.head()

Unnamed: 0,00,000,008704050406,0121,01223585236,01223585334,0125698789,02,0207,02072069400,...,zed,zeros,zhong,zindgi,zoe,zoom,zouk,zyada,èn,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
training_data

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
testing_data



array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# **Dùng thư viện để Training**

In [21]:
clf = LogisticRegression(random_state=0, max_iter=5000, tol=0.5).fit(training_data, y_train)

In [22]:
predictions = clf.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [23]:
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))
print('\nConfusion Matrix :\n', confusion_matrix(y_test, predictions))


Accuracy score:  0.989237668161435
Precision score:  0.9927007299270073
Recall score:  0.9251700680272109
F1 score:  0.9577464788732395

Confusion Matrix :
 [[967   1]
 [ 11 136]]


# **Tự code hàm Training**

In [16]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def initialize_parameters(dim):
    w = np.zeros((dim, 1))
    b = 0
    return w, b

import numpy as np

def precision_recall_f1_score(y_true, y_pred):
    true_positives = np.sum(np.logical_and(y_true == 1, y_pred == 1))
    false_positives = np.sum(np.logical_and(y_true == 0, y_pred == 1))
    false_negatives = np.sum(np.logical_and(y_true == 1, y_pred == 0))

    precision = true_positives / (true_positives + false_positives + 1e-8)
    recall = true_positives / (true_positives + false_negatives + 1e-8)
    f1_score = 2 * (precision * recall) / (precision + recall + 1e-8)

    return precision, recall, f1_score

def propagate(w, b, X, Y):
    m = X.shape[1]
    
    # Forward propagation
    A = sigmoid(np.dot(w.T, X) + b)
    cost = -1/m * np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A))
    
    # Backward propagation
    dw = 1/m * np.dot(X, (A - Y).T)
    db = 1/m * np.sum(A - Y)
    
    grads = {"dw": dw, "db": db}
    
    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate):
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        
        dw = grads["dw"]
        db = grads["db"]
        
        # Gradient descent
        w = w - learning_rate * dw
        b = b - learning_rate * db
        
        if i % 100 == 0:
            print(f"Cost after iteration {i}: {cost}")
    
    params = {"w": w, "b": b}
    return params

def predict(w, b, X):
    m = X.shape[1]
    Y_prediction = np.zeros((1, m))
    
    A = sigmoid(np.dot(w.T, X) + b)
    
    for i in range(m):
        Y_prediction[0, i] = 1 if A[0, i] > 0.5 else 0
    
    return Y_prediction

def model(X_train, Y_train, X_test, Y_test, num_iterations=5000, learning_rate=0.5):
    # Ensure X_train and X_test are transposed
    X_train = X_train.T
    X_test = X_test.T
    
    # Convert Y_train and Y_test to NumPy arrays and reshape
    Y_train = np.array(Y_train).reshape(1, -1)
    Y_test = np.array(Y_test).reshape(1, -1)
    
    # Initialize parameters
    w, b = initialize_parameters(X_train.shape[0])
    
    # Optimize parameters
    parameters = optimize(w, b, X_train, Y_train, num_iterations, learning_rate)
    
    # Retrieve parameters
    w = parameters["w"]
    b = parameters["b"]
    
    # Predictions
    Y_prediction_train = predict(w, b, X_train)
    Y_prediction_test = predict(w, b, X_test)
    
    # Calculate metrics
    accuracy_train = 100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100
    accuracy_test = 100 - np.mean(np.abs(Y_prediction_test - Y_test)) * 100

    precision_train, recall_train, f1_score_train = precision_recall_f1_score(Y_train.flatten(), Y_prediction_train.flatten())
    precision_test, recall_test, f1_score_test = precision_recall_f1_score(Y_test.flatten(), Y_prediction_test.flatten())

    # Print metrics
    print("Train accuracy: {} %".format(accuracy_train))
    print("Train precision: {:.2f}".format(precision_train))
    print("Train recall: {:.2f}".format(recall_train))
    print("Train F1-score: {:.2f}".format(f1_score_train))
    print("------------------------------")
    print("Test accuracy: {} %".format(accuracy_test))
    print("Test precision: {:.2f}".format(precision_test))
    print("Test recall: {:.2f}".format(recall_test))
    print("Test F1-score: {:.2f}".format(f1_score_test))

In [17]:
model(training_data, y_train, testing_data, y_test)

Cost after iteration 0: 0.6931471805599453
Cost after iteration 100: 0.14561400275743547
Cost after iteration 200: 0.10619759962767361
Cost after iteration 300: 0.0888114198310248
Cost after iteration 400: 0.07829782900607364
Cost after iteration 500: 0.0709458784868094
Cost after iteration 600: 0.06536013422843162
Cost after iteration 700: 0.06088484218968892
Cost after iteration 800: 0.057165926773360146
Cost after iteration 900: 0.053993047409559645
Cost after iteration 1000: 0.05123213908036559
Cost after iteration 1100: 0.048792957996956536
Cost after iteration 1200: 0.046612039356860244
Cost after iteration 1300: 0.044643124663440364
Cost after iteration 1400: 0.042851483112837874
Cost after iteration 1500: 0.04121038388643
Cost after iteration 1600: 0.03969881571636561
Cost after iteration 1700: 0.03829996126758499
Cost after iteration 1800: 0.03700014592417828
Cost after iteration 1900: 0.03578809480470008
Cost after iteration 2000: 0.034654395901228596
Cost after iteration 210