In [98]:
import pandas as pd
from mlxtend.frequent_patterns import fpgrowth, association_rules
from sklearn.feature_extraction.text import TfidfVectorizer
from mlxtend.preprocessing import TransactionEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
print(stop_words)

{'haven', 'during', 'with', 'other', 't', 'under', 'her', 'until', 'doing', "she's", 'above', 's', 'mightn', 'ours', 'now', 'is', 'having', 'do', "mightn't", "wouldn't", 'below', 'as', 'hadn', 'while', 'its', "needn't", 'which', 'have', 'before', 'won', 'between', 'those', 'more', 'didn', 'myself', 'needn', 'just', 'any', 'very', 'again', 'because', 'after', 'we', 'for', 'herself', "that'll", 'too', 'mustn', 'what', "shan't", 'they', "didn't", "you've", 'shouldn', 'he', 'about', "aren't", 'himself', 'it', 'd', 'his', 'against', "haven't", 'them', 'i', 'if', 'then', 'was', 'itself', 'me', 'weren', 'the', 'were', "you're", 'not', 'where', 'same', 'yours', "should've", 'whom', 'once', "doesn't", "it's", 'these', "won't", 'most', 'both', 'are', 'yourself', 'all', 'been', 'here', 'than', 'themselves', 'an', 'no', 've', 'who', 'to', "weren't", 'that', "hasn't", 'yourselves', 'does', "wasn't", 'from', 'can', 'ourselves', 'out', 'don', 'each', 'you', 'did', 'up', "don't", 're', "isn't", 'o', '

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [99]:
# Đọc dữ liệu từ URL
url = 'https://raw.githubusercontent.com/mohitgupta-1O1/Kaggle-SMS-Spam-Collection-Dataset-/master/spam.csv'
df = pd.read_csv(url, encoding='latin1')

df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [100]:
# Chỉ giữ lại cột tin nhắn
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

def clean_message(message):
     # Chuyển đổi tin nhắn thành chữ thường
    message = message.lower()
    # Loại bỏ các dấu không phải chữ cái
    message = re.sub(r'[^A-Za-z\s]', '', message)
    # Loại bỏ khoảng trắng dư thừa
    message = re.sub(r'\s+', ' ', message).strip()
    return message

df['message'] = df['message'].apply(clean_message)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = df['message'].apply(clean_message)


Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [101]:
df = df.drop_duplicates(keep = 'first')

df

Unnamed: 0,label,message
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i dont think he goes to usf he lives aroun...
...,...,...
5567,spam,this is the nd time we have tried contact u u ...
5568,ham,will b going to esplanade fr home
5569,ham,pity was in mood for that soany other suggestions
5570,ham,the guy did some bitching but i acted like id ...


In [102]:
#Loại bỏ stop words
def remove_stopwords(message):
    words = message.split()
    return ' '.join([word for word in words if word.lower() not in stop_words])

df['message'] = df['message'].apply(remove_stopwords)

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['message'] = df['message'].apply(remove_stopwords)


Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though
...,...,...
5567,spam,nd time tried contact u u pound prize claim ea...
5568,ham,b going esplanade fr home
5569,ham,pity mood soany suggestions
5570,ham,guy bitching acted like id interested buying s...


In [103]:
values = df['label'].value_counts()
total = values.sum()

percentage_0 = (values[0] /total) * 100
percentage_1 = (values[1]/ total) *100

print('percentage of ham :' ,percentage_0)
print('percentage of spam:' ,percentage_1)

percentage of ham : 88.33235120801413
percentage of spam: 11.667648791985858


  percentage_0 = (values[0] /total) * 100
  percentage_1 = (values[1]/ total) *100


In [104]:
# Tách dữ liệu thành tập huấn luyện và tập kiểm tra
train_df, test_df = train_test_split(df, test_size=0.3, random_state=35)

# Sử dụng % dữ liệu huấn luyện
train_df_sampled = train_df.sample(frac=0.1, random_state=3)
# train_df_sampled = train_df

# Tách riêng các tin nhắn spam và ham trong tập huấn luyện mẫu
spam_transactions_sampled = train_df_sampled[train_df_sampled['label'] == 'spam']['message'].str.split().tolist()
ham_transactions_sampled = train_df_sampled[train_df_sampled['label'] == 'ham']['message'].str.split().tolist()

# spam_transactions_sampled
# ham_transactions_sampled

In [105]:
#Mã hóa các giao dịch spam
# te_spam = TransactionEncoder()
# te_ary_spam = te_spam.fit(spam_transactions_sampled).transform(spam_transactions_sampled)
# df_te_spam = pd.DataFrame(te_ary_spam, columns=te_spam.columns_)

# print(df_te_spam)

unique_items = sorted(set(item for transaction in spam_transactions_sampled for item in transaction))
binary_matrix = [[1 if item in transaction else 0 for item in unique_items] for transaction in spam_transactions_sampled]
df_te_spam = pd.DataFrame(binary_matrix, columns=unique_items)

# Áp dụng FP-Growth cho spam
frequent_itemsets_spam = fpgrowth(df_te_spam, min_support=0.05, use_colnames=True)

frequent_itemsets_spam



Unnamed: 0,support,itemsets
0,0.357143,(call)
1,0.333333,(free)
2,0.261905,(txt)
3,0.166667,(stop)
4,0.142857,(win)
5,0.142857,(text)
6,0.095238,(entry)
7,0.071429,(weekly)
8,0.119048,(phone)
9,0.166667,(ur)


In [109]:
# Mã hóa các giao dịch ham
# te_ham = TransactionEncoder()
# te_ary_ham = te_ham.fit(ham_transactions_sampled).transform(ham_transactions_sampled)
# df_te_ham = pd.DataFrame(te_ary_ham, columns=te_ham.columns_)

unique_items = sorted(set(item for transaction in ham_transactions_sampled for item in transaction))
binary_matrix = [[1 if item in transaction else 0 for item in unique_items] for transaction in ham_transactions_sampled]
df_te_ham = pd.DataFrame(binary_matrix, columns=unique_items)

# Áp dụng FP-Growth cho ham
frequent_itemsets_ham = fpgrowth(df_te_ham, min_support=0.02, use_colnames=True)

frequent_itemsets_ham



Unnamed: 0,support,itemsets
0,0.044586,(ltgt)
1,0.035032,(think)
2,0.057325,(love)
3,0.044586,(come)
4,0.035032,(night)
5,0.025478,(wait)
6,0.022293,(hi)
7,0.022293,(hope)
8,0.022293,(see)
9,0.022293,(want)


In [110]:
# Tạo tập luật kết hợp cho spam với min_threshold cao hơn
rules_spam = association_rules(frequent_itemsets_spam, metric="confidence", min_threshold=0.05)
# Tạo tập luật kết hợp cho ham với min_threshold cao hơn
rules_ham = association_rules(frequent_itemsets_ham, metric="confidence", min_threshold=0.05)

# print(rules_ham)
# print(rules_spam)

# Hàm phân loại tin nhắn
def classify_message(message, rules_spam, rules_ham):
    message_set = set(message.split())

    spam_score = 0
    ham_score = 0

    for _, rule in rules_spam.iterrows():
        if set(rule['antecedents']).issubset(message_set):
            spam_score += rule['confidence']

    for _, rule in rules_ham.iterrows():
        if set(rule['antecedents']).issubset(message_set):
            ham_score += rule['confidence']

    if spam_score > ham_score:
        return 'spam'
    else:
        return 'ham'

# Phân loại các tin nhắn trong tập kiểm tra
# test_df['message'] = test_df['message'].apply(remove_stopwords)
test_df['predicted'] = test_df['message'].apply(lambda x: classify_message(x, rules_spam, rules_ham))

# Tính độ chính xác
accuracy = accuracy_score(test_df['label'], test_df['predicted'])
print(f'Accuracy: {accuracy}')

print(classification_report(test_df['label'], test_df['predicted']))

Accuracy: 0.8736910994764397
              precision    recall  f1-score   support

         ham       0.97      0.88      0.92      1344
        spam       0.49      0.83      0.61       184

    accuracy                           0.87      1528
   macro avg       0.73      0.86      0.77      1528
weighted avg       0.92      0.87      0.89      1528



In [108]:
message = clean_message("Natalja (25/F) is inviting you to be her friend. Reply YES-440 or NO-440 See her: www.SMS.ac/u/nat27081980 STOP? Send STOP FRND to 62468")
message = remove_stopwords(message)
result = classify_message(message, rules_spam, rules_ham)
print(result)

spam
