# Build model without dataset

## This model is single sentenec classifier

### Import libaries

In [1]:
import re
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Create a dummy dataset

In [2]:
data = pd.DataFrame({
    'text': [
        "This is not spam.",
        "It's a legitimate message.",
        "Check out this amazing offer! You won't believe it.",
        "Hello, how are you? Hope you're doing well.",
        "Win a free iPhone now! Claim your prize!",
        "Congratulations, you've won $1,000,000! Click to claim your winnings.",
        "This is not spam.",
        "Check out this amazing offer!",
        "Hello, how are you?",
        "Win a free iPhone now!",
        "Congratulations, you've won $1,000,000.",
        "you've won 1,000,000rs.",
        "Get a 50% discount on all products today!",
        "Limited-time offer: Buy one, get one free!",
        "Exclusive deal: Save $100 on your purchase.",
        "Hello, it's me! Are you there?",
        "Earn money from home with our easy work-from-home program!",
        "You're the lucky winner of a luxury vacation!",
        "Hurry! Limited stock available for our new product launch.",
        "Greetings, hope you're having a great day!",
        "Claim your prize now, don't miss out on this opportunity!",
        "Congratulations, you've been selected for a special offer!",
        "Guaranteed to lose weight fast with our new product!",
        "Dear customer, your account needs attention.",
        "Amazing deals on electronics! Shop now!",
        "You're pre-approved for a credit card with a $10,000 limit!",
        "Click here to unsubscribe from our mailing list.",
        "Important: Your account security has been compromised.",
        "Join our loyalty program and get exclusive discounts!",
    ],
    'label': [1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1]  # 0 for not spam, 1 for spam
})

### Preprocess the data with NLTK


In [3]:
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Error loading punkt: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


### Creat a vectorizer object CountVectorizer


In [4]:
vectorizer = CountVectorizer()

### Fit and transform the data for training


In [5]:
X_train_vectorized = vectorizer.fit_transform(data['text'])
y_train = data['label']

### Train a Naive Bayes classifier


In [6]:
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

### Define spam words


In [7]:
spam_words = [
    'free',
    'win',
    'prize',
    'million',
    'lottery',
    'guaranteed',
    'cash',
    'money',
    'credit',
    'loan',
    'viagra',
    'pharmacy',
    'mortgage',
    'discount',
    'offer',
    'debt',
    'investment',
    'earn',
    'income',
    'online',
    'limited time',
    'risk-free',
    'urgent',
    'click below',
    'buy direct',
    'no catch',
    'no cost',
    'no fees',
    'save big money',
    'order now',
    'apply now',
    'get it now',
    'call now',
    'meet singles',
    'meet hot singles',
    'singles near you',
    'meet sexy singles',
    'get out of debt',
    'insurance',
    'warranty',
    'obligation',
    'reverses aging',
    'hidden',
    'prizes',
    'promise',
    '100% satisfied',
    'money-back',
    'stop',
    'lose',
    'miracle',
    'mass email',
    'full refund',
    'no hidden',
    'investment',
    'additional income',
    'home-based',
    'dig up dirt',
    'double your',
    'earn extra',
    'extra cash',
    'expect to earn',
    'fast cash',
    'free access',
    'free investment',
    'free membership',
    'free offer',
    'free preview',
    'increase sales',
    'increase traffic',
    'internet marketing',
    'marketing solution',
    'multi-level marketing',
    'online biz opportunity',
    'remove',
    'search engine',
    'this won’t believe',
    'winner',
    'winning',
    'work from home',
    'you are a winner!',
    'your income',
    'your family'
]

### Create function to classify a single sentence


In [21]:
def classify_sentence(sentence):
    # Preprocess the sentence
    sentence_vectorized = vectorizer.transform([sentence])
    
    # Predict whether the sentence is spam or not
    prediction = clf.predict(sentence_vectorized)
    
    # Count words in the sentence
    words = nltk.word_tokenize(sentence)
    word_count = len(words)
    
    # Identify spam words in the sentence
    spam_words_in_sentence = [word.lower() for word in words if word.lower() in spam_words]
    spam_word_count = len(spam_words_in_sentence)
    
    # Define a prediction result based on the model's prediction
    if prediction[0] == 1:
        prediction_result = "spam"
    else:
        prediction_result = "not spam"
    
    # Create a result dictionary
    result = {
        'sentence': sentence,
        'word_count': word_count,
        'spam_word_count': spam_word_count,
        'prediction_result': prediction_result,
    }
    
    if prediction_result == 'spam':
        result['spam_words'] = spam_words_in_sentence
    
    return result

### Let's time to predict


In [22]:
input_sentence = "My name is Artour Babaevsky. I grow up in smal farm to have make potatos. Father say Artour, potato harvest is bad. Need you to have play professional Doto in Amerikanski for make money for head-scarf for babushka.I bring honor to komrade and babushka. Sorry for is not have English. Please no cyka pasta coperino pasterino liquidino throwerino."

### Classify the sentence


In [23]:
result = classify_sentence(input_sentence)

### Print information about the classified sentence


In [24]:
# Define ANSI escape codes for text formatting
RESET = "\x1b[0m"
BOLD = "\x1b[1m"
ITALIC = "\x1b[3m"
RED = "\x1b[31m"
GREEN = "\x1b[32m"

In [25]:
# Define emoji constants
SAD_EMOJI = "😢"
HAPPY_EMOJI = "😃"

In [26]:
# Print results with text formatting and emojis
if result is not None:
    print(BOLD + "Sentence:" + RESET, result.get('sentence'))
    print(BOLD + "Word Count:" + RESET, result.get('word_count'))
    print(BOLD + "Spam Word Count:" + RESET, result.get('spam_word_count'))

    if result['prediction_result'] == 'spam':
        print(BOLD + "Prediction Result:" + RESET, RED + ITALIC + result['prediction_result'] + RESET, SAD_EMOJI)
        if 'spam_words' in result:
            print(BOLD + "Spam Words:" + RESET, ', '.join([RED + word + RESET for word in result['spam_words']]))
    else:
        print(BOLD + "Prediction Result:" + RESET, GREEN + ITALIC + result['prediction_result'] + RESET, HAPPY_EMOJI)
else:
    print("No result available.")  # Handle the case where result is None

[1mSentence:[0m My name is Artour Babaevsky. I grow up in smal farm to have make potatos. Father say Artour, potato harvest is bad. Need you to have play professional Doto in Amerikanski for make money for head-scarf for babushka.I bring honor to komrade and babushka. Sorry for is not have English. Please no cyka pasta coperino pasterino liquidino throwerino.
[1mWord Count:[0m 65
[1mSpam Word Count:[0m 1
[1mPrediction Result:[0m [31m[3mspam[0m 😢
[1mSpam Words:[0m [31mmoney[0m


### Now save the model

In [14]:
import joblib

In [15]:
# Save the classifier and vectorizer to files
joblib.dump(clf, 'sentence_naive_bayes_model.pkl')

['sentence_naive_bayes_model.pkl']

In [16]:
joblib.dump(vectorizer, 'sentence_count_vectorizer.pkl')

['sentence_count_vectorizer.pkl']

In [17]:
# You can also save the spam_words list for reference, if needed
with open('sentence_spam_words.txt', 'w') as file:
    for word in spam_words:
        file.write(word + '\n')

In [18]:
# Load the saved classifier and vectorizer
loaded_clf = joblib.load('sentence_naive_bayes_model.pkl')

In [19]:
loaded_vectorizer = joblib.load('sentence_count_vectorizer.pkl')

In [20]:
# Load the spam_words list if needed
with open('sentence_spam_words.txt', 'r') as file:
    loaded_spam_words = [line.strip() for line in file]