# Build model without dataset

## This model is a paragraph-based sentence classifier

### Import libraries

In [1]:
import re
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

### Create a dummy dataset

In [2]:
# Toy training corpus. Each record pairs a message with its label
# (0 for not spam, 1 for spam) so that a text and its label cannot drift
# apart the way two parallel lists can. Rows whose label contradicted the
# message itself (e.g. "This is not spam." marked as spam) carry the
# corrected label.
labelled_messages = [
    ("This is not spam.", 0),
    ("It's a legitimate message.", 0),
    ("Check out this amazing offer! You won't believe it.", 1),
    ("Hello, how are you? Hope you're doing well.", 0),
    ("Win a free iPhone now! Claim your prize!", 1),
    ("Congratulations, you've won $1,000,000! Click to claim your winnings.", 1),
    ("This is not spam.", 0),
    ("Check out this amazing offer!", 1),
    ("Hello, how are you?", 0),
    ("Win a free iPhone now!", 1),
    ("Congratulations, you've won $1,000,000.", 1),
    ("you've won 1,000,000rs.", 1),
    ("Get a 50% discount on all products today!", 1),
    ("Limited-time offer: Buy one, get one free!", 1),
    ("Exclusive deal: Save $100 on your purchase.", 1),
    ("Hello, it's me! Are you there?", 0),
    ("Earn money from home with our easy work-from-home program!", 1),
    ("You're the lucky winner of a luxury vacation!", 1),
    ("Hurry! Limited stock available for our new product launch.", 0),
    ("Greetings, hope you're having a great day!", 0),
    ("Claim your prize now, don't miss out on this opportunity!", 1),
    ("Congratulations, you've been selected for a special offer!", 1),
    ("Guaranteed to lose weight fast with our new product!", 1),
    ("Dear customer, your account needs attention.", 0),
    ("Amazing deals on electronics! Shop now!", 1),
    ("You're pre-approved for a credit card with a $10,000 limit!", 1),
    ("Click here to unsubscribe from our mailing list.", 1),
    ("Important: Your account security has been compromised.", 0),
    ("Join our loyalty program and get exclusive discounts!", 1),
]

# Same frame layout as before: a 'text' column and an integer 'label' column.
data = pd.DataFrame(labelled_messages, columns=['text', 'label'])

### Preprocess the data with NLTK


In [3]:
# Fetch the NLTK tokenizer models needed by sent_tokenize / word_tokenize.
# Older NLTK releases load the 'punkt' package, newer ones look for
# 'punkt_tab' instead, so both are requested. A package that is already
# up to date is not downloaded again, so re-running this cell is cheap.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Define the paragraph to classify


In [4]:
# Sample paragraph to classify, written as one fragment per sentence.
# Adjacent string literals are concatenated, so this is a single string.
paragraph = (
    "My Grandfather smoked his whole life. "
    "I was about 10 years old when my mother said to him, 'If you ever want to see your grandchildren graduate, you have to stop immediately.'. "
    "Tears welled up in his eyes when he realized what exactly was at stake. "
    "He gave it up immediately. "
    "Three years later he died of lung cancer. "
    "It was really sad and destroyed me. "
    "My mother said to me- 'Don't ever smoke. "
    "Please don't put your family through what your Grandfather put us through. "
    "I agreed. "
    "At 28, I have never touched a cigarette. "
    "I must say, I feel a very slight sense of regret for never having done it, because your post gave me cancer anyway."
)

### Split the paragraph into sentences


In [5]:
# Break the paragraph into individual sentences with NLTK's sentence tokenizer;
# each sentence is classified separately further down.
sentences = sent_tokenize(paragraph)

### Initialize a CountVectorizer


In [6]:
# Bag-of-words vectorizer with scikit-learn's default settings (no options are passed).
vectorizer = CountVectorizer()

### Fit and transform the data for training


In [7]:
# Fit the vectorizer on the training texts and encode them in one step;
# the same fitted vectorizer is reused later to encode unseen sentences.
X_train_vectorized = vectorizer.fit_transform(data['text'])
# Target labels: 0 for not spam, 1 for spam.
y_train = data['label']

### Train a Naive Bayes classifier


In [8]:
# Multinomial Naive Bayes with default settings, trained on the vectorized
# training texts and their 0/1 labels.
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)

### Initialize variables to store information about each sentence


In [9]:
# Holds one result record (a dict) per classified sentence.
sentence_info = []

### Define spam words


In [10]:
# Words and phrases treated as spam indicators. Entries are lower-case
# because sentence text is lower-cased before it is compared against them.
# Each term appears exactly once.
spam_words = [
    'free',
    'win',
    'prize',
    'million',
    'lottery',
    'guaranteed',
    'cash',
    'money',
    'credit',
    'loan',
    'viagra',
    'pharmacy',
    'mortgage',
    'discount',
    'offer',
    'debt',
    'investment',
    'earn',
    'income',
    'online',
    'limited time',
    'risk-free',
    'urgent',
    'click below',
    'buy direct',
    'no catch',
    'no cost',
    'no fees',
    'save big money',
    'order now',
    'apply now',
    'get it now',
    'call now',
    'meet singles',
    'meet hot singles',
    'singles near you',
    'meet sexy singles',
    'get out of debt',
    'insurance',
    'warranty',
    'obligation',
    'reverses aging',
    'hidden',
    'prizes',
    'promise',
    '100% satisfied',
    'money-back',
    'stop',
    'lose',
    'miracle',
    'mass email',
    'full refund',
    'no hidden',
    'additional income',
    'home-based',
    'dig up dirt',
    'double your',
    'earn extra',
    'extra cash',
    'expect to earn',
    'fast cash',
    'free access',
    'free investment',
    'free membership',
    'free offer',
    'free preview',
    'increase sales',
    'increase traffic',
    'internet marketing',
    'marketing solution',
    'multi-level marketing',
    'online biz opportunity',
    'remove',
    'search engine',
    'this won’t believe',
    'winner',
    'winning',
    'work from home',
    'you are a winner!',
    'your income',
    'your family'
]

### Classify each sentence in the paragraph


In [11]:
def _find_spam_terms(sentence, tokens, spam_terms):
    """Return the spam terms found in one sentence.

    Args:
        sentence: The raw sentence text.
        tokens: The word tokens of that sentence.
        spam_terms: Iterable of lower-case spam words and phrases.

    Returns:
        A list of lower-case hits: one entry per matching token, in token
        order, followed by one entry per occurrence of each multi-word
        phrase. A phrase hit is reported in addition to any single-word
        hits it contains.
    """
    vocabulary = set(spam_terms)  # constant-time lookups, duplicates collapsed
    lowered_tokens = [token.lower() for token in tokens]
    hits = [token for token in lowered_tokens if token in vocabulary]

    # A term containing a space can never equal a single token, so search for
    # it in the sentence text itself, bounded by non-word characters.
    lowered_sentence = sentence.lower()
    for phrase in sorted(term for term in vocabulary if ' ' in term):
        pattern = r'(?<!\w)' + re.escape(phrase) + r'(?!\w)'
        hits.extend(re.findall(pattern, lowered_sentence))
    return hits


def classify_sentences(sentences, vectorizer, clf, spam_terms):
    """Classify every sentence and collect per-sentence statistics.

    Args:
        sentences: Sentence strings to classify.
        vectorizer: Fitted vectorizer used to encode the sentences.
        clf: Fitted classifier; a prediction of 1 is reported as "spam",
            anything else as "not spam".
        spam_terms: Lower-case spam words and phrases to look for.

    Returns:
        A list with one dict per sentence, holding the keys 'sentence',
        'word_count' (number of NLTK tokens, punctuation included),
        'spam_word_count', 'prediction_result' and 'spam_words'.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to encode; do not hand the classifier an empty batch.
        return []

    # Encode and predict once for the whole batch instead of once per sentence.
    predictions = clf.predict(vectorizer.transform(sentences))

    records = []
    for sentence, label in zip(sentences, predictions):
        tokens = nltk.word_tokenize(sentence)
        spam_hits = _find_spam_terms(sentence, tokens, spam_terms)
        records.append({
            'sentence': sentence,
            'word_count': len(tokens),
            'spam_word_count': len(spam_hits),
            'prediction_result': "spam" if label == 1 else "not spam",
            'spam_words': spam_hits,
        })
    return records


# Rebuild the list from scratch so that re-running this cell never piles
# duplicate records onto the results of an earlier run.
sentence_info = classify_sentences(sentences, vectorizer, clf, spam_words)

In [12]:
# %pip installs into the environment of the running kernel, whereas !pip runs
# whichever pip is first on the shell PATH.
# NOTE(review): colorama is not imported by any cell shown here; the output
# formatting below uses raw ANSI escape codes. Confirm this install is needed.
%pip install colorama



### Time to predict

In [13]:
# ANSI escape sequences used to style the printed report
RESET = '\033[0m'  # Clears any active text formatting
ITALIC = '\033[3m'  # Switches to italic text

# Colour applied to each prediction label
COLORS = dict([
    ('spam', '\033[91m'),      # Red
    ('not spam', '\033[92m'),  # Green
])

# Emoji shown next to each prediction label
EMOJIS = dict([
    ('spam', '😞'),      # Sad face
    ('not spam', '😃'),  # Happy face
])

In [14]:
# Print a colour-coded report for every classified sentence.
for info in sentence_info:
    prediction_result = info['prediction_result']

    # Labels without an entry in COLORS are printed plain, without an emoji.
    if prediction_result in COLORS:
        formatted_result = COLORS[prediction_result] + prediction_result.upper() + RESET
        emoji = EMOJIS[prediction_result]
    else:
        formatted_result = prediction_result.upper()
        emoji = ''

    print("Sentence:", info['sentence'])
    print("Word Count:", info['word_count'])
    print("Spam Word Count:", info['spam_word_count'])
    print("Prediction Result:", formatted_result + ' ' + emoji)

    if prediction_result == 'spam':
        # Use a dedicated name here: assigning to `spam_words` would replace
        # the module-level spam word list with a string, corrupting every
        # later use of that list (including its export to a text file).
        flagged_terms = ', '.join(info['spam_words'])
        print("Spam Words:", ITALIC + flagged_terms + RESET)

    print("\n")

Sentence: My Grandfather smoked his whole life.
Word Count: 7
Spam Word Count: 0
Prediction Result: [91mSPAM[0m 😞
Spam Words: [3m[0m


Sentence: I was about 10 years old when my mother said to him, 'If you ever want to see your grandchildren graduate, you have to stop immediately.'.
Word Count: 31
Spam Word Count: 1
Prediction Result: [91mSPAM[0m 😞
Spam Words: [3mstop[0m


Sentence: Tears welled up in his eyes when he realized what exactly was at stake.
Word Count: 15
Spam Word Count: 0
Prediction Result: [91mSPAM[0m 😞
Spam Words: [3m[0m


Sentence: He gave it up immediately.
Word Count: 6
Spam Word Count: 0
Prediction Result: [92mNOT SPAM[0m 😃


Sentence: Three years later he died of lung cancer.
Word Count: 9
Spam Word Count: 0
Prediction Result: [91mSPAM[0m 😞
Spam Words: [3m[0m


Sentence: It was really sad and destroyed me.
Word Count: 8
Spam Word Count: 0
Prediction Result: [92mNOT SPAM[0m 😃


Sentence: My mother said to me- 'Don't ever smoke.
Word Count: 10
Sp

### Now save the model

In [15]:
import joblib

In [16]:
# Persist the trained classifier to disk with joblib. The fitted vectorizer is
# saved separately; both are needed to classify new text later.
joblib.dump(clf, 'paragraph_naive_bayes_model.pkl')

['paragraph_naive_bayes_model.pkl']

In [17]:
# Persist the fitted vectorizer so new text can be encoded with the same
# vocabulary the classifier was trained on.
joblib.dump(vectorizer, 'paragraph_count_vectorizer.pkl')

['paragraph_count_vectorizer.pkl']

In [18]:
# Optionally keep the spam word list alongside the model, one entry per line
with open('paragraph_spam_words.txt', 'w') as word_file:
    word_file.writelines(entry + '\n' for entry in spam_words)

In [19]:
# Load the saved classifier back from disk.
# joblib files are pickle-based: only load files from a trusted source.
loaded_clf = joblib.load('paragraph_naive_bayes_model.pkl')

In [20]:
# Load the saved vectorizer back from disk (same trust caveat as any
# pickle-based file: only load files from a trusted source).
loaded_vectorizer = joblib.load('paragraph_count_vectorizer.pkl')

In [21]:
# Read the spam word list back, trimming surrounding whitespace from each line
with open('paragraph_spam_words.txt', 'r') as word_file:
    loaded_spam_words = list(map(str.strip, word_file))