In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Sample dataset
data = {
    'text': [
        'I love machine learning',
        'Machine learning is amazing',
        'Natural language processing is a part of machine learning',
        'I enjoy reading about machine learning',
        'I hate spam emails',
        'Spam emails are annoying',
        'I receive a lot of spam emails',
        'How to avoid spam emails'
    ],
    'label': [1, 1, 1, 1, 0, 0, 0, 0]  # 1 for machine learning related, 0 for spam related
}

df = pd.DataFrame(data)


In [3]:
# Initialize the CountVectorizer
vectorizer = CountVectorizer()

# Transform the text data into a BoW representation
X = vectorizer.fit_transform(df['text'])

# Convert the BoW matrix to a dense array
X = X.toarray()

# Extract the labels
y = df['label']


In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [5]:
# Initialize the Multinomial Naive Bayes classifier
clf = MultinomialNB()

# Train the classifier on the training data
clf.fit(X_train, y_train)


In [6]:
# Make predictions on the test data
y_pred = clf.predict(X_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy and classification report
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))


Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2



In [7]:
# Step 1: Install necessary libraries
!pip install numpy pandas nltk




ImportError: cannot import name 'ConditionalFreqDist' from 'nltk.util' (/usr/local/lib/python3.10/dist-packages/nltk/util.py)

In [18]:

# Step 2: Import libraries
import numpy as np
import nltk
import pandas as pd
from nltk import bigrams, FreqDist, ConditionalFreqDist
from collections import defaultdict
import random

# Step 3: Create a sample dataset
data = {
    'text': [
        'I love machine learning',
        'Machine learning is amazing',
        'Natural language processing is a part of machine learning',
        'I enjoy reading about machine learning',
        'I hate spam emails',
        'Spam emails are annoying',
        'I receive a lot of spam emails',
        'How to avoid spam emails'
    ],
    'label': [1, 1, 1, 1, 0, 0, 0, 0]  # 1 for machine learning related, 0 for spam related
}
df = pd.DataFrame(data)

# Step 4: Prepare the data for bigram model
# Concatenate all texts
all_text = ' '.join(df['text'])

# Tokenize the text
nltk.download('punkt')
tokens = nltk.word_tokenize(all_text.lower())

# Create bigrams
bigrams_list = list(bigrams(tokens))

# Build a frequency distribution of bigrams
cfd = ConditionalFreqDist(bigrams_list)

# Step 5: Generate text using the bigram model
def generate_sentence(cfd, start_word, num_words=15):
    word = start_word
    sentence = [word]
    for _ in range(num_words - 1):
        if word in cfd:
            next_word = random.choices(list(cfd[word].keys()), list(cfd[word].values()))[0]
            sentence.append(next_word)
            word = next_word
        else:
            break
    return ' '.join(sentence)

# Generate a sentence
start_word = 'machine'  # You can change this to any starting word
generated_sentence = generate_sentence(cfd, start_word)
print(generated_sentence)


machine learning is a part of spam emails are annoying i receive a part of


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
