## Multi-Page Document Classification

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Sample multi-page documents
documents = [
    'This is the first document. It is about NLP.',
    'This is the second document. It contains information about Machine Learning.'
]
labels = ['NLP', 'Machine Learning']

# Create and train the model
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(documents, labels)

# Predict category for a new document
new_doc = ['This document discusses NLP techniques.']
predicted_category = model.predict(new_doc)
print("Predicted Category:", predicted_category)


Predicted Category: ['NLP']


## Spam Email Detection

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Sample email data
spam_data = {
    'text': ['Free money!!!', 'Hi, how are you?', 'Get rich quick!', 'Meeting at 10 AM.'],
    'label': ['spam', 'ham', 'spam', 'ham']
}
df = pd.DataFrame(spam_data)

# Split data
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.25, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)

# Train spam classifier
model_spam = MultinomialNB()
model_spam.fit(X_train_vec, y_train)

# Predict
X_test_vec = vectorizer.transform(X_test)
pred_spam = model_spam.predict(X_test_vec)
print("Spam Prediction:", pred_spam)


Spam Prediction: ['spam']


## Topic Modeling

In [4]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data for topic modeling
texts = [
    'I love programming in Python.',
    'Python is great for data science.',
    'Data analysis is key in machine learning.',
    'Natural language processing is an exciting field.'
]

# Vectorization
vectorizer = CountVectorizer()
X_topic = vectorizer.fit_transform(texts)

# Topic modeling
lda = LatentDirichletAllocation(n_components=2, random_state=42)
lda.fit(X_topic)

# Transform documents to topic space
topics = lda.transform(X_topic)
print("Topics Distribution:\n", topics)


Topics Distribution:
 [[0.89048136 0.10951864]
 [0.91424315 0.08575685]
 [0.07777014 0.92222986]
 [0.06729811 0.93270189]]


## Content-Based Product Recommendations

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Sample product data
product_data = {
    'Product 1': 'NLP and machine learning tutorials',
    'Product 2': 'Python programming for beginners',
    'Product 3': 'Advanced data science techniques'
}

# Convert to DataFrame
product_df = pd.DataFrame(list(product_data.items()), columns=['Product', 'Description'])

# Create feature matrix
product_features = CountVectorizer().fit_transform(product_df['Description'])

# Compute similarities
similarity_matrix = cosine_similarity(product_features)

# Recommend products
product_index = 0  # Example: Product 1
recommended_indices = np.argsort(similarity_matrix[product_index])[:-3:-1]  # Top 2
recommended_products = product_df.iloc[recommended_indices]['Product'].values.tolist()
print("Recommended Products:", recommended_products)


Recommended Products: ['Product 1', 'Product 3']
