# **NLP FOR TEXT CLASSIFICATION**

In [None]:
import kagglehub

# Fetch the latest version of the SMS spam dataset through kagglehub and
# report where the files were placed.
dataset_handle = "uciml/sms-spam-collection-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# ✅ Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import string
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Download the NLTK stop-word corpus that the preprocessing step reads from.
nltk.download('stopwords')

# ✅ Step 2: Load Dataset
# The file is tab-separated and read without a header row, so the two column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ['label', 'message']
df = pd.read_csv(
    url,
    sep='\t',
    header=None,
    names=column_names,
)

# ✅ Step 3: Preprocess Text
# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache of the NLTK English stop words (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Loading the corpus for every message is wasteful; a frozenset also
        # gives constant-time membership tests instead of a list scan.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text, stop_words=None):
    """Lower-case *text*, strip punctuation, and drop stop words.

    Args:
        text: The raw message string.
        stop_words: Optional container of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used (the corpus must
            already be downloaded).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # NOTE(review): punctuation is removed before stop-word filtering, so any
    # stop-list entry that contains an apostrophe can never match a token --
    # confirm this is intended.
    without_punctuation = text.lower().translate(_PUNCT_TABLE)
    kept = (word for word in without_punctuation.split() if word not in stop_words)
    return ' '.join(kept)

# Add a cleaned version of each raw message alongside the original text.
df['clean_message'] = df['message'].apply(clean_text)

# ✅ Step 4: Encode Labels
# Numeric targets for the classifier: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_codes)

# ✅ Step 5: Train-Test Split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
features = df['clean_message']
targets = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# ✅ Step 6: TF-IDF Vectorization
# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Step 7: Train Model (Logistic Regression)
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train_vec, y_train)

# ✅ Step 8: Evaluate Model
# Score the held-out messages and print the per-class metrics.
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred)

print("📊 Classification Report:\n")
print(report)

# Heatmap of the confusion matrix: actual classes on the y-axis, predicted
# classes on the x-axis.
class_names = ['Ham', 'Spam']
conf_mat = confusion_matrix(y_test, y_pred)
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ✅ Step 9: Try Predicting Custom SMS Messages
sample_sms = [
    "Congratulations! You've won a free ticket to Bahamas. Reply WIN to claim.",
    "Can we meet tomorrow at 5pm?",
    "You are selected for a cash prize of $1000. Click the link to claim."
]

# New messages go through the same cleaning and the already-fitted vectorizer
# that the training data used.
sample_clean = list(map(clean_text, sample_sms))
sample_vec = vectorizer.transform(sample_clean)
preds = model.predict(sample_vec)

for original_text, predicted in zip(sample_sms, preds):
    verdict = 'Spam' if predicted == 1 else 'Ham'
    print(f"\n🔍 Message: {original_text}")
    print(f"➡ Prediction: {verdict}")

In [None]:
# ✅ Step 1: Install & Import Libraries
!pip install wordcloud --quiet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# The stop-word corpus must be present before the text-cleaning step runs.
nltk.download('stopwords')

# ✅ Step 2: Load SMS Spam Collection Dataset
# Tab-separated file read without a header row; columns are named here.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
read_options = dict(sep='\t', header=None, names=['label', 'message'])
df = pd.read_csv(url, **read_options)

# ✅ Step 3: Preprocessing Text
# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

# Lazily-filled cache of the NLTK English stop words (see _english_stop_words).
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        # Building the stop list once (instead of once per token) and storing
        # it as a frozenset makes each membership test constant-time.
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text, stop_words=None):
    """Lower-case *text*, strip punctuation, and drop stop words.

    Args:
        text: The raw message string.
        stop_words: Optional container of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used (the corpus must
            already be downloaded).

    Returns:
        The remaining tokens joined by single spaces; an empty string when
        nothing is left.
    """
    if stop_words is None:
        stop_words = _english_stop_words()

    # NOTE(review): punctuation is removed before stop-word filtering, so any
    # stop-list entry that contains an apostrophe can never match a token --
    # confirm this is intended.
    without_punctuation = text.lower().translate(_PUNCT_TABLE)
    kept = (word for word in without_punctuation.split() if word not in stop_words)
    return ' '.join(kept)

# Derive the cleaned text column and the numeric target (ham -> 0, spam -> 1).
df['clean_message'] = df['message'].apply(clean_text)
label_codes = {'ham': 0, 'spam': 1}
df['label_num'] = df['label'].map(label_codes)

# ✅ Step 4: Visualization - Class Distribution
# Bar chart of how many messages carry each label.
# NOTE(review): newer seaborn releases may warn about `palette` being passed
# without `hue` -- confirm against the installed version.
plt.figure(figsize=(6, 4))
sns.countplot(x='label', data=df, palette='Set2')
plt.title('Distribution of Ham vs Spam')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()

# ✅ Step 5: WordClouds for Spam and Ham
# Concatenate the cleaned messages of each class into one long string.
is_spam = df['label'] == 'spam'
is_ham = df['label'] == 'ham'
spam_words = ' '.join(df.loc[is_spam, 'clean_message'])
ham_words = ' '.join(df.loc[is_ham, 'clean_message'])

# Both clouds share size and background; only the colour map differs.
shared_options = dict(width=500, height=400, background_color='white')
spam_wc = WordCloud(colormap='Reds', **shared_options).generate(spam_words)
ham_wc = WordCloud(colormap='Greens', **shared_options).generate(ham_words)

# Draw the two clouds side by side in a single figure.
plt.figure(figsize=(14, 6))
panels = [(spam_wc, 'Spam WordCloud'), (ham_wc, 'Ham WordCloud')]
for position, (cloud, heading) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(heading)

plt.tight_layout()
plt.show()

# ✅ Step 6: Train-Test Split
# 80/20 split with a fixed seed so reruns produce the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_message'],
    df['label_num'],
    test_size=0.2,
    random_state=42,
)

# ✅ Step 7: TF-IDF Vectorization
# Fit on the training messages only, then reuse the fitted vectorizer to
# transform the test messages.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# ✅ Step 8: Train a Logistic Regression Classifier
# fit() returns the estimator itself, so construction and training chain.
model = LogisticRegression().fit(X_train_vec, y_train)

# ✅ Step 9: Predict and Evaluate
# Predict labels for the held-out messages and print per-class metrics.
class_names = ['Ham', 'Spam']
y_pred = model.predict(X_test_vec)
report = classification_report(y_test, y_pred, target_names=class_names)

print("\n📌 Text Classification Report:\n")
print(report)

# ✅ Step 10: Confusion Matrix
# Actual classes are on the y-axis, predicted classes on the x-axis.
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_mat,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# ✅ Step 11: ROC Curve
# Take the second probability column as the score for the positive class
# (presumably label 1 = spam, given the label encoding -- verify via
# model.classes_).
class_probabilities = model.predict_proba(X_test_vec)
y_proba = class_probabilities[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_proba)
roc_auc = auc(fpr, tpr)

curve_label = 'ROC curve (AUC = %0.2f)' % roc_auc

plt.figure(figsize=(6, 4))
plt.plot(fpr, tpr, color='darkorange', label=curve_label)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.show()

# ✅ Step 12: Predict on New Messages
print("\n🔍 Predicting on Sample Messages:")

sample_sms = [
    "Congratulations! You've won a $1000 gift card. Claim now!",
    "Are you coming to the meeting today?",
    "Urgent: Your account is suspended. Click to reactivate."
]

# Unseen messages are cleaned and vectorized exactly like the training data
# before being passed to the classifier.
sample_clean = list(map(clean_text, sample_sms))
sample_vec = vectorizer.transform(sample_clean)
preds = model.predict(sample_vec)

for original_text, predicted in zip(sample_sms, preds):
    verdict = 'Spam' if predicted == 1 else 'Ham'
    print(f"\nMessage: {original_text}")
    print(f"➡ Prediction: {verdict}")