In [1]:
!pip install nltk scikit-learn





In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
# Make sure your dataset has 'text' and 'label' columns
df = pd.read_csv(r"C:\study materials\data preprocessing.csv")

# Display the first few rows of the dataset
print(df.head())

# Fill NaN values with empty string and ensure all entries are strings
df['text'] = df['text'].fillna('').astype(str)

# Text preprocessing function
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'^b\s+', '', text)
    text = text.lower()
    
    # Tokenization and lemmatization
    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Apply the preprocessing function to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['processed_text'] = df['processed_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label'], test_size=0.2, random_state=42)

# Convert text data to numerical data using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Save the vectorizer for later use
import pickle
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

# Display the shapes of the resulting matrices
print("Training set shape:", X_train_tfidf.shape)
print("Test set shape:", X_test_tfidf.shape)

# The processed data is now ready for model training


   label                                               text
0    1.0  ounce feather bowl hummingbird opec moment ala...
1    1.0  wulvob get your medircations online qnb ikud v...
2    0.0   computer connection from cnn com wednesday es...
3    1.0  university degree obtain a prosperous future m...
4    0.0  thanks for all your answers guys i know i shou...
Training set shape: (37, 588)
Test set shape: (10, 588)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aaaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aaaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline
import pickle

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load the dataset
df = pd.read_csv(r"C:\Users\aaaro\Downloads\archive (4)\combined_data.csv")
# Display the first few rows of the dataset
print(df.head())

# Fill NaN values with empty string and ensure all entries are strings
df['text'] = df['text'].fillna('').astype(str)

# Text preprocessing function
def preprocess_text(text):
    # Remove special characters and numbers
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\^[a-zA-Z]\s+', ' ', text)
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    text = re.sub(r'^b\s+', '', text)
    text = text.lower()
    
    # Tokenization and lemmatization
    tokens = text.split()
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)

# Apply the preprocessing function to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

# Remove stopwords
stop_words = set(stopwords.words('english'))
df['processed_text'] = df['processed_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))

# Check for missing values in the label column
missing_labels = df['label'].isna().sum()
print(f"Number of missing labels: {missing_labels}")

# Option 1: Drop rows with missing labels
df = df.dropna(subset=['label'])

# Option 2: Replace NaN labels with a default value (if applicable)
# df['label'] = df['label'].fillna('default_label')  # Replace 'default_label' with an appropriate value

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['processed_text'], df['label'], test_size=0.2, random_state=42)

# Feature extraction using TF-IDF and feature selection with chi2
k = 1000  # Number of features to select

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('chi2', SelectKBest(chi2, k=k))
])

# Fit the pipeline on the training data and transform both training and test sets
X_train_selected = pipeline.fit_transform(X_train, y_train)
X_test_selected = pipeline.transform(X_test)

# Save the pipeline for later use
with open('tfidf_chi2_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)

# Display the shapes of the resulting matrices
print("Training set shape after feature selection:", X_train_selected.shape)
print("Test set shape after feature selection:", X_test_selected.shape)

# The processed data is now ready for model training


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aaaro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aaaro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   label                                               text
0      1  ounce feather bowl hummingbird opec moment ala...
1      1  wulvob get your medircations online qnb ikud v...
2      0   computer connection from cnn com wednesday es...
3      1  university degree obtain a prosperous future m...
4      0  thanks for all your answers guys i know i shou...
Number of missing labels: 0
Training set shape after feature selection: (66758, 1000)
Test set shape after feature selection: (16690, 1000)


In [4]:
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the model
model = LogisticRegression()
-
# Train the model on the selected features
model.fit(X_train_selected, y_train)

# Save the model for later use
with open('spam_classifier_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Evaluate the model on the test set
y_pred = model.predict(X_test_selected)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Generate classification report
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)


Model Accuracy: 97.33%

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97      7938
           1       0.97      0.98      0.97      8752

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690


Confusion Matrix:
 [[7629  309]
 [ 136 8616]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Explanation:
X_new: This should be the new input data that you want to classify. In the example above, X_new is a list of strings representing new email texts.

Transform the new data: The loaded_pipeline.transform(X_new) step applies the same preprocessing steps to the new data as were applied during training.

Predict using the model: The loaded_model.predict(X_new_selected) step uses the trained model to predict the class (spam or not spam) for the new data.

In [9]:
import pickle

# Load the feature extraction pipeline
with open('tfidf_chi2_pipeline.pkl', 'rb') as file:
    loaded_pipeline = pickle.load(file)

# Load the trained model
with open('spam_classifier_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
X_new = ["Congratulations! You've won a free ticket to the Bahamas!", 
         "Don't miss our summer sale on all electronics.",
         "This is a regular email without any spammy content.",
        "Your request for an Amazon SageMaker Studio Lab expired because you didn’t register your account within 7 days of approval"
]

# Use the loaded pipeline and model
X_new_selected = loaded_pipeline.transform(X_new)  # Apply to new data
y_new_pred = loaded_model.predict(X_new_selected)

# Output predictions
print(y_new_pred)


[0 1 1 1]
