# Spam Classification

In [1]:
import pandas as pd   #data manupilation and analysis 
from sklearn.feature_extraction.text import TfidfVectorizer  #raw data to matrix of term frequency-inverse doc frequency
from sklearn.model_selection import train_test_split   #split into train and test sets
from sklearn.naive_bayes import MultinomialNB   #data represented as word vector counts
from sklearn.pipeline import Pipeline  #single obj is used for train and test
from sklearn.metrics import accuracy_score  #to compute accuracy
import joblib  #to save and load the model after training
import re  #regular expression for matching str, data preprocessing

# Load the dataset into dataframe
df = pd.read_csv('dataset/spam_detection_dataset.csv', encoding='latin1')

df = df['column_name'].fillna(df['column_name'].mode()[0], inplace=True)

In [2]:
# Define cleaning functions
def remove_urls(text):
    return re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

def remove_non_ascii(text):
    # Remove non-ASCII characters
    return ''.join(char for char in text if ord(char) < 128)

def remove_digits(text):
    # Remove numeric digits
    return re.sub(r'\d+', '', text)

def remove_special_characters(text):
    # Remove special characters except whitespace
    return re.sub(r'[^\w\s]', '', text)

def normalize_case(text):
    # Normalize text to lowercase
    return text.lower()

def clean_text(text):
    # Remove URLs
    text = remove_urls(text)
    # Remove non-ASCII characters
    text = remove_non_ascii(text)
    # Remove numeric digits
    text = remove_digits(text)
    # Remove special characters except whitespace
    text = remove_special_characters(text)
    # Normalize case
    text = normalize_case(text)
    # Remove extra whitespace
    text = ' '.join(text.split())
    return text

# Apply cleaning functions to the 'comment' column
df['comment'] = df['comment'].apply(clean_text)

In [3]:
# Split dataset
X = df['comment']
y = df['spam']  # Assuming 'spam' is the column with binary labels (1 for spam, 0 for non-spam)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Define a function to train, evaluate models, and print the classification report
def train_and_evaluate_model(pipeline, model_name):
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} Accuracy: {accuracy}')
    
    # Print precision, recall, and F1-score
    print(f'{model_name} Classification Report:\n')
    print(classification_report(y_test, y_pred, target_names=['not spam', 'spam']))
    print('-'*60)

Model Accuracy: 0.9029503105590062


In [5]:
# 1. SVM Model
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('svc', SVC(kernel='linear'))  # SVM classifier with a linear kernel
])
train_and_evaluate_model(pipeline_svm, 'SVM')

['spam_detection_model.pkl']

In [6]:
# Load the model from the file
model = joblib.load('spam_detection_model.pkl')

# Create a DataFrame with dummy values
data = {
    'comment': [
        'ayas stupid overacting',
        'khushis episode coming',
        'popatlal randwa marega',
        'apne mujhe bataa ya nhi ki apki beti bhi hai',
        'kya dimagggg hai popatlal',
        'arre oo yeh kya matlb kuch bhi family show hai yaar mat dikhao rre baba aisa kuch bhi',
        'tarak bhai mehta anjali ben mehta are really good pair',
        'if there were no girl there were no one in the world because a girl give birth to everyone',
        'popotlal ki shadi hogi rcb jab jitegi',
        'son is son till wife daughter is daughter till life',
        'khusi episodes coming',
        'i like taraak mehta ka ooltah chansma',
        'polar bhai ki shaadi jald hi jai',
        'Loved this product, will buy again.',
        'Claim your free vacation today!',
        'The quality of this product is amazing.',
        'You have been selected for a special offer.',
        'nice show'
    ]
}

new_data = pd.DataFrame(data)


# Basic data cleaning on new data
new_data['comment'] = new_data['comment'].str.lower()  # Convert to lowercase
new_data['comment'] = new_data['comment'].str.replace(r'\d+', '', regex=True)  # Remove numbers
new_data['comment'] = new_data['comment'].str.replace(r'[^\w\s]', '', regex=True)  # Remove punctuation
new_data['comment'] = new_data['comment'].str.strip()  # Remove whitespace

# Predict using the loaded model
new_predictions = model.predict(new_data['comment'])

# Add predictions to the new data
new_data['spam_prediction'] = new_predictions

print(new_data.head(20))

                                              comment  spam_prediction
0                              ayas stupid overacting                0
1                              khushis episode coming                0
2                              popatlal randwa marega                0
3        apne mujhe bataa ya nhi ki apki beti bhi hai                0
4                           kya dimagggg hai popatlal                0
5   arre oo yeh kya matlb kuch bhi family show hai...                0
6   tarak bhai mehta anjali ben mehta are really g...                0
7   if there were no girl there were no one in the...                0
8               popotlal ki shadi hogi rcb jab jitegi                0
9   son is son till wife daughter is daughter till...                0
10                              khusi episodes coming                0
11              i like taraak mehta ka ooltah chansma                0
12                   polar bhai ki shaadi jald hi jai                0
13    