<a href="https://colab.research.google.com/github/2Svenkatesh/Machine-learning/blob/main/Spam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from imblearn.over_sampling import SMOTE

In [9]:
# Load the imbalanced dataset from the 'spam.csv' file using the 'latin-1' encoding.
# Display a random sample of 3 rows to get an initial look at the data.
data = pd.read_csv(f'/content/spam.csv', encoding='latin-1')
data.sample(3)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
4338,ham,Just got outta class gonna go gym.,,,
4674,spam,"Hi babe its Chloe, how r u? I was smashed on s...",,,
680,ham,What is this 'hex' place you talk of? Explain!,,,


In [10]:
# Rename columns 'v1' to 'label' and 'v2' to 'message' for better clarity.
# Display the updated column names to confirm the changes.
data.rename(columns={'v1': 'label', 'v2': 'message'}, inplace=True)

data.columns

Index(['label', 'message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [11]:
# Select only the 'label' and 'message' columns for further analysis.
# Update the dataset to include only these columns.
col_in_use = ['label', 'message']

data = data[col_in_use]

In [12]:
# Preprocess the data by mapping 'ham' to 0 and 'spam' to 1 in the 'label' column.
# This converts the categorical labels into numerical values for easier analysis.
data['label'] = data['label'].map({'ham': 0, 'spam': 1})

In [13]:
data.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,4825
1,747


In [14]:
# Split the data into training and testing sets.
# Use 99% of the data for training and 1% for testing, with shuffling to ensure randomness.
X_train, X_test, y_train, y_test = train_test_split(data['message'], data['label'], test_size=0.25, shuffle=True)

# Convert the text data to TF-IDF features.
# Fit the vectorizer on the training data and transform both training and testing data.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [15]:
# Apply SMOTE (Synthetic Minority Over-sampling Technique) to the training data to address class imbalance.
# Use SMOTE with fewer neighbors (k_neighbors=3) to generate synthetic samples.
smote = SMOTE(random_state=42, k_neighbors=3)  # Reduce the number of neighbors
X_train_tfidf_res, y_train_res = smote.fit_resample(X_train_tfidf, y_train)

In [16]:
# Train the logistic regression model using the resampled training data.
model = LogisticRegression()
model.fit(X_train_tfidf_res, y_train_res)

# Predict the labels on the test set using the trained model.
y_pred = model.predict(X_test_tfidf)

In [17]:
# Evaluate the model's performance on the test set.
# Calculate the accuracy score, confusion matrix, and classification report.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

In [18]:
print(f'Accuracy: {accuracy}')

# Print the confusion matrix to show the performance of the classification.
print('Confusion Matrix:')
print(conf_matrix)

# Print the classification report to provide detailed metrics for each class.
print('Classification Report:')
print(class_report)


Accuracy: 0.9849246231155779
Confusion Matrix:
[[1206    8]
 [  13  166]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1214
           1       0.95      0.93      0.94       179

    accuracy                           0.98      1393
   macro avg       0.97      0.96      0.97      1393
weighted avg       0.98      0.98      0.98      1393



In [19]:
# Function to classify new messages
def classify_message(message):
    message_tfidf = vectorizer.transform([message])
    prediction = model.predict(message_tfidf)
    return 'spam' if prediction[0] == 1 else 'ham'


# Example usage
new_message = "Hi Bro > Come to office!"
print(f'The message "{new_message}" is classified as {classify_message(new_message)}.')

new_message = "Great, You Won Lottery, Free Get Hurry!"
print(f'The message "{new_message}" is classified as {classify_message(new_message)}.')

The message "Hi Bro > Come to office!" is classified as ham.
The message "Great, You Won Lottery, Free Get Hurry!" is classified as spam.
