# Term Project - NLP Spam Filter - COMP 237

### Load the data

In [1]:
import pandas as pd

# Load the YouTube comment dataset into a pandas DataFrame.
# The path is relative, so the CSV is resolved against the current working directory.
df = pd.read_csv('Youtube01-Psy.csv')

# Show column names, non-null counts and dtypes.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   COMMENT_ID  350 non-null    object
 1   AUTHOR      350 non-null    object
 2   DATE        350 non-null    object
 3   CONTENT     350 non-null    object
 4   CLASS       350 non-null    int64 
dtypes: int64(1), object(4)
memory usage: 13.8+ KB
None


### Prepare the Data for Model Building using scikit-learn


In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

# Keep only the comment text and its label; the other columns are not used here.
data = df[['CONTENT', 'CLASS']]

# Bag-of-words step: learn the vocabulary from every comment and build the
# document-term count matrix in one pass.
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(data['CONTENT'])

# Re-weight the raw counts with TF-IDF.
# NOTE(review): in the cells visible below, the classifier is fed
# count_vectorizer.transform(...) output, not X_tfidf -- confirm whether the
# TF-IDF weighting was meant to be used for training.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

# Report the dimensions of both matrices.
for label, matrix in (
    ("Shape after Count Vectorization:", X_counts),
    ("Shape after TF-IDF Transformation:", X_tfidf),
):
    print(label, matrix.shape)

Shape after Count Vectorization: (350, 1418)
Shape after TF-IDF Transformation: (350, 1418)


### Shuffle and Split the data into train and test sets

In [3]:
# Randomly reorder every row; the fixed random_state keeps the order reproducible.
df_shuffled = df.sample(frac=1, random_state=42)

# The first 75% of the shuffled rows form the training set, the remainder the test set.
train_size = int(0.75 * len(df_shuffled))
train_data = df_shuffled.iloc[:train_size]
test_data = df_shuffled.iloc[train_size:]

# Split each partition into its text feature (CONTENT) and its label (CLASS).
X_train, y_train = train_data['CONTENT'], train_data['CLASS']
X_test, y_test = test_data['CONTENT'], test_data['CLASS']

### Fit Naive Bayes classifier and cross-validate

In [4]:
# Turn the training comments into count vectors once and reuse the matrix for
# both fitting and cross-validation.
# NOTE(review): count_vectorizer was fitted on the full dataset in an earlier
# cell, before the train/test split, so its vocabulary also contains tokens
# that occur only in the test comments.
X_train_counts = count_vectorizer.transform(X_train)

# Train a multinomial Naive Bayes model on the training counts.
classifier = MultinomialNB().fit(X_train_counts, y_train)

# 5-fold cross-validation on the training data only; report the mean fold score.
cv_scores = cross_val_score(classifier, X_train_counts, y_train, cv=5)
print("Mean Cross-Validation Accuracy:", cv_scores.mean())

Mean Cross-Validation Accuracy: 0.9539912917271408


### Test the model on test set and check the accuracy

In [5]:
# Vectorize the held-out comments with the already-fitted vocabulary, then predict.
X_test_counts = count_vectorizer.transform(X_test)
y_pred = classifier.predict(X_test_counts)

# Compare the predictions against the true labels.
test_confusion = confusion_matrix(y_test, y_pred)
test_accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", test_confusion)
print("Accuracy on Test Data:", test_accuracy)

Confusion Matrix:
 [[42  3]
 [ 1 42]]
Accuracy on Test Data: 0.9545454545454546


### Pass new comments to classifier and check whether it is spam or not

In [12]:
# Hand-written comments that are not part of the dataset, used as a quick sanity check.
new_comments = [
    "The way this video had a chokehold on me is crazy",
    "I am here to see the views",
    "I love the content on this channel.",
    "Check out my new music video!",
    "Get free iPhone by clicking the link: freeiphone.com",
    "Subscribe to my channel for great content!",
    "If you want to escape the matrix, check out my channel"
]

# Encode the comments with the fitted vocabulary and classify them.
new_comments_vectorized = count_vectorizer.transform(new_comments)
predictions = classifier.predict(new_comments_vectorized)

# Put each comment next to its predicted CLASS value.
# NOTE(review): presumably 1 means spam and 0 means not spam -- confirm against
# the dataset description.
results = pd.DataFrame({'Comment': new_comments}).assign(Prediction=predictions)
print("\nPredictions for New Comments:\n", results)


Predictions for New Comments:
                                              Comment  Prediction
0  The way this video had a chokehold on me is crazy           0
1                         I am here to see the views           0
2                I love the content on this channel.           0
3                      Check out my new music video!           1
4  Get free iPhone by clicking the link: freeipho...           1
5         Subscribe to my channel for great content!           1
6  If you want to escape the matrix, check out my...           1
