In [None]:
url = 'https://raw.githubusercontent.com/AnanyaThyagarajan/Python-Projects/main/Movie%20Recommend/Tamil_movies_dataset.csv'
data_movies = pd.read_csv(url)


In [3]:
import requests
import tarfile
import os
import email
import email.policy
from email.parser import BytesParser
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle


def download__file(url, save_path):
    """Downloads a file from a URL to the specified local path."""
    response = requests.get(url)
    if response.status_code == 200:
        with open(save_path, 'wb') as file:
            file.write(response.content)
    else:
        print(f"Failed to download {url}")

def extract__tarfile(file_path, extract_path="."):
    """Extracts a tar.bz2 file to the specified directory."""
    if file_path.endswith(".tar.bz2"):
        with tarfile.open(file_path, "r:bz2") as file:
            file.extractall(path=extract_path)

def load_emails_from_directory(directory):
    """Loads all emails from the specified directory, parsing them for text content."""
    emails = []
    for filename in os.listdir(directory):
        filepath = os.path.join(directory, filename)
        if os.path.isfile(filepath):  # Check if the path is a file and not a directory
            with open(filepath, 'rb') as file:
                msg = email.parser.BytesParser(policy=email.policy.default).parse(file)
                email_body = msg.get_body(preferencelist=('plain', 'html')).get_content()
                emails.append(email_body)
    return emails

def load_sms_data(file_path):
    """Load SMS data from a plain text file."""
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) == 2:
                data.append(parts)
    return pd.DataFrame(data, columns=['label', 'message'])

In [4]:


# URLs of your GitHub-hosted files (replace these with the actual URLs)
urls = [
    'https://raw.githubusercontent.com/AnanyaThyagarajan/Spam-Detection/main/Dataset/20030228_spam_2.tar.bz2',
    'https://raw.githubusercontent.com/AnanyaThyagarajan/Spam-Detection/main/Dataset/20050311_spam_2.tar.bz2',
    'https://raw.githubusercontent.com/AnanyaThyagarajan/Spam-Detection/main/Dataset/20030228_easy_ham_2.tar.bz2',
    'https://raw.githubusercontent.com/AnanyaThyagarajan/Spam-Detection/main/Dataset/20030228_hard_ham.tar.bz2',
    'https://raw.githubusercontent.com/AnanyaThyagarajan/Spam-Detection/main/Dataset/SMSSpamCollection'
]
files = ['20030228_spam_2.tar.bz2', '20050311_spam_2.tar.bz2', '20021010_easy_ham.tar.bz2', '20021010_hard_ham.tar.bz2', 'sms_spam_collection.txt']
directories = ['20030228_spam_2', '20050311_spam_2', '20021010_easy_ham', '20021010_hard_ham']

# Download, extract and load data
data_frames = []
for url, file in zip(urls, files):
    download__file(url, file)
    if file.endswith(".tar.bz2"):
        directory = file.replace(".tar.bz2", "")
        extract__tarfile(file, directory)
        emails = load_emails_from_directory(directory)
        label = 1 if 'spam' in file else 0
        df = pd.DataFrame(emails, columns=['message'])
        df['label'] = label
        data_frames.append(df)
    elif file == 'sms_spam_collection.txt':
        sms_data = load_sms_data(file)
        sms_data['label'] = sms_data['label'].map({'ham': 0, 'spam': 1})
        data_frames.append(sms_data)

# Combine datasets
combined_data = pd.concat(data_frames, ignore_index=True)



In [5]:
combined_data.head(5)

Unnamed: 0,message,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [7]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(combined_data['message'], combined_data['label'], test_size=0.2, random_state=42)

# Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [8]:
# SVM Model
model = SVC(kernel='linear')
model.fit(X_train_tfidf, y_train)



In [9]:
# Predict and Evaluate
y_pred = model.predict(X_test_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.9820627802690582
Confusion Matrix:
 [[950   4]
 [ 16 145]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       954
           1       0.97      0.90      0.94       161

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [10]:
# Save model and vectorizer
pickle.dump(model, open('spam_svm_model.pkl', 'wb'))
pickle.dump(vectorizer, open('tfidf_vectorizer.pkl', 'wb'))