# Text Classification 

- Email (Spam/Ham)

### Step 1: Import Necessary Libraries

In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

### Step 2: Load the Dataset

In [2]:
data = pd.read_csv("spam.csv")

### Step 3: Data Preprocessing

In [4]:
# Download and prepare stop words
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Tokenization and text cleaning
data['Message'] = data['Message'].apply(lambda x: ' '.join(word.lower() for word in word_tokenize(x) if word.isalpha()))

# Stop words removal
data['Message'] = data['Message'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DEVARSHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\DEVARSHI/nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\share\\nltk_data'
    - 'c:\\Users\\DEVARSHI\\AppData\\Local\\Programs\\Python\\Python310\\lib\\nltk_data'
    - 'C:\\Users\\DEVARSHI\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


### Step 4: Feature Extraction

In [11]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['Message'])
X

<5572x8709 sparse matrix of type '<class 'numpy.float64'>'
	with 74098 stored elements in Compressed Sparse Row format>

### Step 5: Split the Data into Training and Testing Sets

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, data['Category'], test_size=0.2, random_state=42)

### Step 6: Build and Train the Model

In [7]:
model = MultinomialNB()
model.fit(X_train, y_train)

### Step 7: Model Evaluation

In [9]:
y_pred = model.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

### Step 8: Print the Results

In [10]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.9650224215246637
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98       966
        spam       1.00      0.74      0.85       149

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.91      1115
weighted avg       0.97      0.97      0.96      1115

