In [4]:
import numpy as np
import pandas as pd


In [5]:
df=pd.read_csv('SMSSpamCollection',sep='\t')
df.columns=['Labels','Messages']
df['Labels']=df['Labels'].replace('ham','not spam')

In [6]:
df.head()

Unnamed: 0,Labels,Messages
0,not spam,Ok lar... Joking wif u oni...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,not spam,U dun say so early hor... U c already then say...
3,not spam,"Nah I don't think he goes to usf, he lives aro..."
4,spam,FreeMsg Hey there darling it's been 3 week's n...


In [7]:
import re #replit library  # simplify the language
import nltk  # will remove all words which are not helpful like a an the
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer # stemming the word which is helpful Ex loved===love means good
corpus=[] # will upload all clean msg after removing stopwords and punctuattions


for i in range(len(df)):
  msg=re.sub('[^a-zA-Z]',' ',df['Messages'][i])  # ^ means not  like anything which is not small a to z and capital A to Z by space
  msg=msg.lower() # lower all letters
  msg=msg.split() # split all words
  ps=PorterStemmer()
  all_stopwords=stopwords.words('english')

  msg=[ps.stem(word) for word in msg if not word in set(all_stopwords)]
  msg=' '.join(msg)
  corpus.append(msg)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\garga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1000)
X=cv.fit_transform(corpus).toarray()
y=df['Labels'].values


In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [17]:
from sklearn.linear_model import LogisticRegression
classifier=LogisticRegression()
classifier.fit(X_train,y_train)

In [18]:
y_pred=classifier.predict(X_test)
print(np.concatenate((y_pred.reshape(len(y_pred),1),y_test.reshape(len(y_test),1)),1))

[['not spam' 'not spam']
 ['not spam' 'not spam']
 ['not spam' 'not spam']
 ...
 ['not spam' 'not spam']
 ['not spam' 'not spam']
 ['not spam' 'not spam']]


In [19]:

from sklearn.metrics import confusion_matrix,accuracy_score ,precision_score
cm=confusion_matrix(y_test,y_pred)
print(cm)
accuracy_score=accuracy_score(y_test,y_pred)
print('The accuracy of the model is',accuracy_score*100,'%')
precision_score=precision_score(y_test,y_pred,pos_label='spam')
print('The precision of the model is',precision_score*100,'%')




[[941   2]
 [ 17 155]]
The accuracy of the model is 98.29596412556054 %
The precision of the model is 98.72611464968153 %


In [14]:
import os.path
from google.auth.transport.requests import Request
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build


SCOPES = ['https://www.googleapis.com/auth/gmail.modify']

def authenticate_gmail():
    creds = None
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w') as token:
            token.write(creds.to_json())
    service = build('gmail', 'v1', credentials=creds)
    return service


service = authenticate_gmail()


In [16]:
def check_inbox_for_spam(service, model, vectorizer, max_results=5):
    
    results = service.users().messages().list(userId='me', labelIds=['INBOX'], maxResults=max_results).execute()
    messages = results.get('messages', [])
    
    import base64
    ps = PorterStemmer()
    all_stopwords = set(stopwords.words('english'))
    


    for i, msg in enumerate(messages):
        msg_id=msg['id']
        msg_content = service.users().messages().get(userId='me', id=msg['id'], format='full').execute()

        
        headers = msg_content['payload'].get('headers', [])
        subject = next((h['value'] for h in headers if h['name'] == 'Subject'), 'No Subject')

        
        parts = msg_content['payload'].get('parts', [])
        body = ""
        if parts:
            for part in parts:
                if part['mimeType'] == 'text/plain':
                    body = part['body'].get('data', '')
                    break
        else:
            body = msg_content['payload']['body'].get('data', '')

        try:
            decoded_body = base64.urlsafe_b64decode(body).decode('utf-8')
        except:
            decoded_body = ""

        
        clean = re.sub('[^a-zA-Z]', ' ', decoded_body)
        clean = clean.lower().split()
        clean = [ps.stem(word) for word in clean if word not in all_stopwords]
        cleaned_text = ' '.join(clean)

        
        vector = vectorizer.transform([cleaned_text]).toarray()

        
        prediction = model.predict(vector)[0]
        label = ' NOT SPAM' if prediction == 'not spam' else ' SPAM'
        print(f"\n📩 Email #{i+1}")
        print(f"Subject: {subject}")
        print(f"Prediction: {label}")
        if prediction == 'spam':
            service.users().messages().modify(
                userId='me',
                id=msg_id,
                body={
                    'addLabelIds': ['SPAM'],
                    'removeLabelIds': ['INBOX']
                }
            ).execute()
            print("📤 Moved to Spam in Gmail ✅")
check_inbox_for_spam(service,classifier,cv)



📩 Email #1
Subject: 
Prediction:  NOT SPAM

📩 Email #2
Subject: 578456 is your verification code
Prediction:  NOT SPAM

📩 Email #3
Subject: Help strengthen the security of your Google Account
Prediction:  SPAM
📤 Moved to Spam in Gmail ✅

📩 Email #4
Subject: Get your first Daily Scratch Card, Anyahi
Prediction:  NOT SPAM

📩 Email #5
Subject: You’ve got new challenges, Anyahi!
Prediction:  NOT SPAM
