In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
data=pd.read_csv('spam.csv',encoding='latin-1')

In [3]:
data = data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)

In [4]:
data

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
data['v2'].duplicated().sum()

403

In [6]:
data=data.drop_duplicates()

In [7]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
from nltk import bigrams,trigrams
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [8]:
stop_words = set(stopwords.words('english'))
punctuations = set(string.punctuation)

In [9]:
def clean_text(text):
    # Tokenize text
    tokens = word_tokenize(text.lower())
    # Remove punctuation and stopwords
    filtered_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return filtered_tokens

# Apply the cleaning function
data.loc[:, 'cleaned_v2'] = data['v2'].apply(clean_text)

In [10]:
# Group by class and calculate word frequencies
def get_top_words(texts, top_n=10):
    all_words = [word for text in texts for word in text]
    word_freq = Counter(all_words)
    return word_freq.most_common(top_n)

class_word_freq = data.groupby('v1')['cleaned_v2'].apply(lambda x: get_top_words(x))

# Print the top words for each class
for class_label, top_words in class_word_freq.items():
    print(f"Top words for {class_label}:")
    for word, freq in top_words:
        print(f"  {word}: {freq}")
    print()

Top words for ham:
  u: 883
  get: 293
  gt: 288
  lt: 287
  go: 240
  got: 236
  know: 225
  like: 221
  ok: 215
  good: 212

Top words for spam:
  call: 302
  free: 191
  txt: 130
  u: 119
  ur: 119
  mobile: 105
  text: 104
  stop: 104
  claim: 96
  reply: 96



In [11]:
def get_top_bigrams(texts, top_n=10):
    all_bigrams = [bigram for text in texts for bigram in bigrams(text)]
    bigram_freq = Counter(all_bigrams)
    return bigram_freq.most_common(top_n)

class_bigram_freq = data.groupby('v1')['cleaned_v2'].apply(lambda x: get_top_bigrams(x))

# Print the top bigrams for each class
for class_label, top_bigrams in class_bigram_freq.items():
    print(f"Top bigrams for {class_label}:")
    for bigram, freq in top_bigrams:
        print(f"  {bigram}: {freq}")
    print()

Top bigrams for ham:
  ('lt', 'gt'): 254
  ('gon', 'na'): 58
  ('let', 'know'): 39
  ('r', 'u'): 35
  ('u', 'r'): 31
  ('take', 'care'): 29
  ('u', 'wan'): 29
  ('wan', 'na'): 28
  ('good', 'morning'): 23
  ('u', 'get'): 22

Top bigrams for spam:
  ('please', 'call'): 40
  ('guaranteed', 'call'): 21
  ('call', 'landline'): 21
  ('prize', 'guaranteed'): 20
  ('po', 'box'): 20
  ('urgent', 'mobile'): 17
  ('send', 'stop'): 17
  ('call', 'claim'): 16
  ('selected', 'receive'): 16
  ('contact', 'u'): 16



In [12]:
def get_top_trigrams(texts, top_n=10):
    all_trigrams = [trigram for text in texts for trigram in trigrams(text)]
    trigram_freq = Counter(all_trigrams)
    return trigram_freq.most_common(top_n)

# Group by the class column and calculate trigram frequencies
class_trigram_freq = data.groupby('v1')['cleaned_v2'].apply(lambda x: get_top_trigrams(x))

# Print the top trigrams for each class
for class_label, top_trigrams in class_trigram_freq.items():
    print(f"Top trigrams for {class_label}:")
    for trigram, freq in top_trigrams:
        print(f"  {trigram}: {freq}")
    print()

Top trigrams for ham:
  ('lt', 'decimal', 'gt'): 18
  ('happy', 'new', 'year'): 16
  ('like', 'lt', 'gt'): 14
  ('lt', 'gt', 'lt'): 14
  ('gt', 'lt', 'gt'): 14
  ('lt', 'gt', 'min'): 11
  ('hi', 'hi', 'hi'): 11
  ('sorry', 'call', 'later'): 8
  ('lt', 'gt', 'mins'): 8
  ('wat', 'time', 'u'): 8

Top trigrams for spam:
  ('prize', 'guaranteed', 'call'): 19
  ('call', 'land', 'line'): 16
  ('private', 'account', 'statement'): 15
  ('call', 'identifier', 'code'): 14
  ('identifier', 'code', 'expires'): 13
  ('guaranteed', 'call', 'land'): 13
  ('draw', 'shows', 'prize'): 12
  ('shows', 'prize', 'guaranteed'): 12
  ('account', 'statement', 'shows'): 12
  ('camcorder', 'reply', 'call'): 12



In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [14]:
data

Unnamed: 0,v1,v2,cleaned_v2
0,ham,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, think, goes, usf, lives, around, though]"
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,"[time, tried, contact, u, pound, prize, claim,..."
5568,ham,Will Ì_ b going to esplanade fr home?,"[b, going, esplanade, fr, home]"
5569,ham,"Pity, * was in mood for that. So...any other s...","[pity, mood, suggestions]"
5570,ham,The guy did some bitching but I acted like i'd...,"[guy, bitching, acted, like, interested, buyin..."


In [19]:
data['cleaned_v2'] = data['cleaned_v2'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the data
X_cbow = vectorizer.fit_transform(data['cleaned_v2'])

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zoom,zouk,zyada,åð,åòharry,åòit,åômorrow,åôrents,ìä,ìï
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5164,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
y=data['v1']

In [18]:
from sklearn.linear_model import LogisticRegression

In [21]:
x_train,x_test,y_train,y_test=train_test_split(X_cbow,y)
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [23]:
from sklearn.metrics import f1_score
y_pred = lr.predict(x_test)

f1_bow = f1_score(y_test, y_pred,pos_label='spam')

In [24]:
# Initialize CountVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_tfidf = vectorizer.fit_transform(data['cleaned_v2'])

In [25]:
x_train,x_test,y_train,y_test=train_test_split(X_tfidf,y)
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [26]:
y_pred = lr.predict(x_test)

f1_tfidf = f1_score(y_test, y_pred,pos_label='spam')

In [29]:
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Remove emails
    tokens = [token for token in tokens if '@' not in token]
    
    # Remove URLs
    tokens = [token for token in tokens if not (token.startswith('http') or token.startswith('www'))]
    
    # Remove HTML tags
    tokens = [token for token in tokens if not (token.startswith('<') and token.endswith('>'))]
    
    # Remove numbers
    tokens = [token for token in tokens if not token.isdigit()]
    
    # Remove punctuation
    tokens = [token.translate(str.maketrans('', '', string.punctuation)) for token in tokens]
    tokens = [token for token in tokens if token]  
    
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    return ' '.join(tokens)


In [30]:
x_p = [preprocess_text(text) for text in data['cleaned_v2']]

In [33]:
# Initialize CountVectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the data
X_tfidf_p = vectorizer.fit_transform(x_p)

In [34]:
x_train,x_test,y_train,y_test=train_test_split(X_tfidf_p,y)
lr=LogisticRegression()
lr.fit(x_train,y_train)

In [35]:
y_pred = lr.predict(x_test)

f1_tfidf_p = f1_score(y_test, y_pred,pos_label='spam')

In [36]:
f1_scores=[f1_bow,f1_tfidf,f1_tfidf_p]

In [37]:
f1_scores

[0.8933333333333333, 0.7662835249042146, 0.752851711026616]