In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import re
import nltk
# Fetch the NLTK stop-word lists used in the cleaning step; NLTK reports when the package is already up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ashwi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load the tab-separated SMS file. It has no header row, so the two column names are supplied here.
# NOTE(review): read_csv applies its default quote handling; if any message contains an unbalanced
# double quote, adjacent rows could be merged - verify the row count against the dataset's documentation.
message = pd.read_csv(
    'SMSSpamCollection.txt',
    sep='\t',
    names=['label', 'message'],
)

In [5]:
# Show the loaded frame (pandas elides the middle rows) to check that 'label' and 'message' parsed as expected.
message

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
#data cleaning and pre-processing: lower-case the text, remove stop words, then reduce each word to its stem
#(a Porter stemmer is used here, not a lemmatizer)

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps= PorterStemmer()



In [7]:
# Build `corpus`: one cleaned, stemmed string per SMS message, in the same order as the rows of `message`.

# Load the English stop-word list once and keep it as a set. Calling stopwords.words('english')
# inside the comprehension rebuilt the list for every single token and tested membership against
# a list, which made this cell needlessly slow.
# Stop words are very common words that carry little signal for the model, e.g. is, the, a, an.
stop_words = set(stopwords.words('english'))

corpus = []
for text in message['message']:
    # Replace every character that is not an ASCII letter (digits, punctuation, accented letters)
    # with a space, so neighbouring words are not glued together.
    letters_only = re.sub('[^a-zA-Z]', ' ', text)

    # Lower-case, then split on whitespace to get a list of words; without the split the whole
    # sentence would be treated as a single token.
    words = letters_only.lower().split()

    # Drop stop words and stem only the remaining words. Stemming maps inflected forms onto a
    # common stem to shrink the bag-of-words vocabulary; Porter stems need not be dictionary
    # words (e.g. crazy -> crazi, entry -> entri).
    stems = [ps.stem(word) for word in words if word not in stop_words]

    # Re-join the stems with single spaces so each message is again one string.
    corpus.append(' '.join(stems))



In [8]:
corpus[1:10]
#each element of corpus is one cleaned message; this slice shows the messages at positions 1 to 9


['ok lar joke wif u oni',
 'free entri wkli comp win fa cup final tkt st may text fa receiv entri question std txt rate c appli',
 'u dun say earli hor u c alreadi say',
 'nah think goe usf live around though',
 'freemsg hey darl week word back like fun still tb ok xxx std chg send rcv',
 'even brother like speak treat like aid patent',
 'per request mell mell oru minnaminungint nurungu vettam set callertun caller press copi friend callertun',
 'winner valu network custom select receivea prize reward claim call claim code kl valid hour',
 'mobil month u r entitl updat latest colour mobil camera free call mobil updat co free']

In [9]:
# First cleaned message, for comparison with the raw text of row 0 displayed earlier.
corpus[0]

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

# Creating BOW model

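CountVectorizer is a class in the sklearn.feature_extraction.text module of the scikit-learn library in Python. It converts a collection of text documents into a matrix of token counts, also known as a Bag of Words (BoW) model: each row is a document, each column is a vocabulary term, and each cell holds the number of times that term occurs in that document.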

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
CVec= CountVectorizer(max_features=3000) #max_features caps the vocabulary at the 3000 most frequent terms in the corpus
#this line only configures the vectorizer; the count matrix is produced by fit_transform in the next cell,
#with one row per message and one column per kept term
#for a binary BOW (presence/absence instead of counts), pass binary=True


In [12]:
a=CVec.fit_transform(corpus).toarray() 

#fit_transform learns the vocabulary from corpus and returns a sparse count matrix, e.g.
#<Compressed Sparse Row sparse matrix of dtype 'int64' with 41601 stored elements and shape (5572, 3000)>
#toarray() converts it to a dense NumPy array so the values can be displayed and wrapped in a DataFrame
a

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [13]:
a.shape #(rows, columns): 5572 rows, one per message, and 3000 columns, one per kept vocabulary term

(5572, 3000)

In [14]:
#wrap the dense count matrix in a DataFrame for tabular display; the column labels are the integer term indices
at= pd.DataFrame(a)
at

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2990,2991,2992,2993,2994,2995,2996,2997,2998,2999
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
#the terms behind the 3000 columns of the BOW matrix; position k in this array is the term counted in column k


feature_names = CVec.get_feature_names_out()

# Print the feature names
print(feature_names)

['aah' 'aathi' 'abi' ... 'yuo' 'yup' 'zed']


# Creating N-gram model

In [16]:
CVec.vocabulary_
#dict mapping each kept term to its column index in the BOW matrix

{'go': np.int64(1035),
 'point': np.int64(1931),
 'crazi': np.int64(565),
 'avail': np.int64(169),
 'bugi': np.int64(352),
 'great': np.int64(1070),
 'world': np.int64(2937),
 'la': np.int64(1410),
 'buffet': np.int64(351),
 'cine': np.int64(472),
 'got': np.int64(1055),
 'wat': np.int64(2842),
 'ok': np.int64(1787),
 'lar': np.int64(1422),
 'joke': np.int64(1349),
 'wif': np.int64(2895),
 'oni': np.int64(1796),
 'free': np.int64(944),
 'entri': np.int64(766),
 'wkli': np.int64(2921),
 'comp': np.int64(520),
 'win': np.int64(2901),
 'fa': np.int64(831),
 'cup': np.int64(579),
 'final': np.int64(886),
 'tkt': np.int64(2618),
 'st': np.int64(2434),
 'may': np.int64(1565),
 'text': np.int64(2560),
 'receiv': np.int64(2094),
 'question': np.int64(2046),
 'std': np.int64(2445),
 'txt': np.int64(2700),
 'rate': np.int64(2071),
 'appli': np.int64(115),
 'dun': np.int64(701),
 'say': np.int64(2230),
 'earli': np.int64(707),
 'hor': np.int64(1181),
 'alreadi': np.int64(71),
 'nah': np.int64(168

In [30]:
Cvec2= CountVectorizer(max_features=100, ngram_range=(2,3)) #ngram_range=(2,3) extracts bigrams and trigrams (sequences of 2 to 3 consecutive words); unigrams are excluded
#n=1 is a unigram, n=2 a bigram, n=3 a trigram

In [31]:
#dense n-gram count matrix: one row per message, one column per kept bigram/trigram
N= Cvec2.fit_transform(corpus).toarray()
N

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
Cvec2.vocabulary_ #dict mapping each of the 100 kept n-grams to its column index in the N-gram matrix

{'free entri': np.int64(31),
 'claim call': np.int64(17),
 'call claim': np.int64(3),
 'free call': np.int64(30),
 'chanc win': np.int64(16),
 'txt word': np.int64(91),
 'let know': np.int64(54),
 'go home': np.int64(34),
 'pleas call': np.int64(70),
 'lt gt': np.int64(60),
 'miss call': np.int64(62),
 'want go': np.int64(97),
 'like lt': np.int64(55),
 'like lt gt': np.int64(56),
 'sorri call': np.int64(83),
 'call later': np.int64(11),
 'sorri call later': np.int64(84),
 'ur award': np.int64(92),
 'hi hi': np.int64(46),
 'call custom': np.int64(4),
 'custom servic': np.int64(24),
 'cash prize': np.int64(15),
 'call custom servic': np.int64(5),
 'po box': np.int64(71),
 'tri contact': np.int64(89),
 'draw show': np.int64(28),
 'show prize': np.int64(81),
 'prize guarante': np.int64(75),
 'guarante call': np.int64(41),
 'valid hr': np.int64(95),
 'show prize guarante': np.int64(82),
 'prize guarante call': np.int64(76),
 'select receiv': np.int64(78),
 'privat account': np.int64(72),
 

## N-grams

### Advantages:

Captures Context: N-grams preserve the order of words within each short sequence, allowing for the analysis of phrases and capturing local context better than single-word (unigram) BoW or TF-IDF features.

Flexible Granularity: By varying the 'n' in n-grams (e.g., unigrams, bigrams), users can tailor the model to capture different levels of detail in text.

### Disadvantages:
Increased Dimensionality: The use of n-grams can lead to a high-dimensional feature space, which may complicate analysis and require more computational resources.

Data Sparsity: With larger n-values, many n-grams may be rare or absent in the training data, leading to challenges in generalization

# Creating TF-IDF model

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [34]:
tf= TfidfVectorizer(max_features=1000) #keep only the 1000 most frequent terms of the corpus as TF-IDF features


In [35]:
X_tf= tf.fit_transform(corpus).toarray() #dense matrix of TF-IDF weights, one row per message
tf.vocabulary_
#dict mapping each kept term to its column index in X_tf

{'go': np.int64(339),
 'point': np.int64(654),
 'crazi': np.int64(174),
 'avail': np.int64(50),
 'great': np.int64(352),
 'world': np.int64(975),
 'la': np.int64(448),
 'got': np.int64(348),
 'wat': np.int64(936),
 'ok': np.int64(602),
 'lar': np.int64(453),
 'joke': np.int64(433),
 'wif': np.int64(957),
 'free': np.int64(309),
 'entri': np.int64(252),
 'wkli': np.int64(968),
 'comp': np.int64(149),
 'win': np.int64(960),
 'cup': np.int64(178),
 'final': np.int64(291),
 'st': np.int64(807),
 'may': np.int64(517),
 'text': np.int64(849),
 'receiv': np.int64(694),
 'question': np.int64(678),
 'std': np.int64(812),
 'txt': np.int64(897),
 'rate': np.int64(685),
 'appli': np.int64(38),
 'dun': np.int64(232),
 'say': np.int64(738),
 'earli': np.int64(234),
 'alreadi': np.int64(23),
 'nah': np.int64(568),
 'think': np.int64(855),
 'goe': np.int64(341),
 'usf': np.int64(916),
 'live': np.int64(484),
 'around': np.int64(42),
 'though': np.int64(859),
 'freemsg': np.int64(310),
 'hey': np.int64

In [38]:
X_tf= pd.DataFrame(X_tf) #note: this rebinds X_tf from a NumPy array to a DataFrame
X_tf

#TF-IDF entries are real-valued weights rather than the integer counts of the BOW matrix; a larger weight marks a term that is frequent in the message but rare across the corpus

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## TF- IDF :

### Advantages:
Identifies Important Terms: TF-IDF effectively highlights significant terms in a document by weighing their importance based on frequency and rarity across a corpus

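Language-Independent: The weighting scheme itself works across different languages without language-specific adjustments, although the tokenization, stop-word list, and stemming applied beforehand are language-specific.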

Scalable: Capable of handling large datasets, making it suitable for extensive text analysis applications

### Disadvantages:
No Context Understanding: TF-IDF does not capture the semantic meaning or context of words, treating them as independent entities.

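Ignores Word Order: It fails to consider the sequence of words, which can lead to misinterpretation of phrases (e.g., "not friendly" vs. "friendly": the negation is scored as a separate, unrelated token, or disappears entirely when "not" is removed as a stop word).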


Sensitivity to Rare Terms: Very rare terms can receive misleadingly high importance scores, skewing results