In [2]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the tab-separated dataset. Column names are supplied explicitly, so
# pandas treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    "BagOfWordsData.csv",
    sep="\t",
    header=None,
    names=["label", "message"],
)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Text preprocessing
# Download NLTK's "stopwords" corpus so that stopwords.words(...) can be used
# by the cleaning step in a later cell. The call's return value is what this
# cell displays.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
portStem = PorterStemmer()

# Build the stop-word lookup once, outside the loop. Evaluating
# stopwords.words("english") inside the comprehension rebuilds the list for
# every single token, and membership tests on a list are linear; a set gives
# the same results with constant-time lookups.
stop_words = set(stopwords.words("english"))

# Cleaning data: for each message keep letters only, lowercase, tokenize on
# whitespace, drop English stop words, stem what is left, and re-join the
# stems into one space-separated string.
corpus = []
# Iterate over the column directly instead of df["message"][i], which is a
# label lookup and only works when the index is the default 0..n-1 range.
for message in df["message"]:
    # Every character outside a-z / A-Z (digits, punctuation, accented
    # letters such as "ü") is replaced by a space before tokenizing.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    stems = [portStem.stem(word) for word in tokens if word not in stop_words]
    corpus.append(" ".join(stems))

In [None]:
# Bag of Words Implementation

# binary=True records presence/absence of a term instead of its count;
# max_features=100 keeps only the 100 most frequent terms across the corpus.
cv = CountVectorizer(binary=True, max_features=100)
bow_sparse = cv.fit_transform(corpus)
X = bow_sparse.toarray()
cv.vocabulary_  # mapping of each kept term to its column index in X

{'go': np.int64(22),
 'great': np.int64(25),
 'got': np.int64(24),
 'wat': np.int64(90),
 'ok': np.int64(56),
 'free': np.int64(18),
 'win': np.int64(94),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(67),
 'alreadi': np.int64(0),
 'think': np.int64(80),
 'hey': np.int64(28),
 'week': np.int64(92),
 'back': np.int64(3),
 'like': np.int64(38),
 'still': np.int64(73),
 'send': np.int64(69),
 'even': np.int64(15),
 'friend': np.int64(19),
 'prize': np.int64(62),
 'claim': np.int64(7),
 'call': np.int64(4),
 'mobil': np.int64(47),
 'co': np.int64(8),
 'home': np.int64(30),
 'want': np.int64(89),
 'today': np.int64(82),
 'cash': np.int64(6),
 'day': np.int64(12),
 'repli': np.int64(64),
 'www': np.int64(96),
 'right': np.int64(65),
 'thank': np.int64(78),
 'take': np.int64(75),
 'time': np.int64(81),
 'use': np.int64(87),
 'messag': np.int64(44),
 'oh': np.int64(55),
 'ye': np.int64(97),
 'make': np.int64(42),
 'way': np.int64(91),
 'feel': np.int64(16),
 'dont': np.int64(14

In [None]:
# NGrams
# ngram_range=(1, 2) makes the vocabulary include both single words and
# two-word sequences; the 200 most frequent of them are kept.
# Note: this rebinds `cv` and `X` from the bag-of-words cell.
cv = CountVectorizer(ngram_range=(1, 2), binary=True, max_features=200)
ngram_sparse = cv.fit_transform(corpus)
X = ngram_sparse.toarray()

In [11]:
# Show the term -> column-index mapping of the most recently fitted `cv`.
cv.vocabulary_

{'go': np.int64(51),
 'great': np.int64(55),
 'got': np.int64(54),
 'wat': np.int64(183),
 'ok': np.int64(117),
 'free': np.int64(47),
 'win': np.int64(189),
 'st': np.int64(153),
 'may': np.int64(94),
 'text': np.int64(161),
 'txt': np.int64(174),
 'dun': np.int64(37),
 'say': np.int64(138),
 'alreadi': np.int64(0),
 'think': np.int64(164),
 'live': np.int64(83),
 'around': np.int64(5),
 'hey': np.int64(65),
 'week': np.int64(186),
 'word': np.int64(191),
 'back': np.int64(9),
 'like': np.int64(81),
 'still': np.int64(155),
 'send': np.int64(140),
 'even': np.int64(40),
 'per': np.int64(120),
 'friend': np.int64(48),
 'custom': np.int64(31),
 'prize': np.int64(132),
 'claim': np.int64(22),
 'call': np.int64(14),
 'hour': np.int64(70),
 'mobil': np.int64(101),
 'month': np.int64(103),
 'co': np.int64(23),
 'gonna': np.int64(52),
 'home': np.int64(68),
 'soon': np.int64(150),
 'want': np.int64(182),
 'talk': np.int64(159),
 'tonight': np.int64(172),
 'today': np.int64(168),
 'cash': np.