In [2]:
import re
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Load the tab-separated dataset. Column names are supplied via `names`, so
# pandas treats the first line of the file as data rather than as a header.
df = pd.read_csv(
    "BagOfWordsData.csv",
    sep="\t",
    names=["label", "message"],
)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Text preprocessing
# Fetch the NLTK resources used by the cleaning step below: the WordNet data
# that WordNetLemmatizer looks words up in, and the stop-word lists. Without
# "wordnet" the lemmatizer raises LookupError on a machine that has never
# downloaded it. Both calls are cheap when the packages are already present.
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\anand\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
wnl = WordNetLemmatizer()

# Build the stop-word lookup once. stopwords.words() returns a list, so calling
# it inside the per-token filter repeats that work for every token and makes
# each membership test a linear scan; a set gives constant-time lookups.
stop_words = set(stopwords.words("english"))

# Cleaning data
corpus = []
# Iterate over the column values directly instead of indexing by label with
# range(len(df)), which only works while the index is the default 0..n-1.
for message in df["message"]:
    # Replace every non-letter character with a space, lowercase, then split
    # on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', message).lower().split()
    # Drop stop words and lemmatize what remains.
    lemmas = [wnl.lemmatize(token) for token in tokens if token not in stop_words]
    corpus.append(" ".join(lemmas))

In [7]:
# TF-IDF features built from the cleaned corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# Cap the vocabulary at 100 terms via max_features.
tfidf = TfidfVectorizer(max_features=100)

# Fit on the corpus and convert the resulting sparse matrix to a dense array.
X = tfidf.fit_transform(corpus).toarray()

# Mapping of each retained term to its column index in X
# (at most max_features entries).
tfidf.vocabulary_

{'go': np.int64(23),
 'great': np.int64(27),
 'got': np.int64(26),
 'wat': np.int64(89),
 'ok': np.int64(58),
 'free': np.int64(19),
 'win': np.int64(93),
 'text': np.int64(77),
 'txt': np.int64(85),
 'say': np.int64(68),
 'already': np.int64(0),
 'think': np.int64(80),
 'life': np.int64(39),
 'hey': np.int64(30),
 'week': np.int64(91),
 'back': np.int64(6),
 'like': np.int64(40),
 'still': np.int64(73),
 'send': np.int64(70),
 'friend': np.int64(20),
 'prize': np.int64(63),
 'claim': np.int64(10),
 'call': np.int64(7),
 'mobile': np.int64(49),
 'co': np.int64(11),
 'home': np.int64(32),
 'want': np.int64(88),
 'today': np.int64(82),
 'cash': np.int64(9),
 'day': np.int64(15),
 'reply': np.int64(65),
 'www': np.int64(96),
 'right': np.int64(66),
 'take': np.int64(75),
 'time': np.int64(81),
 'message': np.int64(46),
 'com': np.int64(12),
 'oh': np.int64(57),
 'yes': np.int64(99),
 'make': np.int64(44),
 'way': np.int64(90),
 'dont': np.int64(17),
 'miss': np.int64(48),
 'ur': np.int64(