In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
pd.set_option("max_colwidth",100)


In [2]:
documents = ["Gangs of Wasseypur is a great movie." , "The success of a movie depends on the performance of the actors.",
"There are no new movie releasing this week."]
print(documents)

['Gangs of Wasseypur is a great movie.', 'The success of a movie depends on the performance of the actors.', 'There are no new movie releasing this week.']


In [3]:
def preprocess(document):
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    document = " ".join(words)
    return document

documents = [preprocess(document) for document in documents]
print(documents)

['gangs wasseypur great movie .', 'success movie depends performance actors .', 'new movie releasing week .']


In [4]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(documents)
print(bow_model)

  (0, 4)	1
  (0, 3)	1
  (0, 9)	1
  (0, 2)	1
  (1, 0)	1
  (1, 6)	1
  (1, 1)	1
  (1, 8)	1
  (1, 4)	1
  (2, 10)	1
  (2, 7)	1
  (2, 5)	1
  (2, 4)	1


In [5]:
print(bow_model.toarray())

[[0 0 1 1 1 0 0 0 0 1 0]
 [1 1 0 0 1 0 1 0 1 0 0]
 [0 0 0 0 1 1 0 1 0 0 1]]


In [6]:
print(bow_model.shape)
print(vectorizer.get_feature_names())

(3, 11)
['actors', 'depends', 'gangs', 'great', 'movie', 'new', 'performance', 'releasing', 'success', 'wasseypur', 'week']


In [7]:
import pandas as pd
spam = pd.read_csv(".\SMSSpamCollection.txt", sep="\t", names=["label", "message"])
spam.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [8]:
messages  = spam.message
print(messages)

0       Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...
1                                                                             Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3                                                         U dun say so early hor... U c already then say...
4                                             Nah I don't think he goes to usf, he lives around here though
5       FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...
6                             Even my brother is not like to speak with me. They treat me like aids patent.
7       As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...
8       WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...
9       Had your mobile 11 m

In [9]:
message = [message for message in messages]
print(messages)

0       Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there g...
1                                                                             Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive ...
3                                                         U dun say so early hor... U c already then say...
4                                             Nah I don't think he goes to usf, he lives around here though
5       FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for ...
6                             Even my brother is not like to speak with me. They treat me like aids patent.
7       As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your call...
8       WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To c...
9       Had your mobile 11 m

In [10]:
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)
print(bow_model.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [11]:
print(bow_model.shape)
print(vectorizer.get_feature_names)

(5572, 8713)
<bound method CountVectorizer.get_feature_names of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)>


In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
word_lemmatizer = WordNetLemmatizer()

In [15]:
def preprocess(document, stem=True):
    document = document.lower()
    words = word_tokenize(document)
    words = [word for word in words if word not in stopwords.words("english")]
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [word_lemmatizer.lemmatize(word, pos='v') for word in words]
    
    document = " ".join(words)
    return document


In [17]:
messages = [preprocess(message, stem=True) for message in spam.message]
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [18]:
pd.DataFrame(bow_model.toarray(), columns= vectorizer.get_feature_names())

Unnamed: 0,00,000,000pe,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
print(vectorizer.get_feature_names())

['00', '000', '000pe', '008704050406', '0089', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06', '07', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07781482378', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '078498', '07880867867', '0789xxxxxxx', '07946746291', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '08', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '083', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', '08452810075over18', '0870', '08700435505150p', '0

In [20]:
messages = [preprocess(message, stem=False) for message in spam.message]
vectorizer = CountVectorizer()
bow_model = vectorizer.fit_transform(messages)

In [21]:
pd.DataFrame(bow_model.toarray(), columns= vectorizer.get_feature_names())

Unnamed: 0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,02,...,zhong,zindgi,zoe,zogtorius,zoom,zouk,zyada,èn,ú1,〨ud
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
print(vectorizer.get_feature_names())

['00', '000', '000pes', '008704050406', '0089', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703', '0578', '06', '07', '07008009200', '07046744435', '07090201529', '07090298926', '07099833605', '07123456789', '0721072', '07732584351', '07734396839', '07742676969', '07753741225', '0776xxxxxxx', '07781482378', '07786200117', '077xxx', '078', '07801543489', '07808', '07808247860', '07808726822', '07815296484', '07821230901', '078498', '07880867867', '0789xxxxxxx', '07946746291', '0796xxxxxx', '07973788240', '07xxxxxxxxx', '08', '0800', '08000407165', '08000776320', '08000839402', '08000930705', '08000938767', '08001950382', '08002888812', '08002986030', '08002986906', '08002988890', '08006344447', '0808', '08081263000', '08081560665', '0825', '083', '0844', '08448350055', '08448714184', '0845', '08450542832', '08452810071', '08452810073', '08452810075over18', '0870', '08700435505150p', '