## BOW Implementations

In [None]:
import pandas as pd
# Load the tab-separated dataset. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data, not as a header.
messages = pd.read_csv(
    'file.csv',
    sep='\t',
    names=['label', 'message'],
)

In [2]:
# Display the loaded frame to check the 'label' and 'message' columns.
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Data Cleaning and Preprocessing

In [15]:
import  re
import nltk
# Fetch the NLTK data used by the preprocessing cells: the stopword lists and
# the WordNet corpus (for the WordNet lemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91993/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\91993/nltk_data...


True

In [18]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Initialize Stemmer (only referenced by the commented-out stemming variant of
# the preprocessing loop)
ps = PorterStemmer()

# Initialize Lemmatizer (used by the preprocessing loop to normalize each word)
lemmatizer = WordNetLemmatizer()

In [None]:
corpus = []

# For every message: replace non-letter characters with spaces, lowercase,
# split into words, drop English stopwords and lemmatize what is left.

# Build the stopword lookup once, up front. Calling stopwords.words('english')
# inside the comprehension re-loads the word list for every single word, and
# membership tests against a list are linear; a set avoids both costs without
# changing which words are filtered out.
english_stopwords = set(stopwords.words('english'))

for message in messages['message']:
    # substitute everything that is not a letter with a space
    review = re.sub('[^a-zA-Z]', ' ', message)

    # lowercase the text
    review = review.lower()

    # turn it into a list of words
    review = review.split()

    # to use stemming instead of lemmatization:
    # review = [ps.stem(word) for word in review if word not in english_stopwords]

    # lemmatize the words and remove the stopwords
    review = [lemmatizer.lemmatize(word) for word in review if word not in english_stopwords]

    # join the words back into a single string
    review = ' '.join(review)

    # append the cleaned message to the corpus
    corpus.append(review)

#### Applying Bag of Words

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
# Limit the vocabulary to at most 2500 features.
cv = CountVectorizer( max_features=2500 )


# Some CountVectorizer parameters, with the default values noted for them.
# NOTE(review): scikit-learn spells the keyword `vocabulary`; the note below
# says "vocabulory". The string is a bare expression that is echoed as this
# cell's output, so it is left untouched here.
'''
lowercase = True
preprocessor = None
tokenizer = None
stop_words = None
max_features = None
vocabulory = None
binary = False        -> this is to convert binary BOW
'''

'\nlowercase = True\npreprocessor = None\ntokenizer = None\nstop_words = None\nmax_features = None\nvocabulory = None\nbinary = False        -> this is to convert binary BOW\n'

In [21]:
# Learn the vocabulary from the cleaned corpus and build the dense
# document-term count matrix (one row per message, one column per term).
X = cv.fit_transform(corpus).toarray()

In [22]:
# (number of messages, number of vocabulary terms)
X.shape

(5572, 2500)

In [23]:
# Inspect the count matrix (NumPy prints a truncated view of large arrays).
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

##### Learning a parameter of BOW, i.e. N-grams

- S1: The food is good .
- S2: The food is not good .

- vocabulary we obtain -> [ food   not   good ]

- so for s1: [ 1 0 1 ]
-    for s2: [ 1 1 1 ]
- This creates very little difference between the two vectors, even though the sentences have opposite meanings.


##### if adding bigrams

- vocabulary we obtain -> [ food   not   good   food_good   food_not   not_good ]

- s1: [1 0 1 1 0 0 ]
- s2: [1 1 1 0 1 1 ]
- This creates a much larger difference between the two vectors.





- Similarly, n-grams are used to distinguish more clearly between sentences.
- `ngram_range` = (1, 1) -> unigrams only
                = (1, 2) -> unigrams and bigrams
                = (1, 3) -> unigrams, bigrams and trigrams
                = (2, 2) -> bigrams only




In [28]:
from sklearn.feature_extraction.text import CountVectorizer

# Binary BOW: with binary=True each entry records presence (1) or absence (0)
# of a term instead of its count. The vocabulary is limited to 100 features.
cv = CountVectorizer(max_features=100 , binary=True)

In [29]:
# Refit on the same corpus with the binary, 100-feature vectorizer.
# This overwrites the X computed earlier in the notebook.
X = cv.fit_transform(corpus).toarray()

In [30]:
cv.vocabulary_   # maps each kept term (at most 100 here) to its column index in X

{'go': np.int64(23),
 'great': np.int64(27),
 'got': np.int64(26),
 'wat': np.int64(90),
 'ok': np.int64(59),
 'free': np.int64(19),
 'win': np.int64(94),
 'text': np.int64(79),
 'txt': np.int64(86),
 'say': np.int64(70),
 'already': np.int64(0),
 'think': np.int64(82),
 'life': np.int64(39),
 'hey': np.int64(30),
 'week': np.int64(92),
 'back': np.int64(6),
 'like': np.int64(40),
 'still': np.int64(75),
 'send': np.int64(72),
 'friend': np.int64(20),
 'prize': np.int64(65),
 'claim': np.int64(10),
 'call': np.int64(7),
 'mobile': np.int64(50),
 'co': np.int64(11),
 'home': np.int64(32),
 'want': np.int64(89),
 'today': np.int64(84),
 'cash': np.int64(9),
 'day': np.int64(15),
 'reply': np.int64(67),
 'www': np.int64(96),
 'right': np.int64(68),
 'take': np.int64(77),
 'time': np.int64(83),
 'message': np.int64(47),
 'com': np.int64(12),
 'oh': np.int64(58),
 'yes': np.int64(99),
 'make': np.int64(45),
 'way': np.int64(91),
 'dont': np.int64(17),
 'miss': np.int64(49),
 'ur': np.int64(

In [35]:
# Bigram-only binary BOW: ngram_range=(2, 2) keeps two-word sequences only,
# limited to 500 features, with presence/absence values instead of counts.
cv = CountVectorizer(
    max_features=500,
    binary=True,
    ngram_range=(2, 2),
)
X = cv.fit_transform(corpus).toarray()

In [36]:
# Term -> column-index mapping for the bigram vocabulary.
cv.vocabulary_

{'free entry': np.int64(123),
 'rate apply': np.int64(335),
 'per request': np.int64(301),
 'claim call': np.int64(54),
 'call claim': np.int64(23),
 'claim code': np.int64(55),
 'entitled update': np.int64(110),
 'update latest': np.int64(438),
 'latest colour': np.int64(212),
 'free call': np.int64(121),
 'call mobile': np.int64(31),
 'mobile update': np.int64(260),
 'chance win': np.int64(52),
 'win cash': np.int64(483),
 'reply hl': np.int64(341),
 'hl info': np.int64(180),
 'txt word': np.int64(433),
 'dont miss': np.int64(99),
 'let know': np.int64(218),
 'feel like': np.int64(115),
 'tell anything': np.int64(404),
 'mobile charged': np.int64(256),
 'go home': np.int64(146),
 'anything lor': np.int64(3),
 'call reply': np.int64(35),
 'nokia mobile': np.int64(282),
 'mobile free': np.int64(257),
 'free camcorder': np.int64(122),
 'please call': np.int64(311),
 'delivery tomorrow': np.int64(93),
 'lt gt': np.int64(235),
 'missed call': np.int64(254),
 'want go': np.int64(470),
 'fi