# <b><u>PREPROCESSING II: SENTENCE TO VECTOR and WORD TO VECTOR</u></b>

In [1]:
import copy
import re

import nltk
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Do not truncate column contents when a DataFrame is displayed, so long
# strings and list-valued cells are shown in full.
pd.set_option('display.max_colwidth', None)

In [2]:
# Single WordNet lemmatizer instance, reused by processing1 for every word.
lemmatizer = nltk.stem.WordNetLemmatizer()

def processing1(sentence):
    """Normalize one sentence so it can be turned into a vector.

    Steps, in order: remove every character that is not an ASCII letter,
    a digit or whitespace; lowercase; split on whitespace; drop NLTK
    English stopwords; lemmatize each remaining word with the module-level
    ``lemmatizer``; join the words back with single spaces.

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    str
        The cleaned sentence. An empty string is returned when no word
        survives (empty input, punctuation only, or stopwords only).
    """
    # Raw string: `\s` must reach the regex engine unchanged instead of being
    # parsed as an (invalid) string escape sequence.
    sentence = re.sub(r'[^a-zA-Z0-9\s]', '', sentence)  # remove punctuation and symbols

    # Same case everywhere so "India" and "india" count as one word.
    words = sentence.lower().split()
    if not words:
        # Nothing to filter or lemmatize, so the stopword corpus is not needed.
        return ''

    # Build the stopword set once per call instead of once per word.
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # Drop unimportant stopwords, then reduce each kept word to its base form.
    kept = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(kept)


In [3]:
# Sample multi-sentence text; it is split into sentences and vectorized
# further down in the notebook.
paragraph = '''
I have three visions for India. 
In 3000 years of our history, people from all over the world have come and invaded us, captured our lands, conquered our minds. 
From Alexander onwards, the Greeks, the Turks, the Moguls, the Portuguese, the British, the French, the Dutch, all of them came and looted us, took over what was ours. 
Yet we have not done this to any other nation. We have not conquered anyone. 
We have not grabbed their land, their culture, their history and tried to enforce our way of life on them.
Why? Because we respect the freedom of others.
'''

## <b><u>BAG OF WORDS (sentence to vector)</u></b>

In [4]:
# Bag of Words: every unique word found across all sentences becomes one
# feature (column), and each sentence is encoded as the number of times each
# of those words occurs in it.

# Extract all sentences from the paragraph. The raw sentences are kept under
# their own name so the unprocessed text stays available after cleaning.
raw_sentences = nltk.sent_tokenize(paragraph)

# Processing 1: clean every sentence. A sentence made up only of stopwords
# becomes an empty string and therefore an all-zero vector.
sentences = [processing1(sentence) for sentence in raw_sentences]

# Converting sentences to vectors using the Bag Of Words strategy.
# CountVectorizer is referenced through its explicit import rather than via
# the bare `sklearn` package, whose submodules are not guaranteed to be loaded
# by `import sklearn` alone.
cv = CountVectorizer()
vectors = cv.fit_transform(sentences).toarray().tolist()

# One row per sentence: the cleaned text next to its count vector.
df = pd.DataFrame(
    {
        "Preprocessed Sentence": sentences,
        "Bag of words Vector Representation": vectors,
    }
)

df

Unnamed: 0,Preprocessed Sentence,Bag of words Vector Representation
0,three vision india,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]"
1,3000 year history people world come invaded u captured land conquered mind,"[1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0]"
2,alexander onwards greek turk mogul portuguese british french dutch came looted u took,"[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0]"
3,yet done nation,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,conquered anyone,"[0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
5,grabbed land culture history tried enforce way life,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0]"
6,,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
7,respect freedom others,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
