# NATURAL LANGUAGE PROCESSING

## Assignment 3 - Vectorization

Group members : 

1. Bharath Vedartham (01FB16ECS439)
2. Aniket Kumar (01FB16ECS057)
3. Dinesh Sathyanarayanan (01FB16ECS114)

In [11]:
# Import necessary modules
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import unicodedata
from nltk.corpus import stopwords

In [12]:
# Pre-processing functions

def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english') and word!='ie':
            new_words.append(word)
    return new_words

def lemmatize_words(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word)
        lemmas.append(lemma)
    return lemmas

def normalize(words):
    words = remove_stopwords(words.split())
    words = lemmatize_words(words)
    return " ".join(words)


In [13]:
# Load the dataset into pd dataframe
df = pd.read_csv('dataset.csv', usecols=['text', 'stars'])

In [14]:
# Pre-processing to serve purpose of binary classifcation
df[df.stars!=3]
df.loc[df.stars == 1, 'stars'] = 0
df.loc[df.stars == 5, 'stars'] = 1
df.loc[df.stars == 4, 'stars'] = 1
df.loc[df.stars == 2, 'stars'] = 0

In [15]:
# Equal number of samples for each class
first_part = df[df.stars == 0]
second_part = df[df.stars == 1]
second_part = second_part[0:len(first_part)]

# Concat to single frame
frames = [first_part, second_part]
result = pd.concat(frames)

# Case 1 : Baseline case - Linear classification using extracted features

In [16]:
from textacy import lexicon_methods
import textacy
from textblob import TextBlob
import numpy as np
import progressbar
textacy.lexicon_methods.download_depechemood(data_dir=None, force=False)

In [17]:
# Preprocess using textacy and get 8 emotion valence
# features[0] - valence
# features[1] - sentiment polarity

features = np.zeros((len(first_part)*2, 9))
label = np.zeros(len(first_part)*2)
allreviews = []
print("***Feature Extraction***")

***Feature Extraction***


In [18]:
bar = progressbar.ProgressBar(maxval=len(first_part)*2, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

bar.start()
for i, row in enumerate(result.iterrows()):
    bar.update(i+1)
    text=textacy.preprocess.remove_punct(row[1]['text'])
    text = textacy.preprocess.replace_numbers(text, replace_with='*NUMBER*')
    text = textacy.preprocess.replace_emails(text, replace_with='*EMAIL*')
    text = textacy.preprocess.replace_phone_numbers(text, replace_with='*PHONE*')
    text = textacy.preprocess.replace_urls(text, replace_with='*URL*')
    text=textacy.preprocess.replace_currency_symbols(text,replace_with='INR')
    text = textacy.preprocess_text(text,lowercase=True)
    allreviews.append(normalize(text))
    doc = textacy.Doc(text, lang='en_core_web_sm')
    ddict=lexicon_methods.emotional_valence(doc, threshold=0.0, dm_data_dir=None, dm_weighting='normfreq')    
    features[i][0] = ddict['AFRAID'];features[i][1] = ddict['AMUSED']
    features[i][2] = ddict['ANGRY'];features[i][3] = ddict['ANNOYED']
    features[i][4] = ddict['DONT_CARE'];features[i][5] = ddict['HAPPY']
    features[i][6] = ddict['INSPIRED'];features[i][7] = ddict['SAD']
    features[i][8] = TextBlob(text).sentiment.polarity
    label[i] = row[1]['stars']
bar.finish()




In [19]:
print(features.shape)

(3352, 9)


In [20]:
print(label.shape)

(3352,)


In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Train data - 67%   Test data - 33%
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.33, random_state=42)

In [23]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2245, 9) (2245,) (1107, 9) (1107,)


In [24]:
# SVM linear classification using sklearn 
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [25]:
# Classification accuracy on test data
print(clf.score(X_test, y_test))

0.7949412827461608


# Case 2 : Linear classification using sparse vector representation

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
# allreviews - list of text documents

vectorizer = TfidfVectorizer()
vectorizer.fit(allreviews)

# Print vocabulary
print(len(vectorizer.vocabulary_))
print(vectorizer.idf_)

15802
[8.42446357 7.73131639 7.1717006  ... 8.42446357 8.42446357 8.42446357]


In [28]:
# encode each review
vectors = []
for i in allreviews:
    vectors.append(vectorizer.transform([i]))

In [29]:
print(len(vectors))

3352


In [30]:
vect = []
for i in vectors:
    vect.append(i.toarray()[0])
print(len(vect))

3352


In [31]:
vect = np.array(vect)
print(vect.shape)

(3352, 15802)


In [32]:
# Train data - 67%   Test data - 33%
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(vect, label, test_size=0.33, random_state=42)

In [33]:
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(2245, 15802) (2245,) (1107, 15802) (1107,)


In [34]:
# SVM linear classification using sklearn 
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [35]:
# Classification accuracy on test data
print(clf.score(X_test, y_test))

0.8699186991869918


# Case 3 : Dense vector strategy with SVD

In [36]:
from sklearn.decomposition import TruncatedSVD

In [37]:
svd = TruncatedSVD(n_components=100, n_iter=7, random_state=42)

In [38]:
result = svd.fit_transform(X_train)
# result is of shape (2245, 100)
# so dimensionality is reduced to 100 from 15802

In [39]:
result.shape

(2245, 100)

In [40]:
# SVM linear classification using sklearn 
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(result, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [41]:
print(clf.score(svd.fit_transform(X_test), y_test))

0.6187895212285456


In [42]:
# Try with more dimensions
svd1 = TruncatedSVD(n_components=1000, random_state=42)

In [43]:
print(X_train.shape)
result = svd1.fit_transform(X_train)
# result is of shape (2245, 1000)
# so dimensionality is reduced to 1000 from 15802
print(result.shape)

(2245, 15802)
(2245, 1000)


In [44]:
# SVM linear classification using sklearn 
from sklearn.svm import SVC
clf = SVC(gamma='auto', kernel='linear')
clf.fit(result, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [45]:
print(X_test.shape)
print(X_train.shape)
print(svd1.fit_transform(X_test).shape)
print(clf.score(svd1.fit_transform(X_test), y_test))


(1107, 15802)
(2245, 15802)
(1107, 1000)
0.6196928635953026


# Case 4 : Non-linear classification using Neural Networks

In [46]:
import gensim
import progressbar
from gensim.models import Word2Vec
import numpy as np

In [47]:
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [48]:
word_vectors = model.wv

  """Entry point for launching an IPython kernel.


In [49]:
print(word_vectors['hello'].shape)

(300,)


In [50]:
bar = progressbar.ProgressBar(maxval=len(allreviews), \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
bar.start()
emb = np.zeros([len(allreviews),300])
for i1,i in enumerate(allreviews):
    bar.update(i1+1)
    words = i.split()
    for j in words:
        if(j not in model.vocab):
            pass
        else:
            emb[i1] = np.sum([model.wv[j], emb[i1]], axis=0)
bar.finish()

  if sys.path[0] == '':


In [51]:
emb.shape

(3352, 300)

In [73]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(emb, label, test_size=0.33, random_state=42)
X_train.shape

(2245, 300)

In [53]:
# All keras modules for ANN
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Conv1D, GlobalAveragePooling1D, MaxPooling1D

In [54]:
# Create a sequential model for ANN classification (non-linear)

# Input dimensions - 300 (from google vectors)
model = Sequential()

# First layer (input layer)
model.add(Dense(64, input_dim=300, activation='relu'))

# Dropout after each layer for reducing overfitting
model.add(Dropout(0.5))

# Second layer (hidden)
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

# Output layer (1 neuron for binary classification)
model.add(Dense(1, activation='sigmoid'))

In [55]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 64)                19264     
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 23,489
Trainable params: 23,489
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.fit(X_train,y_train, epochs = 20, batch_size = 128)
loss_and_metrics = model.evaluate(X_test, y_test, batch_size=128)
print("\nAccuracy is {}".format(loss_and_metrics[1]*100))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Accuracy is 84.19150863559588


# Case 4 : Non-linear classification using CNN

In [74]:
X_train.shape

(2245, 300)

In [75]:
X_test.shape

(1107, 300)

In [76]:
X_train = X_train.reshape(2245,20,15,1)
X_test = X_test.reshape(1107,20,15,1)

In [81]:
from keras.models import Sequential
from keras.layers import Dense, Conv2D, Flatten
#create model
model = Sequential()
#add model layers
model.add(Conv2D(64, kernel_size=3, activation='relu', input_shape=(20,15,1)))
model.add(Conv2D(32, kernel_size=3, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='softmax'))

In [82]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [83]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_7 (Conv2D)            (None, 18, 13, 64)        640       
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 16, 11, 32)        18464     
_________________________________________________________________
flatten_4 (Flatten)          (None, 5632)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 5633      
Total params: 24,737
Trainable params: 24,737
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=3)