# Natural Language Processing using Artificial Neural Networks

> “In God we trust. All others must bring data.” – W. Edwards Deming, statistician

# Word Embeddings

### What?
Convert words to vectors in a high-dimensional space. Each dimension denotes an aspect of the word, such as gender or the type of object it refers to.

### Why?
By converting words to vectors we can capture relations between words: the more similar two words are along a dimension, the closer their scores in that dimension.

### Example
_W(green) = (1.2, 0.98, 0.05, ...)_

_W(red) = (1.1, 0.2, 0.5, ...)_

Here the vector values of _green_ and _red_ are very similar in the first dimension because both are colours. The values in the second dimension are very different because, in the training data, _red_ might be associated with something negative while _green_ is associated with something positive.

By vectorizing we are indirectly building different kinds of relations between words.

In [None]:
from gensim.models import word2vec
from gensim.models.word2vec import Word2Vec
from keras.datasets import imdb

# Reading blog posts from the data directory

In [1]:
import os
import pickle

In [None]:
# Directory, relative to the working directory, that holds the blog-post dumps.
# os.pardir keeps the path portable instead of hard-coding the '../' separator.
DATA_DIRECTORY = os.path.join(os.pardir, 'data')
# A parenthesised single-argument print behaves the same on Python 2 and 3.
print(DATA_DIRECTORY)

In [None]:
# Empty placeholders for the two corpora of blog posts.
male_posts = []
# Named `female_posts` to match the variable the rest of the notebook uses.
female_posts = []

In [None]:
# The two dumps are pickles despite their ".txt" extension, hence binary mode.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
male_path = os.path.join(DATA_DIRECTORY, "male_blog_list.txt")
female_path = os.path.join(DATA_DIRECTORY, "female_blog_list.txt")

with open(male_path, "rb") as male_file:
    male_posts = pickle.load(male_file)

with open(female_path, "rb") as female_file:
    female_posts = pickle.load(female_file)

In [None]:
# Number of posts loaded for each group (female first, then male).
print len(female_posts)
print len(male_posts)

#### Remove stop words
```python
from nltk.corpus import stopwords
cachedStopWords = stopwords.words("english")
```

In [None]:
# Keep only the posts that have content; zero-length entries are discarded.
filtered_male_posts = [post for post in male_posts if len(post) != 0]
filtered_female_posts = [post for post in female_posts if len(post) != 0]

In [None]:
# Number of posts that survived the empty-post filter (female first, then male).
print len(filtered_female_posts)
print len(filtered_male_posts)

In [None]:
# Coerce every post to unicode text so both corpora share one string type.
filtered_female_posts = [unicode(post) for post in filtered_female_posts]
filtered_male_posts = [unicode(post) for post in filtered_male_posts]

# Combined corpus: female posts first, followed by male posts.
posts = filtered_female_posts + filtered_male_posts
type(posts[0])

In [None]:
# Size of the combined corpus and the type of one sample entry.
print len(posts)
print type(posts[100])

## Word2Vec

In [None]:
# Tokenise the first 100 posts on whitespace; each post becomes one "sentence".
sentences = [post.split() for post in posts[:100]]

# 200-dimensional vectors; min_count=1 keeps every word, however rare.
w2v = Word2Vec(size=200, min_count=1)
w2v.build_vocab(sentences)
# build_vocab only sets up the vocabulary and initial weights; without a
# training pass the similarity queries that follow would compare untrained vectors.
w2v.train(sentences)

In [None]:
w2v.vocab

In [None]:
w2v.similarity('I', 'My')

In [None]:
# Show one raw post, then compare the vectors of 'ring' and 'husband'.
# NOTE(review): presumably both words occur in the posts used for the vocabulary - verify.
print posts[5]
w2v.similarity('ring', 'husband')

## Doc2Vec

The same technique used in word2vec is extended to documents: we do everything word2vec does and, in addition, learn a vector for each document.

In [None]:
import numpy as np

In [None]:
# Gender target vector: a 0 for every male post, then a 1 for every female post.
male_targets = np.zeros(len(filtered_male_posts))
female_targets = np.ones(len(filtered_female_posts))
concatenate_array = np.concatenate((male_targets, female_targets))

In [None]:
len(concatenate_array)

#### Create the train/test split (20% held out for testing)

In [None]:
from sklearn.cross_validation import train_test_split

# Stack the posts male-first so they line up with the labels in concatenate_array,
# then hold out 20% of the rows for testing.
all_posts = np.concatenate((filtered_male_posts, filtered_female_posts))
x_train, x_test, male_female_train, male_female_test = train_test_split(
    all_posts, concatenate_array, test_size=0.2)

In [None]:
x_train.shape[0],male_female_train.shape[0],x_train

In [None]:
# `import gensim` binds the package name that the attribute lookup below relies on.
import gensim
from gensim.models import doc2vec
from gensim.models.doc2vec import LabeledSentence

# Explicit alias for the labelled-document wrapper used when building the corpus.
LabeledSentence = gensim.models.doc2vec.LabeledSentence

In [None]:
def labelizeReviews(reviews,label_type):
    """Wrap each non-empty review in a ``LabeledSentence``.

    Every review is tagged with the single label ``'<label_type>_<index>'``,
    where ``index`` is its position in ``reviews``. Zero-length reviews are
    skipped, so the result can be shorter than ``reviews`` and the label
    indices can have gaps.

    ``reviews`` is an iterable of posts, either raw text or already tokenised.
    ``label_type`` is the label prefix, e.g. ``'TRAIN'`` or ``'TEST'``.
    Returns a list of ``LabeledSentence`` objects in input order.
    """
    labelized = []
    for i, review in enumerate(reviews):
        if len(review) == 0:
            continue
        # LabeledSentence expects a sequence of word tokens. A raw string would
        # be iterated character by character, so split text on whitespace and
        # pass anything already tokenised through unchanged.
        words = review.split() if hasattr(review, 'split') else review
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(words, [label]))
    return labelized

In [None]:
# Wrap the train and test posts as labelled documents tagged TRAIN_<i> / TEST_<i>.
x_train_label = labelizeReviews(x_train,'TRAIN')
x_test_label = labelizeReviews(x_test,'TEST')

# The two counts differ only if labelizeReviews skipped zero-length posts.
print len(x_train_label),len(x_train)

### We have labelized reviews, now building Doc2Vec models using Distributed Memory (DM) and Distributed Bag of Words (DBoW)

* **DM** - Given the paragraph vector and the surrounding context words, predict the next word
* **DBoW** - Given only the paragraph vector, predict words sampled from that paragraph

In [None]:
import random

In [None]:
# Dimensionality of each Doc2Vec feature vector (passed as size= to both models).
size = 300

In [None]:
# Doc2Vec parameters:
#   dm        - training algorithm: 1 (default) is distributed memory, otherwise DBoW.
#   size      - dimensionality of the feature vectors.
#   window    - maximum distance between the current and predicted word within a sentence.
#   alpha     - initial learning rate (drops linearly to zero as training progresses).
#   seed      - seed for the random number generator.
#   min_count - ignore all words with total frequency lower than this.
#   sample    - threshold for randomly downsampling higher-frequency words;
#               default is 0 (off), a useful value is 1e-5.
#   workers   - number of worker threads used for training.
#   hs        - if 1 (default), hierarchical softmax is used for training (else set to 0).
#   negative  - if > 0, negative sampling is used; the value is how many "noise words"
#               are drawn (usually between 5-20).
#   dm_mean   - if 0 (default), sum the context word vectors; if 1, use the mean.
#               Only applies when dm is used.

# Doc2Vec is reached through the imported `doc2vec` module, so this cell does
# not depend on a top-level `gensim` name being bound.
model_dm = doc2vec.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3,
                           negative=5, workers=20)
model_dbow = doc2vec.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3,
                             negative=5, workers=20, dm=0)

In [None]:
# Both models learn their vocabulary from the train and test documents together.
all_labelled_docs = np.concatenate((x_train_label, x_test_label))
model_dm.build_vocab(all_labelled_docs)
model_dbow.build_vocab(all_labelled_docs)

In [None]:
# Array form of the training documents so they can be reordered by an index permutation.
x_train_label_np = np.array(x_train_label)

In [None]:
x_train_label_np.shape[0]

In [None]:
# Ten passes over the training documents, reshuffled before every pass;
# both models see the same shuffled order within a pass.
n_train_docs = x_train_label_np.shape[0]
for epoch in range(10):
    perm = np.random.permutation(n_train_docs)
    shuffled_train_docs = x_train_label_np[perm]
    model_dm.train(shuffled_train_docs)
    model_dbow.train(shuffled_train_docs)

In [None]:
def getVecs(model,corpus,size):
    """Stack the model's vector for every document in ``corpus`` into one 2-D array.

    ``model`` is indexed with the first entry of each document's ``labels``.
    Each looked-up vector is reshaped to a ``(1, size)`` row, and the rows are
    concatenated in corpus order.
    """
    rows = []
    for doc in corpus:
        first_label = doc.labels[0]
        row = np.array(model[first_label]).reshape((1, size))
        rows.append(row)
    return np.concatenate(rows)

In [None]:
# One row per training document from each of the two models.
train_vecs_dm = getVecs(model_dm,x_train_label_np,size)
train_vecs_dbow = getVecs(model_dbow,x_train_label_np,size)

In [None]:
# Place the DM and DBoW vectors of each training document side by side.
train_vecs = np.hstack((train_vecs_dm,train_vecs_dbow))

In [None]:
train_vecs.shape

In [None]:
# Array form of the test documents so they can be reordered by an index permutation.
x_test_label_np = np.array(x_test_label)

In [None]:
# Ten further passes over the test documents, reshuffled before every pass,
# so that both models also hold vectors for the TEST_* labels.
n_test_docs = x_test_label_np.shape[0]
for epoch in range(10):
    perm = np.random.permutation(n_test_docs)
    shuffled_test_docs = x_test_label_np[perm]
    model_dm.train(shuffled_test_docs)
    model_dbow.train(shuffled_test_docs)

In [None]:
# One row per test document from each of the two models.
test_vecs_dm = getVecs(model_dm,x_test_label_np,size)
test_vecs_dbow = getVecs(model_dbow,x_test_label_np,size)

In [None]:
# Place the DM and DBoW vectors of each test document side by side.
test_vecs = np.hstack((test_vecs_dm,test_vecs_dbow))
# Shapes of the two test matrices and of the train/test target vectors.
print test_vecs_dm.shape,test_vecs_dbow.shape,male_female_train.shape,male_female_test.shape

#### We have all the vectors now; next we train the classifier

In [None]:
from sklearn.linear_model import SGDClassifier

#### SGD classifier with L1 regularization

In [None]:
# Linear classifier trained by SGD with logistic loss and an L1 penalty.
lrl1 = SGDClassifier(loss='log',penalty='l1')

In [None]:
lrl1.fit(train_vecs,male_female_train)

In [None]:
# Accuracy of the L1 model on the held-out test vectors.
print 'Test Accuracy : %.2f' %lrl1.score(test_vecs,male_female_test)

#### SGD classifier with L2 regularization

In [None]:
# Linear classifier trained by SGD with logistic loss and an L2 penalty.
lrl2 = SGDClassifier(loss='log',penalty='l2')

In [None]:
lrl2.fit(train_vecs,male_female_train)

In [None]:
# Accuracy of the L2 model on the held-out test vectors.
print 'Test Accuracy : %.2f' %lrl2.score(test_vecs,male_female_test)

# 5 fold cross validation

In [None]:
from sklearn.cross_validation import KFold
from sklearn import metrics
import pandas as pd

In [None]:
# Five shuffled folds over the row indices of train_vecs.
sgd_l1_kf = KFold(n=train_vecs.shape[0],n_folds=5,shuffle=True)

In [None]:
sgd_l1_kf

In [None]:
# DataFrame view of the training vectors; the KFold loop selects its rows with .loc.
trained_vecs_df = pd.DataFrame(train_vecs)
# NOTE(review): target_np is not referenced anywhere else in the visible notebook.
target_np = np.array(male_female_train)

In [None]:
trained_vecs_df.head()

In [None]:
trained_vecs_df.shape

In [None]:
sgd_l1_kf

In [None]:
# Cross-validated ROC AUC for the L1-regularised logistic SGD classifier.
sgd_l1_metrics = []
for train_index, validate_index in sgd_l1_kf:
    # Feature rows and gender targets for this fold.
    sample_train = trained_vecs_df.loc[train_index]
    sample_validate = trained_vecs_df.loc[validate_index]
    sample_train_target = male_female_train[train_index]
    sample_validate_target = male_female_train[validate_index]

    sgd_l1 = SGDClassifier(loss='log', penalty='l1')
    sgd_l1.fit(sample_train, sample_train_target)

    # predict() returns hard 0/1 labels, which would make the 0.5 threshold a
    # no-op and reduce the ROC to a single operating point. Score with the
    # probability of class 1 instead; loss='log' supports predict_proba.
    sgd_l1_scores = sgd_l1.predict_proba(sample_validate)[:, 1]

    # Unthresholded scores stay available under this name after the loop.
    sgd_l1_predicted_copy = sgd_l1_scores.copy()

    # Hard labels: a probability above 0.5 means class 1, otherwise class 0.
    sgd_l1_predicted = (sgd_l1_scores > 0.5).astype(float)

    sgd_l1_analysis = pd.concat(
        [pd.Series(sample_validate_target), pd.Series(sgd_l1_predicted)], axis=1)
    sgd_l1_analysis.columns = ['actual', 'prediction']

    # AUC is a ranking metric, so it is computed from the continuous scores.
    sgd_l1_auc = metrics.roc_auc_score(sgd_l1_analysis.actual, sgd_l1_scores)

    sgd_l1_metrics.append(sgd_l1_auc)
    

In [None]:
# Mean ROC AUC across the folds; despite the _df suffix, .mean() yields a pandas Series here.
sgd_l1_metrics_df = pd.DataFrame(sgd_l1_metrics).mean()

In [None]:
import matplotlib
from matplotlib import pyplot

In [None]:
%matplotlib inline

In [None]:
# ROC curve built from `sgd_l1_analysis` and `sgd_l1_predicted_copy`, which at this
# point hold the values left over from the final cross-validation fold only.
fpr, tpr, thresholds = metrics.roc_curve(sgd_l1_analysis.actual, sgd_l1_predicted_copy)
pyplot.plot(fpr, tpr)
# Diagonal reference line from (0, 0) to (1, 1).
pyplot.plot([0,1],[0,1])