<a href="https://colab.research.google.com/github/lzhenCloudAI/NLP-training/blob/master/Deep_Learning_and_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import pandas as pd
import os
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import gensim.downloader as api
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [2]:
################
# get the data #
################
# code source https://www.tensorflow.org/hub/tutorials/text_classification_with_tf_hub
pd.set_option('display.max_colwidth', 2000)

def load_directory_data(directory):
  data = {}
  data["sentence"] = []
  data["sentiment"] = []
  for file_path in os.listdir(directory):
    with tf.io.gfile.GFile(os.path.join(directory, file_path), "r") as f:
      data["sentence"].append(f.read())
      data["sentiment"].append(re.match("\d+_(\d+)\.txt", file_path).group(1))
  return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
  pos_df = load_directory_data(os.path.join(directory, "pos"))
  neg_df = load_directory_data(os.path.join(directory, "neg"))
  pos_df["polarity"] = 1
  neg_df["polarity"] = 0
  return pd.concat([pos_df, neg_df]).sample(frac=1).reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
  dataset = tf.keras.utils.get_file(
      fname="aclImdb.tar.gz", 
      origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz", 
      extract=True)
  
  train_df = load_dataset(os.path.join(os.path.dirname(dataset), "aclImdb", "train"))
  test_df = load_dataset(os.path.join(os.path.dirname(dataset),  "aclImdb", "test"))
  return train_df, test_df

train, test = download_and_load_datasets()

train['dataSplit']='train'
test['dataSplit']='test'
mydata = pd.concat([train, test], axis=0)

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
######################
# data preprocessing #
######################

# https://github.com/bryan-c-castillo/MLADS-TextEmbedding-Bert-Elmo-Tutorial

nltk.download('wordnet')
nltk.download('stopwords')
max_words = 128

text_http_re  = re.compile(r'http\S+')
text_digit_re = re.compile(r'[0-9]')
text_html_re  = re.compile(r'<[^>]{0,20}>')
text_punc_re  = re.compile('[' + re.escape('\'!"#$%&()*+-/:;<=>?@[\\]^_`{|}~') + ']')
text_ws_re    = re.compile('\s+')

def clean_text(text):
    text = text.lower()
    text = text_http_re.sub('', text)
    text = text_html_re.sub('', text)
    text = text_digit_re.sub(' ', text)
    text = text_punc_re.sub('', text)
    text = text_ws_re.sub(' ', text)
    text = text.strip()
    return text

def create_lemmatizer_spacy():
    nlp = spacy.load('en', disable=['parser', 'ner'])
    def lemmatize(text):
        return ' '.join([token.lemma_ for token in nlp(text)][0:max_words])
    
    return lemmatize

def create_lemmatizer_nltk():
    from nltk.stem import WordNetLemmatizer 
    lemmatizer = WordNetLemmatizer()
    
    def lemmatize(text):
        return ' '.join([lemmatizer.lemmatize(w) for w in text.split()][0:max_words])
    
    return lemmatize

# Setup a lemmatize function, spacy.load may fail on windows for en.
try:
    lemmatize = create_lemmatizer_spacy()
except:
    print("Using nltk for lemmatization.")
    lemmatize = create_lemmatizer_nltk()
            
def process_text(text):
    return lemmatize(clean_text(text))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Using nltk for lemmatization.


In [0]:
mydata['clean_review'] = mydata.sentence.apply(process_text)

In [0]:
#########
#TF-IDF #
######### 

In [0]:
def tfidf_vec(mydata):
    tfidf = TfidfVectorizer(min_df=100, max_df=0.2, ngram_range=(1,1))
    #min_df: When building the vocabulary ignore terms that have a document frequency strictly lower than the given threshold.
    #max_df: When building the vocabulary ignore terms that have a document frequency strictly higher than the given threshold. 
    #ngram_range: unigram
    tfidf.fit(mydata["clean_review"])
    features = tfidf.transform(mydata["clean_review"])
    return pd.DataFrame(features.todense(), columns = tfidf.get_feature_names())


In [0]:
def xgboostFun(train_tfidf, test_tfidf) : 
  train_tfidf2=train_tfidf.drop(['dataSplit', 'polarity'], axis=1)
  xgb = XGBClassifier(max_depth=6)
  xgb.fit(train_tfidf2, train_tfidf.polarity)

  test_tfidf2=test_tfidf.drop(['dataSplit', 'polarity'], axis=1)
  predictions = xgb.predict_proba(test_tfidf2)
  return roc_auc_score(test.polarity, predictions[:,1])

In [7]:
tfidf_feature=tfidf_vec(mydata)
tfidf_feature.head(2)

Unnamed: 0,00,000,10,100,101,11,12,13,13th,14,15,16,17,18,19,1930,1930s,1939,1940,1940s,1945,1950,1950s,1959,1960,1960s,1968,1969,1970,1970s,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,...,wrenching,wrestling,wretched,write,writer,writers,writes,writing,written,wrong,wrote,wtf,wwii,www,ya,yard,yawn,yeah,year,years,yelling,yellow,yep,yes,yesterday,yet,york,young,younger,youngest,your,yours,yourself,youth,youthful,youtube,zero,zombie,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.10078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
mydata=mydata.reset_index()
tfidf_feature['polarity']=mydata['polarity']
tfidf_feature['dataSplit']=mydata['dataSplit']
train_tfidf=tfidf_feature.loc[tfidf_feature['dataSplit']=='train']
test_tfidf=tfidf_feature.loc[tfidf_feature['dataSplit']=='test']
# too many features and need to do feature selection

In [0]:
# this takes a long time and I will suggest you run the code after the tutorial. 
tfidfAUC=xgboostFun(train_tfidf, test_tfidf) #auc0.91

In [0]:
###########################################
# word2vec - could be useful in labelling #
###########################################

In [0]:
def word2vecFun(mydata, wordToCheck): 
  # this is to train word2vec on movie data to get similar word
  # to get the word format
  wordList=list()
  lines=mydata.clean_review.values.tolist()
  for line in lines: 
    token=line.lower().split()
    wordList.append(token)

  model=gensim.models.Word2Vec(sentences=wordList, size=50, window=5, min_count=10)
  words=list(model.wv.vocab)
  out=model.wv.most_similar(wordToCheck)
  return out

In [51]:
word2vecFun(mydata, 'outstanding')

  if np.issubdtype(vec.dtype, np.int):


[('excellent', 0.889236330986023),
 ('exceptional', 0.8805518746376038),
 ('superb', 0.8580080270767212),
 ('stellar', 0.8410035371780396),
 ('masterful', 0.829212486743927),
 ('phenomenal', 0.8242124319076538),
 ('excellent,', 0.8192064166069031),
 ('magnificent', 0.8180720806121826),
 ('marvelous', 0.8142855167388916),
 ('terrific', 0.8116052150726318)]

In [52]:
# # load pre-trained word-vectors from gensim-data
word_vectors = api.load("glove-wiki-gigaword-100")  
print(word_vectors.similarity('happy', 'glad'))
result = word_vectors.similar_by_word("angry")
print(result)
sim = word_vectors.n_similarity(['replace', 'disk'], ['config', 'firmware'])
print(sim)



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  if np.issubdtype(vec.dtype, np.int):


0.783336
[('furious', 0.8143535256385803), ('outraged', 0.7746474146842957), ('enraged', 0.7717769742012024), ('irate', 0.7348275184631348), ('frustrated', 0.7237215042114258), ('angered', 0.7120067477226257), ('frightened', 0.7118747234344482), ('shocked', 0.69240403175354), ('fearful', 0.6863806843757629), ('crowd', 0.6827539205551147)]
0.30528376


In [0]:
###########
# doc2vec #
###########

In [0]:
def label_sentences(df):
   docs=df.clean_review
   tagged_data = [TaggedDocument(words=d.lower().split(), tags=[str(i)]) for i, d in enumerate(docs)]
   return tagged_data

def get_vectors(model, corpus, size):
    # get vectors from doc2vec 
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'SENT_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

def doc2vecFun(mydata): 
  sen = label_sentences(mydata) 
  model = Doc2Vec(sen, dm=0, vector_size=20, window=5, min_count=3)  
  train_vecs_dbow = get_vectors(model, mydata['clean_review'], 20) 
  out=pd.DataFrame(train_vecs_dbow)
  out.columns=['var'+str(i) for i in range(out.shape[1])]
  return out

In [0]:
out=doc2vecFun(mydata)
out['polarity']=mydata['polarity']
out['dataSplit']=mydata['dataSplit']
train_doc2vec=out.loc[out['dataSplit']=='train']
test_doc2vec=out.loc[out['dataSplit']=='test']

In [0]:
doc2vecAUC=xgboostFun(train_doc2vec, test_doc2vec)
#auc 0.95, pretty good!

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)