<a href="https://colab.research.google.com/github/Dharani1999/Word-embedding-techniques/blob/master/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import gensim
from gensim import corpora, models, similarities
from gensim.models import Word2Vec, TfidfModel, LsiModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models.fasttext import FastText
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    with open(filename, 'rb') as input:
        pickle_object = pickle.load(input)
    return  pickle_object

In [2]:
def data_module(name, location, output_location, dict_location, max_seq_length=100):
  dataset1 = pd.read_csv(location)
  dataset = dataset1.iloc[0:1000000,:]

  if name == 'word2vec':
    data_corpus, users_total = data_word2vec(dataset,max_seq_length)
  elif name == 'doc2vec':
    data_corpus, users_total = data_doc2vec(dataset,max_seq_length)
  elif name == 'lsi':
    data_corpus, users_total = data_lsi(dataset,max_seq_length,dict_loc=dict_location)
  elif name == 'tfidf':
    data_corpus, users_total = data_lsi(dataset,max_seq_length,dict_loc=dict_location)
  elif name == 'glove':
    data_corpus, users_total = data_word2vec(dataset,max_seq_length)
  elif name == 'hashing':
    data_corpus, users_total = data_hashing(dataset,max_seq_length)
  elif name == 'cooccur':
    data_corpus, users_total = data_hashing(dataset,max_seq_length)
  #elif name == 'fasttext':
   # data_corpus, users_total = data_word2vec(dataset,max_seq_length)
  #data_corpus.save('/content/drive/My Drive/Movielensdata/ml25m/data_corpus')
  #print(data_corpus)
  save_object(obj=data_corpus, filename=output_location)
  #return data_corpus, users_total

def data_word2vec(dataset,max_seq_length):
  dataset.sort_values(by=['userId','timestamp'],inplace=True)
  user_total = len(dataset['userId'].unique())
  
  #Selecting the most recent movies rated by each user and padding if necessary
  movie_list = []
  for i in range(user_total):
    list1 = []
    list1 = dataset.loc[dataset['userId'] ==(i+1),['movieId']]['movieId'].tolist()
    if len(list1)>max_seq_length:
      list1 = list1[(len(list1)-max_seq_length):]
    elif len(list1)<max_seq_length:
      list1 = list1+[0 for j in range((max_seq_length-len(list1)))]
      #for j in range((max_seq_length-len(list1))):
       # list1.append(0)
    movie_list.append(list1)
  
  #Selecting the most recent ratings rated by each user and padding if necessary
  rating_list =[]
  for i in range(user_total):
    list2 = []
    list2 = dataset.loc[dataset['userId'] ==(i+1),['rating']]['rating'].tolist()
    if len(list2)>max_seq_length:
      list2 = list2[(len(list2)-max_seq_length):]
    elif len(list2)<max_seq_length:
      list2 = list2+[0 for j in range((max_seq_length-len(list2)))]
      #for j in range((max_seq_length-len(list2))):
       # list2.append(0)
    rating_list.append(list2)
  
  #Creating user_id level transpose matrices
  movies_transpose = pd.DataFrame(data=movie_list,index=[i+1 for i in range(user_total)])
  movies_transpose.index.names = ['userId']
  #print(movies_transpose)

  ratings_transpose = pd.DataFrame(data=rating_list,index=[i+1 for i in range(user_total)])
  ratings_transpose.index.names = ['userId']
  #print(ratings_transpose)

  # Select features from original dataset to form a new dataframe 
  df1 = movies_transpose.iloc[:]# For each row, combine all the columns into one column
  df2 = df1.apply(lambda x: ','.join(x.astype(str)), axis=1)# Store them in a pandas dataframe
  df_clean = pd.DataFrame({'clean': df2})# Create the list of list format of the custom corpus for gensim modeling 
  sent = [row.split(',') for row in df_clean['clean']]

  return sent, user_total

def data_doc2vec(dataset,max_seq_length):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  tagged_data = []
  tags = []
  
  for i in range(user_total):
    tagged_data = tagged_data + [TaggedDocument(words=Sent[i], tags=[str(i)])]

  return tagged_data, user_total

def data_lsi(dataset,max_seq_length,dict_loc):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  dictionary = corpora.Dictionary(Sent)
  #print(dictionary.token2id)
  corpus = [dictionary.doc2bow(text) for text in Sent]
  dictionary.save(dict_loc)
  #corpus = np.array([[(id, freq) for id, freq in cp] for cp in corp])
  #corpus = gensim.matutils.Dense2Corpus(np.array(Sent),documents_columns=False)

  return corpus, user_total

def data_hashing(dataset,max_seq_length):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  corpus = [str(str(doc)[1:-1]) for doc in Sent]
  return corpus, user_total

In [3]:
def embedding_model(name,Data_location,model_save_location,matrix_location, vector_dims=10,Sg=1,size_window=3,topics=10,mini_count=1,num_workers=3,max_num_epochs = 100,alpha = 0.025,min_alpha=0.00025,dm=1, maxi_features=None):
  Data = load_object(Data_location)
  #print(Data)
  if name == 'word2vec':
    word2vec(input_data=Data, save_loc=model_save_location, vec_dims=vector_dims, SG=Sg, size_of_window=size_window, minimum_count=mini_count, no_workers=num_workers)
    #voc = model1.wv
    #words = list(model1.wv.vocab)
    #vectors = model1[model1.wv.vocab]
  elif name == 'doc2vec':
    doc2vec(input_data=Data, save_loc=model_save_location, vec_dims=vector_dims, alpha_=alpha, size_of_window=size_window, no_workers=num_workers, max_epochs=max_num_epochs,min_alpha_=min_alpha, minimum_count=mini_count, dms=dm)
  elif name == 'lsi':
    lsi(input_data=Data, save_loc=model_save_location, total_topics=topics)
  elif name == 'tfidf':
    tfidf(input_data=Data,save_loc=model_save_location)
  elif name == 'glove':
    glove_model(input_data=Data, vec_dims=vector_dims, size_of_window=size_window, save_loc=model_save_location, num_epochs=max_num_epochs, alpha_=0.05, num_threads=4)
  elif name == 'hashing':
    hashing(input_data=Data, vec_dims=vector_dims, save_loc=model_save_location)
  elif name == 'cooccur':
    co_occur(input_data=Data, maximum_features=maxi_features, save_loc=model_save_location, matrix_loc = matrix_location)
  #elif name == 'fasttext':
   # fast_text(input_data=Data, save_loc=model_save_location, vec_dims=vector_dims, SG=Sg, size_of_window=size_window, minimum_count=mini_count, no_workers=num_workers, alpha_=0.025)

def word2vec(input_data,save_loc,vec_dims,SG,size_of_window,minimum_count,no_workers):
  model = Word2Vec(input_data,min_count=minimum_count,size= vec_dims,workers=no_workers, window =size_of_window, sg = SG)
  model.save(save_loc)

def doc2vec(input_data,save_loc,vec_dims,alpha_,size_of_window,min_alpha_,minimum_count,dms,no_workers,max_epochs):
  model = Doc2Vec(size=vec_dims,
                alpha=alpha_, 
                min_alpha=min_alpha_,
                window = size_of_window,
                min_count=minimum_count,
                dm =dms)
  model.build_vocab(input_data)

  for epoch in range(max_epochs):
    #print('iteration {0}'.format(epoch))
    model.train(input_data, total_examples=model.corpus_count, epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
  model.save(save_loc)

def lsi(input_data,save_loc,total_topics):
  model = models.LsiModel(corpus=input_data, num_topics=total_topics)
  index = similarities.MatrixSimilarity(model[input_data])
  lsi_data = model[input_data]
  lsi_topics = model.print_topics()
  #for topic in lsi_topics:
    #print(topic)
  model.save(save_loc)

def tfidf(input_data,save_loc):
  model = models.TfidfModel(corpus=input_data)
  tfidf_data = model[input_data]

  tfidf_token= np.zeros((len(tfidf_data), 350), dtype=np.float64)
  tfidf_vals= np.zeros((len(tfidf_data), 350), dtype=np.float64)
 
  for i in range(len(input_data)):
    for k in range(len(list(tfidf_data)[i])):
      tfidf_token[i][k]=(list(tfidf_data))[i][k][0]
      tfidf_vals[i][k]=(list(tfidf_data))[i][k][1]
  tfidf_list=list(tfidf_data)
  #print(list(tfidf_data))
  model.save(save_loc)

def glove_model(input_data,vec_dims,size_of_window,save_loc,alpha_=0.05,num_epochs=30, num_threads=4):
  #importing the glove library
  corpus = Corpus() #training the corpus to generate the co occurence matrix which is used in GloVe
  corpus.fit(input_data, window=size_of_window)#creating a Glove object which will use the matrix created in the above lines to create embeddings
  #We can set the learning rate as it uses Gradient Descent and number of components
  glove = Glove(no_components=vec_dims, learning_rate=alpha_) 
  glove.fit(corpus.matrix, epochs=num_epochs, no_threads=4, verbose=True)
  glove.add_dictionary(corpus.dictionary)
  glove.save(save_loc)

def fast_text(input_data,save_loc,vec_dims,SG,size_of_window,minimum_count,no_workers,alpha_=0.025):
  model = FastText(min_count=minimum_count, alpha=alpha_, size= vec_dims, workers=no_workers, window =size_of_window)
  model.build_vocab(input_data)
  model.train(input_data, epochs=model.epochs, total_examples=model.corpus_count, total_words=model.corpus_total_words)
  model.save(save_loc)

def hashing(input_data,vec_dims,save_loc):
  model = HashingVectorizer(n_features=vec_dims)
  model.transform(input_data)
  #vectors = model.toarray()
  #vocab = model.get_feature_names()
  save_object(obj=model, filename=save_loc)

def co_occur(input_data, save_loc, matrix_loc, maximum_features):
  #print(input_data[3])
  model = CountVectorizer(ngram_range=(1,1),max_features=maximum_features, token_pattern= r"(?u)\b\w+\b")
  X = model.fit_transform(input_data)
  Xc = (X.T * X)
  Xc.setdiag(0)
  #cooccur = Xc.todense()
  names = model.get_feature_names() # This are the entity names (i.e. keywords)
  save_object(obj=names, filename='/content/drive/My Drive/Movielensdata/ml25m/cooccur/vocab')
  df = pd.DataFrame(data = Xc.toarray(), columns = names, index = names)
  save_object(obj=model, filename=save_loc)
  save_object(obj=df, filename=matrix_loc)

In [4]:
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ou20z5ex
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-ou20z5ex
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 6.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 38.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K  

In [5]:
data_module(name='cooccur',max_seq_length=100,location='/content/drive/My Drive/Movielensdata/ml25m/ratings.csv',output_location='/content/drive/My Drive/Movielensdata/ml25m/cooccur/data', dict_location='/content/drive/My Drive/Movielensdata/ml25m/tfidf/dict')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
embedding_model(name='cooccur',Data_location='/content/drive/My Drive/Movielensdata/ml25m/cooccur/data',model_save_location='/content/drive/My Drive/Movielensdata/ml25m/cooccur/co_occur', matrix_location='/content/drive/My Drive/Movielensdata/ml25m/cooccur/matrix', vector_dims=10,Sg=1,size_window=3,topics=10,mini_count=1,num_workers=3,max_num_epochs = 10,alpha = 0.025,min_alpha=0.00025,dm=1,maxi_features=None)

In [7]:
bert_vocab = load_object('/content/drive/My Drive/Movielensdata/ml25m/cooccur/vocab')

In [8]:
print(bert_vocab[0:10])

['0', '1', '10', '100', '1000', '100017', '100044', '100046', '100083', '1001']


In [9]:
VOC_FNAME = "vocab.txt"

with open(VOC_FNAME, "w") as fo:
  for token in bert_vocab:
    fo.write(token+"\n")

In [10]:
from pathlib import Path

#from tokenizers import ByteLevelBPETokenizer
from transformers import BertTokenizer
#paths = [str(x) for x in Path(".").glob("**/*.txt")]
paths = '/content/vocab.txt'
# Initialize a tokenizer
tokenizer = BertTokenizer(vocab_file=paths,  vocab_size=52_000, min_frequency=1)

# Customize training

In [11]:
!mkdir BERT
tokenizer.save_pretrained("BERT")

('BERT/vocab.txt', 'BERT/special_tokens_map.json', 'BERT/added_tokens.json')

In [12]:
#from tokenizers.implementations import BertTokenizer
from tokenizers.processors import BertProcessing


tokenizer = BertTokenizer(
    'BERT/vocab.txt', 'BERT/special_tokens_map.json', 'BERT/added_tokens.json')

In [13]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

AttributeError: ignored

In [14]:
tokenizer.encode("'5952', '2012', '0'")
#tokenizer.encode("name time place")

[None, None, 10607, None, None, None, 4855, None, None, None, 0, None, None]

In [15]:
tokenizer.encode("'5952', '2012', '0'").tokens

AttributeError: ignored

In [16]:
# Check that we have a GPU
!nvidia-smi

Wed Jul 15 07:59:42 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   71C    P8    34W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [17]:
import torch
torch.cuda.is_available()

True

In [18]:
from transformers import BertConfig

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [19]:
from transformers import BertTokenizerFast
max_len=512
tokenizer = BertTokenizerFast.from_pretrained("./BERT")

TypeError: ignored

In [None]:
from transformers import BertForMaskedLM

model = BertForMaskedLM(config=config)

In [None]:
model.num_parameters()
# => 84 million parameters

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="/content/vocab.txt",
    block_size=128,
)

In [None]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./BERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    prediction_loss_only=True,
)

In [None]:
trainer.train()

In [None]:
trainer.save_model("./BERT")

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./BERT",
    tokenizer="./BERT"
)

In [None]:
#106916
fill_mask("'51935', '69526' <mask>.")

In [None]:
fill_mask("'4709', '96079', '924', '148426', '168612' <mask>.")
#1265