<a href="https://colab.research.google.com/github/Dharani1999/Word-embedding-techniques/blob/master/Code_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
import gensim
from gensim import corpora, models, similarities
from gensim.models import Word2Vec, TfidfModel, LsiModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def data_module(name,location,output_location,max_seq_length=100):
  dataset1 = pd.read_csv(location)
  dataset = dataset1.iloc[0:1000,:]

  if name == 'word2vec':
    data_corpus, users_total = data_word2vec(dataset,max_seq_length)
  elif name == 'doc2vec':
    data_corpus, users_total = data_doc2vec(dataset,max_seq_length)
  elif name == 'lsi':
    data_corpus, users_total = data_lsi(dataset,max_seq_length)
  elif name == 'tfidf':
    data_corpus, users_total = data_tfidf(dataset,max_seq_length)
  #data_corpus.save('/content/drive/My Drive/Movielensdata/ml25m/data_corpus')
  #print(data_corpus)
  save_object(obj=data_corpus, filename=output_location)
  #return data_corpus, users_total

def data_word2vec(dataset,max_seq_length):
  dataset.sort_values(by=['userId','timestamp'],inplace=True)
  user_total = len(dataset['userId'].unique())
  
  #Selecting the most recent movies rated by each user and padding if necessary
  movie_list = []
  for i in range(user_total):
    list1 = []
    list1 = dataset.loc[dataset['userId'] ==(i+1),['movieId']]['movieId'].tolist()
    if len(list1)>max_seq_length:
      list1 = list1[(len(list1)-max_seq_length):]
    elif len(list1)<max_seq_length:
      list1 = list1+[0 for j in range((max_seq_length-len(list1)))]
      #for j in range((max_seq_length-len(list1))):
       # list1.append(0)
    movie_list.append(list1)
  
  #Selecting the most recent ratings rated by each user and padding if necessary
  rating_list =[]
  for i in range(user_total):
    list2 = []
    list2 = dataset.loc[dataset['userId'] ==(i+1),['rating']]['rating'].tolist()
    if len(list2)>max_seq_length:
      list2 = list2[(len(list2)-max_seq_length):]
    elif len(list2)<max_seq_length:
      list2 = list2+[0 for j in range((max_seq_length-len(list2)))]
      #for j in range((max_seq_length-len(list2))):
       # list2.append(0)
    rating_list.append(list2)
  
  #Creating user_id level transpose matrices
  movies_transpose = pd.DataFrame(data=movie_list,index=[i+1 for i in range(user_total)])
  movies_transpose.index.names = ['userId']
  #print(movies_transpose)

  ratings_transpose = pd.DataFrame(data=rating_list,index=[i+1 for i in range(user_total)])
  ratings_transpose.index.names = ['userId']
  #print(ratings_transpose)

  # Select features from original dataset to form a new dataframe 
  df1 = movies_transpose.iloc[:]# For each row, combine all the columns into one column
  df2 = df1.apply(lambda x: ','.join(x.astype(str)), axis=1)# Store them in a pandas dataframe
  df_clean = pd.DataFrame({'clean': df2})# Create the list of list format of the custom corpus for gensim modeling 
  sent = [row.split(',') for row in df_clean['clean']]

  return sent, user_total

def data_doc2vec(dataset,max_seq_length):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  tagged_data = []
  tags = []
  
  for i in range(user_total):
    tagged_data = tagged_data + [TaggedDocument(words=Sent[i], tags=[str(i)])]

  return tagged_data, user_total

def data_lsi(dataset,max_seq_length):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  dictionary = corpora.Dictionary(sent)
  #print(dictionary.token2id)
  corp = [dictionary.doc2bow(text) for text in Sent]
  corpus = np.array([[(id, freq) for id, freq in cp] for cp in corp])
  #corpus = gensim.matutils.Dense2Corpus(np.array(Sent),documents_columns=False)

  return corpus, user_total

def data_tfidf(dataset,max_seq_length):
  Sent, user_total = data_word2vec(dataset,max_seq_length)
  dictionary = corpora.Dictionary(sent)
  #print(dictionary.token2id)
  corp = [dictionary.doc2bow(text) for text in Sent]
  corpus = np.array([[(id, freq) for id, freq in cp] for cp in corp])
  #corpus = gensim.matutils.Dense2Corpus(np.array(Sent),documents_columns=False)
  
  return corpus, user_total

In [3]:
def save_object(obj, filename):
    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)

def load_object(filename):
    with open(filename, 'rb') as input:
        pickle_object = pickle.load(input)
    return  pickle_object

In [4]:
def embedding_model(name,Data_location,model_save_location,vector_dims=50,Sg=1,size_window=3,topics=10,mini_count=1,num_workers=3,max_num_epochs = 100,alpha = 0.025,min_alpha=0.00025,dm=1):
  Data = load_object(Data_location)
  #print(Data)
  if name == 'word2vec':
    word2vec(input_data=Data,save_loc=model_save_location,vec_dims=vector_dims,SG=Sg,size_of_window=size_window,minimum_count=mini_count,no_workers=num_workers)
    #voc = model1.wv
    #words = list(model1.wv.vocab)
    #vectors = model1[model1.wv.vocab]
  elif name == 'doc2vec':
    doc2vec(input_data=Data,save_loc=model_save_location,vec_dims=vector_dims,alpha_=alpha,size_of_window=size_window,no_workers=num_workers,max_epochs=max_num_epochs,min_alpha_=min_alpha,minimum_count=mini_count,dms=dm)
  elif name == 'lsi':
    lsi(input_data=Data,save_loc=model_save_location,total_topics=topics)
  elif name == 'tfidf':
    tfidf(input_data=Data,save_loc=model_save_location)

def word2vec(input_data,save_loc,vec_dims,SG,size_of_window,minimum_count,no_workers):
  model = Word2Vec(input_data,min_count=minimum_count,size= vec_dims,workers=no_workers, window =size_of_window, sg = SG)
  model.save(save_loc)

def doc2vec(input_data,save_loc,vec_dims,alpha_,size_of_window,min_alpha_,minimum_count,dms,no_workers,max_epochs):
  model = Doc2Vec(size=vec_dims,
                alpha=alpha_, 
                min_alpha=min_alpha_,
                window = size_of_window,
                min_count=minimum_count,
                dm =dms)
  model.build_vocab(input_data)

  for epoch in range(max_epochs):
    #print('iteration {0}'.format(epoch))
    model.train(input_data, total_examples=model.corpus_count, epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha
  model.save(save_loc)

def lsi(input_data,save_loc,total_topics):
  model = models.LsiModel(corpus=input_data, num_topics=total_topics)
  index = similarities.MatrixSimilarity(model[input_data])
  lsi_data = model[input_data]
  lsi_topics = model.print_topics()
  #for topic in lsi_topics:
    #print(topic)
  model.save(save_loc)

def tfidf(input_data,save_loc):
  model = models.TfidfModel(corpus=input_data)
  tfidf_data = model[input_data]

  tfidf_token= np.zeros((len(tfidf_data), 350), dtype=np.float64)
  tfidf_vals= np.zeros((len(tfidf_data), 350), dtype=np.float64)
 
  for i in range(len(input_data)):
    for k in range(len(list(tfidf_data)[i])):
      tfidf_token[i][k]=(list(tfidf_data))[i][k][0]
      tfidf_vals[i][k]=(list(tfidf_data))[i][k][1]
  tfidf_list=list(tfidf_data)
  #print(list(tfidf_data))
  model.save('tfidf_model')

In [5]:
def scoring_module(name,model_loc,data):
  if name == 'word2vec':
    scores_list = score_word2vec(model_location=model_loc,input_data=data)
  if name == 'doc2vec':
    scores_list = score_doc2vec(model_location=model_loc,input_data=data)
  if name == 'lsi':
    scores_list = score_lsi(model_location=model_loc,input_data=data)
  if name == 'tfidf':
    scores_list = score_tfidf(model_location=model_loc,input_data=data)
  
  return scores_list

In [6]:
def score_word2vec(model_location,input_data):
  model = Word2Vec.load(model_location)
  scored_data = model[input_data] 
  return scored_data

In [7]:
def score_doc2vec(model_location,input_data):
  model = Doc2Vec.load(model_location)
  scored_data = model[input_data]
  return scored_data

In [8]:
def score_lsi(model_location,input_data):
  model = LsiModel.load(model_location)
  scored_data = model[input_data]
  return scored_data

In [9]:
def score_tfidf(model_location,input_data):
  model = TfidfModel.load(model_location)
  scored_data = model[input_data]
  return scored_data

In [10]:
data_module(name='word2vec',max_seq_length=100,location='/content/drive/My Drive/Movielensdata/ml25m/ratings.csv',output_location='/content/drive/My Drive/Movielensdata/ml25m/data_corpus/data')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [11]:
embedding_model(name='word2vec',Data_location='/content/drive/My Drive/Movielensdata/ml25m/data_corpus/data',model_save_location='/content/drive/My Drive/Movielensdata/ml25m/word2vec/w2v',vector_dims=50,Sg=1,size_window=3,topics=10,mini_count=1,num_workers=3,max_num_epochs = 100,alpha = 0.025,min_alpha=0.00025,dm=1)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [12]:
scoring_input = ['899', '2161', '3949', '5878', '1175', '1237', '8154', '2843', '7365', '4422', '6016', '1080', '3114', '3671', '2791', '1288', '1', '541', '2692', '7323', '8014', '6370', '4703', '5147']

In [13]:
scored_list = scoring_module(name='word2vec',model_loc='/content/drive/My Drive/Movielensdata/ml25m/word2vec/w2v',data=scoring_input)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
print(scored_list[0])

[ 0.00459596 -0.00241911  0.00612697 -0.01001128  0.00030483 -0.0042486
  0.00780082  0.00636277  0.00383855 -0.00856883 -0.00715745 -0.00479852
 -0.0006926   0.00282047  0.00216675  0.00863381  0.00512881 -0.00274457
  0.00404377 -0.005483    0.00731773  0.00408469  0.00272944  0.0080032
 -0.00147792 -0.00099985  0.00182403  0.00825111 -0.00581281  0.00939213
  0.0007721  -0.00075137 -0.00421297  0.00307526 -0.00747749 -0.0002278
 -0.00612344  0.00993508 -0.00213574  0.00380339  0.0031376   0.00041305
 -0.00645378  0.00744731 -0.00217777  0.00680043  0.0016865  -0.00305453
 -0.00209291  0.00616063]


In [15]:
scored_list

array([[ 0.00459596, -0.00241911,  0.00612697, ..., -0.00305453,
        -0.00209291,  0.00616063],
       [ 0.0084547 ,  0.00522159, -0.00043755, ..., -0.00181077,
         0.00937611,  0.00562771],
       [-0.00930926, -0.00691647, -0.00987944, ..., -0.00550414,
        -0.00242318, -0.00017445],
       ...,
       [-0.00838138, -0.00657606, -0.00526562, ..., -0.00740144,
         0.00497558,  0.00963203],
       [-0.00896329,  0.00793444,  0.00134797, ...,  0.00510431,
        -0.00557781, -0.00035319],
       [-0.00168067,  0.00494596,  0.0040232 , ..., -0.00881966,
        -0.00635621, -0.00876874]], dtype=float32)