<a href="https://colab.research.google.com/github/Dharani1999/Word-embedding-techniques/blob/master/Code_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import gensim
from gensim import corpora, models, similarities
from gensim.models import Word2Vec, TfidfModel, LsiModel
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
def data_module(name,location,max_seq_length=100,max_rows=1000):
  """Load a ratings CSV and build the corpus for one embedding technique.

  Parameters
  ----------
  name : str
    Technique to prepare data for: 'word2vec', 'doc2vec', 'lsi' or 'tfidf'.
  location : str
    Path of the ratings CSV, read with ``pd.read_csv``.
  max_seq_length : int, default 100
    Sequence length forwarded to the technique-specific builder.
  max_rows : int or None, default 1000
    Number of leading CSV rows to load; ``None`` loads every row.

  Returns
  -------
  tuple
    ``(data_corpus, users_total)`` exactly as returned by the selected builder.

  Raises
  ------
  ValueError
    If ``name`` is not one of the supported techniques.
  """
  supported = ('word2vec', 'doc2vec', 'lsi', 'tfidf')
  # Fail fast: an unknown name used to fall through every branch and only
  # surface as an UnboundLocalError at the return statement.
  if name not in supported:
    raise ValueError("Unknown embedding name %r; expected one of %s" % (name, ', '.join(supported)))

  # nrows stops parsing after the rows we keep, and the result is a fresh
  # frame rather than a slice, so downstream code can modify it without
  # pandas raising SettingWithCopyWarning.
  dataset = pd.read_csv(location, nrows=max_rows)

  if name == 'word2vec':
    data_corpus, users_total = data_word2vec(dataset,max_seq_length)
  elif name == 'doc2vec':
    data_corpus, users_total = data_doc2vec(dataset,max_seq_length)
  elif name == 'lsi':
    data_corpus, users_total = data_lsi(dataset,max_seq_length)
  else:  # name == 'tfidf', guaranteed by the check above
    data_corpus, users_total = data_tfidf(dataset,max_seq_length)

  return data_corpus, users_total

def data_word2vec(dataset,max_seq_length):
  """Turn a ratings table into one fixed-length movie-id sequence per user.

  Rows are ordered by ``userId`` then ``timestamp``. For every user the
  ``max_seq_length`` most recent movie ids are kept; shorter histories are
  right-padded with 0. Each id is emitted as a string token.

  Parameters
  ----------
  dataset : pandas.DataFrame
    Must contain the columns 'userId', 'movieId' and 'timestamp'.
    The frame is not modified.
  max_seq_length : int
    Exact length of every returned sequence.

  Returns
  -------
  tuple
    ``(sent, user_total)`` where ``sent`` is a list (one entry per user, in
    ascending ``userId`` order) of lists of string tokens, and ``user_total``
    is the number of users.
  """
  # Sort into a new frame: sorting in place mutated the caller's data and
  # raised SettingWithCopyWarning when the caller passed a slice.
  ordered = dataset.sort_values(by=['userId','timestamp'])

  sent = []
  # Group on the user ids that are actually present. Looking users up as
  # 1..N silently produced all-padding rows whenever ids were not contiguous.
  for _, user_movies in ordered.groupby('userId', sort=True)['movieId']:
    history = user_movies.tolist()
    if len(history) > max_seq_length:
      # Keep only the most recent max_seq_length entries.
      history = history[len(history)-max_seq_length:]
    else:
      history = history + [0]*(max_seq_length-len(history))
    sent.append([str(movie) for movie in history])

  return sent, len(sent)

def data_doc2vec(dataset,max_seq_length):
  """Build the Doc2Vec training corpus: one TaggedDocument per user.

  The per-user token sequences come from ``data_word2vec``; the i-th
  sequence is tagged with the string ``str(i)`` (its zero-based position).

  Returns
  -------
  tuple
    ``(tagged_data, user_total)`` - the list of TaggedDocument objects and
    the number of users reported by ``data_word2vec``.
  """
  sequences, user_total = data_word2vec(dataset,max_seq_length)

  # Build the list in one pass; re-concatenating a growing list on every
  # iteration copied it each time (quadratic in the number of users).
  tagged_data = [
    TaggedDocument(words=sequences[i], tags=[str(i)])
    for i in range(user_total)
  ]

  return tagged_data, user_total

def data_lsi(dataset,max_seq_length):
  """Prepare the corpus consumed by the LSI model.

  The padded per-user sequences from ``data_word2vec`` are converted to a
  NumPy array and wrapped in ``gensim.matutils.Dense2Corpus`` with
  ``documents_columns=False``.

  Returns
  -------
  tuple
    ``(corpus, user_total)`` - the Dense2Corpus wrapper and the user count
    reported by ``data_word2vec``.
  """
  sequences, n_users = data_word2vec(dataset,max_seq_length)
  dense_rows = np.array(sequences)
  wrapped = gensim.matutils.Dense2Corpus(dense_rows,documents_columns=False)
  return wrapped, n_users

def data_tfidf(dataset,max_seq_length):
  """Prepare the corpus consumed by the TF-IDF model.

  Takes the padded per-user sequences produced by ``data_word2vec``, turns
  them into a NumPy array and hands that to
  ``gensim.matutils.Dense2Corpus`` with ``documents_columns=False``.

  Returns
  -------
  tuple
    ``(corpus, user_total)`` - the Dense2Corpus wrapper and the user count
    reported by ``data_word2vec``.
  """
  padded_sequences, total_users = data_word2vec(dataset,max_seq_length)
  as_matrix = np.array(padded_sequences)
  return (
    gensim.matutils.Dense2Corpus(as_matrix,documents_columns=False),
    total_users,
  )

In [3]:
def embedding_model(name,Data,model_save_location,vector_dims=50,Sg=1,size_window=3,topics=10,mini_count=1,num_workers=3,max_num_epochs = 100,alpha = 0.025,min_alpha=0.00025,dm=1):
  """Train the embedding model selected by ``name`` on ``Data``.

  Each technique only receives the hyper-parameters it uses:

  * 'word2vec' - ``vector_dims``, ``Sg``, ``size_window``, ``mini_count``,
    ``num_workers``
  * 'doc2vec'  - ``vector_dims``, ``alpha``, ``min_alpha``, ``size_window``,
    ``mini_count``, ``num_workers``, ``max_num_epochs``, ``dm``
  * 'lsi'      - ``topics``
  * 'tfidf'    - no hyper-parameters

  Parameters
  ----------
  name : str
    One of 'word2vec', 'doc2vec', 'lsi', 'tfidf'.
  Data
    Corpus in the format expected by the chosen trainer.
  model_save_location : str
    Save location forwarded to the chosen trainer.

  Raises
  ------
  ValueError
    If ``name`` is not a supported technique. Previously an unknown name
    returned silently without training anything.
  """
  if name == 'word2vec':
    word2vec(input_data=Data,save_loc=model_save_location,vec_dims=vector_dims,SG=Sg,size_of_window=size_window,minimum_count=mini_count,no_workers=num_workers)
  elif name == 'doc2vec':
    doc2vec(input_data=Data,save_loc=model_save_location,vec_dims=vector_dims,alpha_=alpha,size_of_window=size_window,no_workers=num_workers,max_epochs=max_num_epochs,min_alpha_=min_alpha,minimum_count=mini_count,dms=dm)
  elif name == 'lsi':
    lsi(input_data=Data,save_loc=model_save_location,total_topics=topics)
  elif name == 'tfidf':
    tfidf(input_data=Data,save_loc=model_save_location)
  else:
    raise ValueError("Unknown embedding name %r; expected 'word2vec', 'doc2vec', 'lsi' or 'tfidf'" % (name,))

def word2vec(input_data,save_loc,vec_dims,SG,size_of_window,minimum_count,no_workers):
  """Fit a gensim Word2Vec model on ``input_data`` and save it to ``save_loc``.

  Parameters
  ----------
  input_data
    Training sentences, passed as the first positional argument of Word2Vec.
  save_loc : str
    Destination handed to ``model.save``.
  vec_dims, SG, size_of_window, minimum_count, no_workers
    Forwarded as ``size``, ``sg``, ``window``, ``min_count`` and ``workers``.
  """
  hyperparams = dict(
    size=vec_dims,
    window=size_of_window,
    min_count=minimum_count,
    sg=SG,
    workers=no_workers,
  )
  trained = Word2Vec(input_data, **hyperparams)
  trained.save(save_loc)

def doc2vec(input_data,save_loc,vec_dims,alpha_,size_of_window,min_alpha_,minimum_count,dms,no_workers,max_epochs):
  """Fit a gensim Doc2Vec model on ``input_data`` and save it to ``save_loc``.

  Parameters
  ----------
  input_data
    Tagged documents used both to build the vocabulary and to train.
  save_loc : str
    Destination handed to ``model.save``.
  vec_dims : int
    Vector dimensionality (passed as ``size``).
  alpha_, min_alpha_ : float
    Initial and final learning rate.
  size_of_window, minimum_count, dms
    Forwarded as ``window``, ``min_count`` and ``dm``.
  no_workers : int
    Number of worker threads (passed as ``workers``).
  max_epochs : int
    Number of passes over the corpus. With 0 (or less) the model is saved
    with its vocabulary built but untrained.
  """
  # NOTE(review): `size` is the gensim 3.x keyword this notebook was run
  # with; gensim 4 renamed it to `vector_size` - confirm before upgrading.
  model = Doc2Vec(size=vec_dims,
                alpha=alpha_,
                min_alpha=min_alpha_,
                window = size_of_window,
                min_count=minimum_count,
                workers=no_workers,  # was accepted by this function but never forwarded
                dm =dms)
  model.build_vocab(input_data)

  # Train with a single call. The earlier hand-written loop called train()
  # once per epoch, subtracted 0.0002 from alpha each time (negative once
  # max_epochs exceeded ~125) and overwrote min_alpha, so the requested
  # min_alpha_ was only honoured on the first pass.
  if max_epochs > 0:
    model.train(input_data, total_examples=model.corpus_count, epochs=max_epochs)
  model.save(save_loc)

def lsi(input_data,save_loc,total_topics):
  """Fit a gensim LsiModel on ``input_data`` and save it to ``save_loc``.

  Parameters
  ----------
  input_data
    Corpus passed as ``corpus`` to ``models.LsiModel``.
  save_loc : str
    Destination handed to ``model.save``.
  total_topics : int
    Number of latent topics (passed as ``num_topics``).
  """
  model = models.LsiModel(corpus=input_data, num_topics=total_topics)
  # The similarity index, transformed corpus and topic listing that used to
  # be built here were never used or returned, so they are no longer computed.
  model.save(save_loc)

def tfidf(input_data,save_loc):
  """Fit a gensim TfidfModel on ``input_data`` and save it to ``save_loc``.

  Parameters
  ----------
  input_data
    Corpus passed as ``corpus`` to ``models.TfidfModel``.
  save_loc : str
    Destination handed to ``model.save``.
  """
  model = models.TfidfModel(corpus=input_data)
  # Save where the caller asked; the path used to be hard-coded to
  # 'tfidf_model', ignoring save_loc. The fixed 350-column token/value
  # arrays that were filled here were never used or returned (and would
  # overflow on documents with more than 350 entries), so they are gone.
  model.save(save_loc)

In [4]:
# Build the word2vec corpus (token sequences plus user count) from the ratings CSV.
corpus_data, num_users = data_module(
  name='word2vec',
  location='/content/drive/My Drive/Movielensdata/ml25m/ratings.csv',
  max_seq_length=100,
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [5]:
# Train the word2vec model on the prepared corpus and save it to Drive.
embedding_model(
  name='word2vec',
  Data=corpus_data,
  model_save_location='/content/drive/My Drive/Movielensdata/ml25m/word2vec/w2v',
  vector_dims=50,
  Sg=1,
  size_window=3,
  topics=10,
  mini_count=1,
  num_workers=3,
  max_num_epochs = 100,
  alpha = 0.025,
  min_alpha=0.00025,
  dm=1,
)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [6]:
# Inspect the prepared corpus: one list of movie-id string tokens per user
# (this prints the entire corpus, not a preview).
print(corpus_data)

[['5952', '2012', '2011', '1653', '1250', '6539', '6377', '3448', '1088', '899', '4308', '2161', '6711', '3949', '8360', '5878', '306', '1175', '307', '1237', '7327', '8154', '7234', '2843', '4144', '7365', '2068', '4422', '4973', '6016', '8873', '2692', '27721', '7323', '6954', '8014', '7939', '6370', '8973', '4703', '31956', '5147', '8786', '1260', '2351', '7940', '7209', '8685', '7820', '7937', '7938', '8405', '4325', '2632', '1217', '8729', '5912', '5767', '665', '2573', '27266', '8327', '32591', '5269', '3569', '27193', '5684', '7318', '296', '7361', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'], ['4306', '8368', '8360', '3793', '4995', '4963', '8636', '2355', '8665', '2571', '589', '33493', '7153', '1873', '1246', '1584', '3994', '2139', '31923', '4720', '2294', '2745', '2138', '858', '33660', '2268', '2501', '6539', '1270', '6565', '4535', '1198', '1302', '349', '3098', '1907'

In [7]:
# Number of users returned by data_module alongside the corpus.
print(num_users)

4
