In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import torch
import numpy as np
from collections import Counter
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import math

# Concatenation of embeddings

### Desc_Bert_768 || AuthorContext_FastText_60

##### Data merge




In [None]:
#previous embedding performed through "SingleEmbed_OrdinalReg.ipynb"
#embeddings with BERT of books description
df_bert_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/Books_description_bert.csv", sep = "|")
#embeddings with FASTTEXT of ["book_title", "book_author", "Category", "Category_other", "author_genres", "author_genres_other"]
df_fastText_auth_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/authorsContext_fastText.csv", sep = "|")

In [None]:
df_bert = df_bert_temp.copy()
df_fastText_auth = df_fastText_auth_temp.copy()

#All lines of "bothWebSites_InternetSearch_AllBooks_cleaned_w2v.csv" have been kept (105 508)
#but when possible, an embedding column has been added
print(len(df_bert))
print(len(df_fastText_auth))


105508
105508


In [None]:
#I only keep the embeddings (saved as a string)
X1 = df_bert[["isbn", "book_language", "description_bert"]]
s1 = 768

X2 = df_fastText_auth["authorsContext_fastText"]
s2 = 60

#Concatenation (instead of merge) because those embeddings are saved 
#on an additional column of full df_books
df_embed = pd.concat([X1, X2], axis = 1)
print(len(df_embed))
df_embed.head()

105508


Unnamed: 0,isbn,book_language,description_bert,authorsContext_fastText
0,439023483,en,0,"['-0.10757246970929005', '-0.09679322073085082..."
1,316015849,en,"['-0.098250076', '-0.38007897', '0.8143984', '...","['-0.11777790052996527', '-0.08616064630088306..."
2,525478817,en,"['-0.099239185', '-0.12767254', '0.6324886', '...","['-0.11273428555127353', '-0.08976304471012085..."
3,1416524797,en,"['-0.13360432', '-0.23354772', '0.47973692', '...","['-0.11270047592309614', '-0.08497616087940212..."
4,62024035,en,"['-0.26884735', '-0.1411476', '0.30364788', '-...","['-0.13137996882200242', '-0.07718185376375913..."


In [None]:
#ratings of Book Crossing
#df_ratings_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings.csv", sep = ";", on_bad_lines="warn", encoding="latin-1")

#Created ratings (from the BookCrossing ones), in a trial to balance classes (see "SingleEmbed_OrdinalReg.ipynb")
#on_bad_lines="warn" replaces error_bad_lines=False (removed in pandas 2.0) :
#malformed lines are still dropped, but each of them is reported
df_ratings_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings-Binary.csv", sep = "|", on_bad_lines="warn", encoding="latin-1")

#Working copy, so that the file does not have to be read again
df_ratings = df_ratings_temp.copy()

In [None]:
#Merge between embeddings and ratings dataframe
df_embed = pd.merge(df_ratings, df_embed, left_on = "ISBN", right_on = "isbn")  
print(len(df_embed))
df_embed.head(1)

64148


Unnamed: 0,User-ID,ISBN,Book-Rating,isbn,book_language,description_bert,authorsContext_fastText
0,190708,044015393X,0.0,044015393X,en,0,"['-0.11191713949665427', '-0.08290330194202918..."


In [None]:
#Verification of not empty lines
lineOK = df_embed[ (df_embed["book_language"] == "en") & \
                   (df_embed["description_bert"] != '0') & \
                   (df_embed["authorsContext_fastText"] != '0')]
print(len(lineOK))

36013


##### Strings of embeddings transformed into float
    *Some embeddings have been saved as strings, so a specific pre-treatment is needed before using them.*



In [None]:
#Transformation of an embedding line into a vector 
#because embeddings have been saved as a list of strings

MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])

def vectEmbed(l, s1, s2):
  """
    Build the features vector of one merged line (User-ID + 2 embeddings).

    Inputs :
      l is the line with embeddings, read through the keys 'User-ID',
        'description_bert' and 'authorsContext_fastText'
      s1 is the size of the 1st embedding (description_bert)
      s2 is the size of the 2nd embedding (authorsContext_fastText)
    Output : a np.ndarray of size (s1 + s2 + 1) with :
      [0]                  User-ID min-max normalized with the globals MIN / MAX
      [1 : s1+1]           1st embedding as float
      [s1+1 : s1+s2+1]     2nd embedding as float
    Raises :
      ValueError (from float / numpy) when an item is not a number or when
      the parsed values cannot fill the slot of the embedding
  """

  #size of the vector returned
  #for example bert embedding of s1 = 768
  #          + fastText embedding of s2 = 60
  #          + User-ID
  end1 = s1 + 1
  end2 = s1 + s2 + 1
  b = np.zeros(end2)

  #1st copy of User-ID & normalisation
  b[0] = (l['User-ID'] - MIN) / (MAX - MIN)

  #Embeddings are saved as a string "['v0', 'v1', ...]" : after a split on
  #the quote, the values are the items found at the odd positions.
  #The values are kept as they are; a min-max normalisation
  #(v - min) / (max - min) would use min = -24.282314 and max = 2.1074116
  b[1:end1] = [float(v) for v in l['description_bert'].split("'")[1::2]]

  #Same handling for the 2nd embedding
  #(min-max normalisation would use min = -0.6839812078202764 and max = 0.6203103736042976).
  #The slice stops at end2, the real end of b, and no longer one item after it
  b[end1:end2] = [float(v) for v in l['authorsContext_fastText'].split("'")[1::2]]

  return b

In [None]:
#treatment of all lines of previously merged in df_embed 

def matEmbed(l, s1, s2, df):
  """
  Build the features matrix of the selected lines.

  Inputs :
    l are the df lines to normalize and transform into vector
      (only l.index and len(l) are used)
    s1 is the size of the 1st embedding
    s2 is the size of the 2nd embedding
    df is the merged dataframe (User-ID & embeddings) the lines are read from
  Output : a np.ndarray of shape (len(l), s1 + s2 + 1),
    one vectEmbed() vector per line of l, in the order of l.index
  """

  #number of columns of the matrix returned
  #for example bert embedding of s1 = 768
  #          + fastText embedding of s2 = 60
  #          + User-ID
  #len(l) is the number of lines to handle
  m = np.zeros((len(l), s1 + s2 + 1))

  #l.index holds labels, not positions : df.loc is used (instead of df.iloc)
  #so that the right lines are read even when df has no default 0..n-1 index
  for ind, label in enumerate(l.index):
    m[ind] = vectEmbed(df.loc[label], s1, s2)

  return m

In [None]:
#It takes about 1 min for 63 264 lines
mat_X_data = matEmbed(lineOK, s1, s2, df_embed)
mat_X_data.shape

(36013, 829)

##### Min / Max of embedding values

In [None]:
#Bert embedding (1st columns of matrix)

#First I performed the min on each lines corresponding to the book's description embedding
print(mat_X_data[:, 1:(s1+1)].min(axis = 0).shape)

#Then i take the min of this values vector
min(mat_X_data[:, 1:(s1+1)].min(axis = 0))

(768,)


-24.282314

In [None]:
#Same for the max
print(mat_X_data[:, 1:(s1+1)].max(axis = 0).shape)
max(mat_X_data[:, 1:(s1+1)].max(axis = 0))

(768,)


2.1074116

In [None]:
#Same for the median
print(np.median(mat_X_data[:, 1:(s1+1)], axis = 0).shape)
np.median(np.median(mat_X_data[:, 1:(s1+1)], axis = 0))

(768,)


-0.0259018065

In [None]:
#Same for the mean
print(np.mean(mat_X_data[:, 1:(s1+1)], axis = 0).shape)
np.mean(np.mean(mat_X_data[:, 1:(s1+1)], axis = 0))

(768,)


-0.039433989750688965

In [None]:
#Embeddings values analysis :
#share (in %) of the values of the 1st embedding falling inside each range
bert_values = mat_X_data[:, 1:(s1+1)]   #columns of the 1st embedding, sliced only once
nb_nb = bert_values.shape[0] * bert_values.shape[1]
r_m10 = round(100 * np.sum(bert_values <= -10) / nb_nb, 2)
r_m10_m5 = round(100 * np.sum((bert_values > -10) & (bert_values <= -5)) / nb_nb, 2)
r_m5_m1 = round(100 * np.sum((bert_values > -5) & (bert_values <= -1)) / nb_nb, 2)
r_m1_0 = round(100 * np.sum((bert_values > -1) & (bert_values <= 0)) / nb_nb, 2)
r_0 = round(100 * np.sum(bert_values > 0) / nb_nb, 2)

#The labels of the bounded ranges say "between" : each of them has 2 bounds
print("Percentage of embeddings value under -10: \t\t", r_m10)
print("Percentage of embeddings value between -10 and -5: \t", r_m10_m5)
print("Percentage of embeddings value between -5 and -1: \t", r_m5_m1)
print("Percentage of embeddings value between -1 and 0: \t", r_m1_0)
print("Percentage of embeddings value above 0: \t\t", r_0)

Percentage of embeddings value under -10: 		 0.13
Percentage of embeddings value between -10 and -5: 	 0.0
Percentage of embeddings value under -5 and -1: 	 0.28
Percentage of embeddings value under -1 and 0: 		 52.1
Percentage of embeddings value above 0: 		 47.49


In [None]:
#FastText embedding (last columns of matrix)

#First i performed the min on each lines corresponding to the author context embedding
print(mat_X_data[:, (s1+1):].min(axis = 0).shape)

#Then i take the min of this values vector
min(mat_X_data[:, (s1+1):].min(axis = 0))

(60,)


-0.6839812078202764

In [None]:
#Same for the max
print(mat_X_data[:, (s1+1):].max(axis = 0).shape)
max(mat_X_data[:, (s1+1):].max(axis = 0))

(60,)


0.6203103736042976

### Desc_FastText_60 || AuthorContext_FastText_60

##### Data merge

In [None]:
#previous embedding performed through "SingleEmbed_OrdinalReg.ipynb"
#embeddings with FASTTEXT of books description
df_fastText_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/Books_description_fastText.csv", sep = "|")
#embeddings with FASTTEXT of ["book_title", "book_author", "Category", "Category_other", "author_genres", "author_genres_other"]
df_fastText_auth_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/authorsContext_fastText.csv", sep = "|")

In [None]:
df_fastText = df_fastText_temp.copy()
df_fastText_auth = df_fastText_auth_temp.copy()

#All lines of "bothWebSites_InternetSearch_AllBooks_cleaned_w2v.csv" have been kept (105 508)
#but when possible, an embedding column has been added
print(len(df_fastText))
print(len(df_fastText_auth))

105508
105508


In [None]:
df_fastText.head(1)

Unnamed: 0,isbn,isbn_13,OtherID,book_title,book_author,year_of_publication,Publisher,Category,Category_other,book_description,book_language,Image,number_of_pages,author_genres,author_genres_other,books_in_series,average_rating,awards,description_fastFM
0,439023483,9780439000000.0,,The Hunger Games : The First Book of the Hunge...,Suzanne Collins,2008-01-01,,juvenile fict,,,en,http://books.google.com/books/content?id=sJdUA...,374.0,,,,,,0


In [None]:
#I only keep the embeddings  (saved as a string)
X1 = df_fastText[["isbn", "book_language", "description_fastFM"]]
s1 = 60

X2 = df_fastText_auth["authorsContext_fastText"]
s2 = 60

#Concatenation (instead of merge) because those embeddings are saved 
#on an additional column of full df_books
df_embed = pd.concat([X1, X2], axis = 1)
print(len(df_embed))
df_embed.head()

105508


Unnamed: 0,isbn,book_language,description_fastFM,authorsContext_fastText
0,439023483,en,0,"['-0.10757246970929005', '-0.09679322073085082..."
1,316015849,en,"['0.046032028402859235', '0.15718470524777384'...","['-0.11777790052996527', '-0.08616064630088306..."
2,525478817,en,"['-0.015599380892056685', '0.34733651177241254...","['-0.11273428555127353', '-0.08976304471012085..."
3,1416524797,en,"['0.09067573049370284', '0.2065578283826736', ...","['-0.11270047592309614', '-0.08497616087940212..."
4,62024035,en,"['-0.006060493551194668', '0.3125178421381861'...","['-0.13137996882200242', '-0.07718185376375913..."


In [None]:
#ratings of Book Crossing
#df_ratings_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings.csv", sep = ";", on_bad_lines="warn", encoding="latin-1")

#Created ratings (from the BookCrossing ones), in a trial to balance classes (see "SingleEmbed_OrdinalReg.ipynb")
#on_bad_lines="warn" replaces error_bad_lines=False (removed in pandas 2.0) :
#malformed lines are still dropped, but each of them is reported
df_ratings_temp = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings-Binary.csv", sep = "|", on_bad_lines="warn", encoding="latin-1")

#Working copy, so that the file does not have to be read again
df_ratings = df_ratings_temp.copy()

In [None]:
#Merge between embeddings and ratings dataframe
df_embed = pd.merge(df_ratings, df_embed, left_on = "ISBN", right_on = "isbn")  
print(len(df_embed))
df_embed.head(1)

64148


Unnamed: 0,User-ID,ISBN,Book-Rating,isbn,book_language,description_fastFM,authorsContext_fastText
0,190708,044015393X,0.0,044015393X,en,0,"['-0.11191713949665427', '-0.08290330194202918..."


In [None]:
#Verification of not empty lines
lineOK = df_embed[ (df_embed["book_language"] == "en") & \
                   (df_embed["description_fastFM"] != '0') & \
                   (df_embed["authorsContext_fastText"] != '0')]
print(len(lineOK))

36009


##### Strings of embeddings concatenated into float
    *Some embeddings have been saved as strings, so a specific pre-treatment is needed before using them.*

In [None]:
#Transformation of an embedding line into a vector 
#because embeddings have been saved as a list of strings

MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])

def vectEmbed2(l, s1, s2):
  """
    Build the features vector of one merged line (User-ID + 2 embeddings).

    Inputs :
      l is the line with embeddings, read through the keys 'User-ID',
        'description_fastFM' and 'authorsContext_fastText'
      s1 is the size of the 1st embedding (description_fastFM)
      s2 is the size of the 2nd embedding (authorsContext_fastText)
    Output : a np.ndarray of size (s1 + s2 + 1) with :
      [0]                  User-ID min-max normalized with the globals MIN / MAX
      [1 : s1+1]           1st embedding as float
      [s1+1 : s1+s2+1]     2nd embedding as float
    Raises :
      ValueError (from float / numpy) when an item is not a number or when
      the parsed values cannot fill the slot of the embedding
  """

  #size of the vector returned
  #for example 1st fastText embedding of s1 = 60
  #          + 2nd fastText embedding of s2 = 60
  #          + User-ID
  end1 = s1 + 1
  end2 = s1 + s2 + 1
  b = np.zeros(end2)

  #1st copy of User-ID & normalisation
  b[0] = (l['User-ID'] - MIN) / (MAX - MIN)

  #Embeddings are saved as a string "['v0', 'v1', ...]" : after a split on
  #the quote, the values are the items found at the odd positions.
  #The values are kept as they are; a min-max normalisation
  #(v - min) / (max - min) would use min = -1.8926484078168868 and max = 2.0181634426116943
  b[1:end1] = [float(v) for v in l['description_fastFM'].split("'")[1::2]]

  #Same handling for the 2nd embedding
  #(min-max normalisation would use min = -0.6839812078202764 and max = 0.6203103736042976).
  #The slice stops at end2, the real end of b, and no longer one item after it
  b[end1:end2] = [float(v) for v in l['authorsContext_fastText'].split("'")[1::2]]

  return b

In [None]:
#treatment of all lines of previously merged in df_embed 

def matEmbed2(l, s1, s2, df):
  """
  Build the features matrix of the selected lines.

  Inputs :
    l are the df lines to normalize and transform into vector
      (only l.index and len(l) are used)
    s1 is the size of the 1st embedding
    s2 is the size of the 2nd embedding
    df is the merged dataframe (User-ID & embeddings) the lines are read from
  Output : a np.ndarray of shape (len(l), s1 + s2 + 1),
    one vectEmbed2() vector per line of l, in the order of l.index
  """

  #number of columns of the matrix returned
  #for example fastText embedding of s1 = 60
  #          + fastText embedding of s2 = 60
  #          + User-ID
  #len(l) is the number of lines to handle
  m = np.zeros((len(l), s1 + s2 + 1))

  #l.index holds labels, not positions : df.loc is used (instead of df.iloc)
  #so that the right lines are read even when df has no default 0..n-1 index
  for ind, label in enumerate(l.index):
    m[ind] = vectEmbed2(df.loc[label], s1, s2)

  return m

In [None]:
#It takes about 1 min for 63 264 lines
mat_X_data = matEmbed2(lineOK, s1, s2, df_embed)
print(s1, s2, mat_X_data.shape)

60 60 (36009, 121)


##### Min / Max of embedding values

In [None]:
#FastText on description embedding (1st columns of matrix)

#First I performed the min on each lines corresponding to the book's description embedding
print(mat_X_data[:, 1:(s1+1)].min(axis = 0).shape)

#Then I take the min of this values vector
min(mat_X_data[:, 1:(s1+1)].min(axis = 0))

(60,)


0.0

In [None]:
#Same for the max
print(mat_X_data[:, 1:(s1+1)].max(axis = 0).shape)
max(mat_X_data[:, 1:(s1+1)].max(axis = 0))

(60,)


1.0

In [None]:
#FastText on author context embedding (last columns of matrix)

#First I performed the min on each lines corresponding to the author context embedding
print(mat_X_data[:, (s1+1):].min(axis = 0).shape)

#Then I take the min of this values vector
min(mat_X_data[:, (s1+1):].min(axis = 0))

(60,)


0.0

In [None]:
#Same for the max
print(mat_X_data[:, (s1+1):].max(axis = 0).shape)
max(mat_X_data[:, (s1+1):].max(axis = 0))

(60,)


1.0

##### Strings of embeddings added as float

In [None]:
#Transformation of an embedding line into a vector 
#because embeddings have been saved as a list of strings

MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])

def vectEmbed3(l, s):
  """
    Build the features vector of one merged line, both embeddings being
    min-max normalized and then added item by item (no concatenation).

    Inputs :
      l is the line with embeddings, read through the keys 'User-ID',
        'description_fastFM' and 'authorsContext_fastText'
      s is the size of the both embeddings
    Output : a np.ndarray of size (s + 1) with :
      [0]     User-ID min-max normalized with the globals MIN / MAX
      [1:]    normalized 1st embedding + normalized 2nd embedding
    Raises :
      ValueError (from float / numpy) when an item is not a number or when
      the parsed values cannot fill the s items of an embedding
  """

  #size of the vector returned
  #for example fastText embedding of s = 60 added with fastText embedding of s = 60
  #          + User-ID
  b = np.zeros(s + 1)

  #1st copy of User-ID & normalisation.
  #It has to be written inside b, the vector returned : written inside a
  #temporary vector, it was lost and the 1st column was always 0
  b[0] = (l['User-ID'] - MIN) / (MAX - MIN)

  #Embeddings are saved as a string "['v0', 'v1', ...]" : after a split on
  #the quote, the values are the items found at the odd positions

  #1st embedding, min-max normalized with its observed min / max
  mi = -1.8926484078168868
  ma = 2.0181634426116943
  first = np.zeros(s)
  first[:] = [((float(v) - mi) / (ma - mi)) for v in l['description_fastFM'].split("'")[1::2]]

  #2nd embedding, min-max normalized with its observed min / max
  mii = -0.6839812078202764
  maa = 0.6203103736042976
  second = np.zeros(s)
  second[:] = [((float(v) - mii) / (maa - mii)) for v in l['authorsContext_fastText'].split("'")[1::2]]

  #Real addition (and no more concatenation) of both embeddings
  b[1:] = second + first

  return b

In [None]:
#treatment of all lines of previous df_embed 

def matEmbed3(l, s, df):
  """
  Build the features matrix of the selected lines (added embeddings).

  Inputs :
    l are the df lines to normalize and transform into vector
      (only l.index and len(l) are used)
    s is the size of the both embeddings
    df is the merged dataframe (User-ID & embeddings) the lines are read from
  Output : a np.ndarray of shape (len(l), s + 1),
    one vectEmbed3() vector per line of l, in the order of l.index
  """

  #number of columns of the matrix returned
  #for example fastText embedding of s = 60 added with fastText embedding of s = 60
  #          + User-ID
  #len(l) is the number of lines to handle
  m = np.zeros((len(l), s + 1))

  #l.index holds labels, not positions : df.loc is used (instead of df.iloc)
  #so that the right lines are read even when df has no default 0..n-1 index
  for ind, label in enumerate(l.index):
    m[ind] = vectEmbed3(df.loc[label], s)

  return m

In [None]:
#It takes about 1 min for 63 264 lines
mat_X_data = matEmbed3(lineOK, 60, df_embed)
print(mat_X_data.shape)

(63257, 61)


##### Min / Max of embedding values

In [None]:
#First I performed the min on each lines corresponding to the book's description embedding
print(mat_X_data[:, 1:].min(axis = 0).shape)

#Then I take the min of this values vector
min(mat_X_data[:, 1:].min(axis = 0))

(60,)


0.5091274849896965

In [None]:
#Same for the max
print(mat_X_data[:, 1:].max(axis = 0).shape)
max(mat_X_data[:, 1:].max(axis = 0))

(60,)


1.5535841137550046

### Incremental concatenation inside author context

##### Embeddings dataframe

In [None]:
#previous embedding performed through "SingleEmbed_OrdinalReg.ipynb"
#embeddings with FASTTEXT of books title
df_fastText_title = pd.read_csv("/content/drive/MyDrive/ProjetML/title_fastText.csv", sep = "|")
#embeddings with FASTTEXT of books author name
df_fastText_authOnly = pd.read_csv("/content/drive/MyDrive/ProjetML/authOnly_fastText.csv", sep = "|")
#embeddings with FASTTEXT of ["Category", "Category_other", "author_genres", "author_genres_other"]
#df_fastText_4cat = pd.read_csv("/content/drive/MyDrive/ProjetML/4cat_fastText.csv", sep = "|")

#embeddings with FASTTEXT of all the 3 previous one 
#df_fastText_auth = pd.read_csv("/content/drive/MyDrive/ProjetML/authorsContext_bis_fastText.csv", sep = "|")

##### Concatenation of 2 embeddings

In [None]:
#Join of 2 embeddings, according to ISBN which is a common column
df_FT_title_authOnly = pd.merge(df_fastText_title, df_fastText_authOnly, on = "isbn") 
df_FT_title_authOnly.head()
len(df_FT_title_authOnly)

180738

In [None]:
#BookCrossing provided ratings
#on_bad_lines="warn" replaces error_bad_lines=False (removed in pandas 2.0) :
#malformed lines are still dropped, but each of them is reported
df_ratings = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings.csv", sep = ";", on_bad_lines="warn", encoding="latin-1")

In [None]:
#Merge between embeddings and ratings dataframe
df_embed = pd.merge(df_ratings, df_FT_title_authOnly, left_on = "ISBN", right_on = "isbn")  
print(len(df_embed))
df_embed.head()

82548


Unnamed: 0,User-ID,ISBN,Book-Rating,isbn,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,...,c_20_y,c_21_y,c_22_y,c_23_y,c_24_y,c_25_y,c_26_y,c_27_y,c_28_y,c_29_y,c_30_y,c_31_y,c_32_y,c_33_y,c_34_y,c_35_y,c_36_y,c_37_y,c_38_y,c_39_y,c_40_y,c_41_y,c_42_y,c_43_y,c_44_y,c_45_y,c_46_y,c_47_y,c_48_y,c_49_y,c_50_y,c_51_y,c_52_y,c_53_y,c_54_y,c_55_y,c_56_y,c_57_y,c_58_y,c_59_y
0,276725,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
1,2313,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
2,6543,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
3,8680,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
4,10314,034545104X,9,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125


##### Columns selection

In [None]:
col = ["User-ID"] + ["c_" + str(i) + "_x" for i in range(60)] + ["c_" + str(i) + "_y" for i in range(60)]
col

['User-ID',
 'c_0_x',
 'c_1_x',
 'c_2_x',
 'c_3_x',
 'c_4_x',
 'c_5_x',
 'c_6_x',
 'c_7_x',
 'c_8_x',
 'c_9_x',
 'c_10_x',
 'c_11_x',
 'c_12_x',
 'c_13_x',
 'c_14_x',
 'c_15_x',
 'c_16_x',
 'c_17_x',
 'c_18_x',
 'c_19_x',
 'c_20_x',
 'c_21_x',
 'c_22_x',
 'c_23_x',
 'c_24_x',
 'c_25_x',
 'c_26_x',
 'c_27_x',
 'c_28_x',
 'c_29_x',
 'c_30_x',
 'c_31_x',
 'c_32_x',
 'c_33_x',
 'c_34_x',
 'c_35_x',
 'c_36_x',
 'c_37_x',
 'c_38_x',
 'c_39_x',
 'c_40_x',
 'c_41_x',
 'c_42_x',
 'c_43_x',
 'c_44_x',
 'c_45_x',
 'c_46_x',
 'c_47_x',
 'c_48_x',
 'c_49_x',
 'c_50_x',
 'c_51_x',
 'c_52_x',
 'c_53_x',
 'c_54_x',
 'c_55_x',
 'c_56_x',
 'c_57_x',
 'c_58_x',
 'c_59_x',
 'c_0_y',
 'c_1_y',
 'c_2_y',
 'c_3_y',
 'c_4_y',
 'c_5_y',
 'c_6_y',
 'c_7_y',
 'c_8_y',
 'c_9_y',
 'c_10_y',
 'c_11_y',
 'c_12_y',
 'c_13_y',
 'c_14_y',
 'c_15_y',
 'c_16_y',
 'c_17_y',
 'c_18_y',
 'c_19_y',
 'c_20_y',
 'c_21_y',
 'c_22_y',
 'c_23_y',
 'c_24_y',
 'c_25_y',
 'c_26_y',
 'c_27_y',
 'c_28_y',
 'c_29_y',
 'c_30_y',
 'c_31_

In [None]:
  #I just remove the redundant and useless columns after the merge
  X_data = df_embed[col]
  print(len(X_data))
  print(X_data.iloc[0:1, 60])
  print(X_data.iloc[0:1, 61])
  X_data.head(1)

82548
0   -0.487806
Name: c_59_x, dtype: float64
0   -0.415204
Name: c_0_y, dtype: float64


Unnamed: 0,User-ID,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,c_36_x,c_37_x,c_38_x,...,c_20_y,c_21_y,c_22_y,c_23_y,c_24_y,c_25_y,c_26_y,c_27_y,c_28_y,c_29_y,c_30_y,c_31_y,c_32_y,c_33_y,c_34_y,c_35_y,c_36_y,c_37_y,c_38_y,c_39_y,c_40_y,c_41_y,c_42_y,c_43_y,c_44_y,c_45_y,c_46_y,c_47_y,c_48_y,c_49_y,c_50_y,c_51_y,c_52_y,c_53_y,c_54_y,c_55_y,c_56_y,c_57_y,c_58_y,c_59_y
0,276725,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,0.841204,0.376636,0.940479,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125


##### Creation of mat_X_data by concatenation 

In [None]:
#In order to normalize those User-ID...even if bad idea to keep them
MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])
print(MIN, MAX)

8 278851


In [None]:
#embeddings are already saved as float inside dataframe columns
#here a columns concatenation is performed

def vectfastText(l, s1, s2):
  """
    Build the features vector of one line by concatenation of 2 embeddings.

    NOTE : a later cell of this notebook defines vectfastText again, with the
    (l, s) signature of the addition variant; the last cell run wins.

    Inputs :
      l is the line with embeddings, read by position :
        item 0 is the User-ID, then come the s1 values of the 1st embedding
        and the s2 values of the 2nd embedding
      s1 is the size of the 1st embedding
      s2 is the size of the 2nd embedding
    Output : a np.ndarray of size (s1 + s2 + 1) with :
      [0]     User-ID min-max normalized with the globals MIN / MAX
      [1:]    both embeddings as float, in the order of the line
    Raises :
      IndexError when the line holds less than (s1 + s2 + 1) values
  """

  #size of the vector returned
  #for example fastText embedding of s1 = 60
  #          + fastText embedding of s2 = 60
  #          + User-ID
  n = s1 + s2 + 1

  #The line is read by position through a numpy array : l[0] on a Series
  #whose labels are strings relies on a deprecated pandas fallback
  values = np.asarray(l)[:n].astype(float)
  if values.shape[0] < n:
    raise IndexError("line holds %d values but %d are needed" % (values.shape[0], n))

  b = np.zeros(n)

  #1st copy of User-ID & normalisation
  b[0] = (values[0] - MIN) / (MAX - MIN)

  #Then copy of the values of the 1st and of the 2nd embedding
  b[1:] = values[1:]

  return b

In [None]:
#treatment of all lines of previously merged in X_data 

def matfastText(l, s1, s2, df):
  """
  Build the features matrix of the l first lines of df (concatenation).

  Inputs :
    l is the number of df lines to normalize and transform into vector
    s1 is the size of the 1st embedding
    s2 is the size of the 2nd embedding
    df is the merged dataframe (User-ID & embeddings), read by position
  Output : a np.ndarray of shape (l, s1 + s2 + 1),
    one vectfastText() vector per line
  """

  #one column for the User-ID, then s1 + s2 columns for the embeddings
  #for example 1st fastText embedding of s1 = 60
  #          + 2nd fastText embedding of s2 = 60
  nb_col = s1 + s2 + 1
  m = np.zeros((l, nb_col))

  #line number pos of df fills line number pos of the matrix
  for pos in range(l):
    m[pos] = vectfastText(df.iloc[pos], s1, s2)

  return m

In [None]:
mat_X_data = matfastText(len(X_data), 60, 60, X_data)
mat_X_data.shape

(82548, 121)

##### Creation of mat_X_data by addition 

In [None]:
#In order to normalize those User-ID...even if bad idea to keep them
MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])
print(MIN, MAX)

8 278851


In [None]:
#embeddings are already saved as float inside dataframe columns
#here an item by item addition of both embeddings is performed

def vectfastText(l, s):
  """
    Build the features vector of one line by addition of 2 embeddings.

    Inputs :
      l is the line with embeddings, read by position :
        item 0 is the User-ID, then come the s values of the 1st embedding
        and the s values of the 2nd embedding
      s is the size of the both embeddings
    Output : a np.ndarray of size (s + 1) with :
      [0]     User-ID min-max normalized with the globals MIN / MAX
      [1:]    1st embedding + 2nd embedding, item by item
    Raises :
      IndexError when the line holds less than (2*s + 1) values
  """

  #number of values read on the line : User-ID + both embeddings
  n = 2*s + 1

  #The line is read by position through a numpy array : l[0] on a Series
  #whose labels are strings relies on a deprecated pandas fallback
  values = np.asarray(l)[:n].astype(float)
  if values.shape[0] < n:
    raise IndexError("line holds %d values but %d are needed" % (values.shape[0], n))

  #vector returned : User-ID, then the s items of the added embeddings
  b = np.zeros(s + 1)

  #1st copy of User-ID & normalisation
  b[0] = (values[0] - MIN) / (MAX - MIN)

  #Real addition (and no more concatenation) of both embeddings
  b[1:] = values[(s+1):n] + values[1:(s+1)]

  return b

In [None]:
#treatment of all lines of previously merged in X_data 

def matfastText(l, s, df):
  """
  Build the features matrix of the l first lines of df (addition).

  Inputs :
    l is the number of df lines to normalize and transform into vector
    s is the size of the both embeddings
    df is the merged dataframe (User-ID & embeddings), read by position
  Output : a np.ndarray of shape (l, s + 1),
    one vectfastText() vector per line
  """

  #one column for the User-ID, then s columns for the added embeddings
  #for example fastText embedding of s = 60 added with fastText embedding of s = 60
  nb_col = s + 1
  m = np.zeros((l, nb_col))

  #line number pos of df fills line number pos of the matrix
  for pos in range(l):
    m[pos] = vectfastText(df.iloc[pos], s)

  return m

In [None]:
mat_X_data = matfastText(len(X_data), 60, X_data)
mat_X_data.shape

(82548, 61)

##### Labels selection

In [None]:
#I use again the merged dataset according to ISBN (embeddings and ratings), created before columns selection 
Y_data = df_embed["Book-Rating"]
print(len(Y_data))
Y_data.head()

82548


0    0
1    5
2    0
3    5
4    9
Name: Book-Rating, dtype: int64

In [None]:
#Put on np.ndarray format
def vectLabels(l, df):
  """
  Put the labels on np.ndarray format.

  Inputs :
    l is the number of labels to copy (the l first ones of df)
    df holds the labels, read by position through df.iloc
  Output : an integer np.ndarray of shape (l, 1), one label per line
  """
  labels = np.zeros((l, 1), dtype = int)

  #label number pos of df fills line number pos of the vector
  for pos in range(l):
    labels[pos] = df.iloc[pos]

  return labels

In [None]:
vect_Y_data = vectLabels(len(Y_data), Y_data)
vect_Y_data.shape

(82548, 1)

### Incremental concatenation inside author context

##### Embeddings dataframe

In [None]:
#previous embedding performed through "SingleEmbed_OrdinalReg.ipynb"
#embeddings with FASTTEXT of books title
df_fastText_title = pd.read_csv("/content/drive/MyDrive/ProjetML/title_fastText.csv", sep = "|")
#embeddings with FASTTEXT of books author name
df_fastText_authOnly = pd.read_csv("/content/drive/MyDrive/ProjetML/authOnly_fastText.csv", sep = "|")
#embeddings with FASTTEXT of ["Category", "Category_other", "author_genres", "author_genres_other"]
df_fastText_4cat = pd.read_csv("/content/drive/MyDrive/ProjetML/4cat_fastText.csv", sep = "|")

#embeddings with FASTTEXT of all the 3 previous one 
#df_fastText_auth = pd.read_csv("/content/drive/MyDrive/ProjetML/authorsContext_bis_fastText.csv", sep = "|")

In [None]:
#BookCrossing provided ratings
df_ratings = pd.read_csv("/content/drive/MyDrive/ProjetML/BX-Book-Ratings.csv", sep = ";", error_bad_lines=False, encoding="latin-1")

##### Concatenation of 3 embeddings

In [None]:
#Merge of the 2 first embeddings dataframe, according to ISBN which is a common column
df_temp = pd.merge(df_fastText_title, df_fastText_authOnly, on = "isbn")
print(len(df_temp))
df_temp.head(1)

180738


Unnamed: 0,isbn,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,c_36_x,c_37_x,c_38_x,...,c_20_y,c_21_y,c_22_y,c_23_y,c_24_y,c_25_y,c_26_y,c_27_y,c_28_y,c_29_y,c_30_y,c_31_y,c_32_y,c_33_y,c_34_y,c_35_y,c_36_y,c_37_y,c_38_y,c_39_y,c_40_y,c_41_y,c_42_y,c_43_y,c_44_y,c_45_y,c_46_y,c_47_y,c_48_y,c_49_y,c_50_y,c_51_y,c_52_y,c_53_y,c_54_y,c_55_y,c_56_y,c_57_y,c_58_y,c_59_y
0,439023483,-0.005422,0.660218,-0.328451,0.196469,-0.208463,-0.739556,0.003857,2.9e-05,-0.10831,-0.188837,0.121907,0.466777,-0.005632,0.108326,0.507619,-0.347326,-0.440195,0.084967,0.02898,0.452554,0.684329,0.005698,-0.028753,0.099497,-0.118867,-0.642309,-0.550026,-0.003231,-0.198943,-6.8e-05,0.006332,0.71241,-0.290316,-0.390195,0.938969,-0.497423,0.08414,0.802105,0.138267,...,0.75802,1.916958,-1.282348,0.788074,-0.994454,1.112676,0.297506,-2.641,-0.710001,2.184157,0.040942,0.463229,2.292549,0.579022,0.785905,-1.379118,-0.924367,-0.415781,1.567474,1.657033,-0.720935,-1.475425,0.631479,-0.697283,0.256653,-1.832851,0.873207,-1.199664,0.986724,0.883189,0.239872,-0.122116,-1.573812,1.441412,0.696378,-0.541776,0.127434,0.005944,0.862448,0.026934


In [None]:
#Merge between embeddings and ratings dataframe 
#=> Performed before merging the 3rd embedding dataframe
#=> in order to reduce the number of lines (too much RAM used in Colab else)
df_temp = pd.merge(df_ratings, df_temp, left_on = "ISBN", right_on = "isbn")  
print(len(df_temp))
df_temp.head()

82548


Unnamed: 0,User-ID,ISBN,Book-Rating,isbn,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,...,c_20_y,c_21_y,c_22_y,c_23_y,c_24_y,c_25_y,c_26_y,c_27_y,c_28_y,c_29_y,c_30_y,c_31_y,c_32_y,c_33_y,c_34_y,c_35_y,c_36_y,c_37_y,c_38_y,c_39_y,c_40_y,c_41_y,c_42_y,c_43_y,c_44_y,c_45_y,c_46_y,c_47_y,c_48_y,c_49_y,c_50_y,c_51_y,c_52_y,c_53_y,c_54_y,c_55_y,c_56_y,c_57_y,c_58_y,c_59_y
0,276725,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
1,2313,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
2,6543,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
3,8680,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125
4,10314,034545104X,9,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.18705,-0.891561,-0.876275,-0.983166,-0.250272,0.868906,1.077592,0.327414,-0.617437,2.31233,-0.183084,-1.620542,0.157411,-1.142717,-0.209083,0.816479,-2.794602,0.585446,-0.835561,-1.639691,-0.256602,-2.241315,0.795281,1.976181,0.703924,-1.230887,1.918138,-0.501744,-0.649462,0.933241,-0.099804,1.314793,-0.452294,0.924108,0.718397,1.856,0.174624,2.106044,2.6171,-1.796125


In [None]:
#Merge with the 3rd embedding dataframe, according to ISBN which is a common column
df_embed = pd.merge(df_temp, df_fastText_4cat, on = "isbn")
print(len(df_embed))

df_embed.head()

80770


Unnamed: 0,User-ID,ISBN,Book-Rating,isbn,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,...,c_20,c_21,c_22,c_23,c_24,c_25,c_26,c_27,c_28,c_29,c_30,c_31,c_32,c_33,c_34,c_35,c_36,c_37,c_38,c_39,c_40,c_41,c_42,c_43,c_44,c_45,c_46,c_47,c_48,c_49,c_50,c_51,c_52,c_53,c_54,c_55,c_56,c_57,c_58,c_59
0,276725,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078
1,2313,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078
2,6543,034545104X,0,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078
3,8680,034545104X,5,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078
4,10314,034545104X,9,034545104X,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078


In [None]:
#Overview of the column layout of df_embed after the 3 merges
print("Ratings columns (nb = 3)")
print(df_embed.columns[0:3])

print("\nisbn column used for the merged (nb = 1)")
print(df_embed.columns[3:4])

print("\n1st embedding (nb = 60)")
print(df_embed.columns[4:64])

print("\n2nd embedding (nb = 60)")
print(df_embed.columns[64:124])

#the slice 124:184 holds 60 columns, like the 2 other embeddings
print("\n3rd embedding (nb = 60)")
print(df_embed.columns[124:184])

print("\nSo 3 + 1 + 3*60 = 184 columns")

Ratings columns (nb = 3)
Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

isbn column used for the merged (nb = 1)
Index(['isbn'], dtype='object')

1st embedding (nb = 60)
Index(['c_0_x', 'c_1_x', 'c_2_x', 'c_3_x', 'c_4_x', 'c_5_x', 'c_6_x', 'c_7_x',
       'c_8_x', 'c_9_x', 'c_10_x', 'c_11_x', 'c_12_x', 'c_13_x', 'c_14_x',
       'c_15_x', 'c_16_x', 'c_17_x', 'c_18_x', 'c_19_x', 'c_20_x', 'c_21_x',
       'c_22_x', 'c_23_x', 'c_24_x', 'c_25_x', 'c_26_x', 'c_27_x', 'c_28_x',
       'c_29_x', 'c_30_x', 'c_31_x', 'c_32_x', 'c_33_x', 'c_34_x', 'c_35_x',
       'c_36_x', 'c_37_x', 'c_38_x', 'c_39_x', 'c_40_x', 'c_41_x', 'c_42_x',
       'c_43_x', 'c_44_x', 'c_45_x', 'c_46_x', 'c_47_x', 'c_48_x', 'c_49_x',
       'c_50_x', 'c_51_x', 'c_52_x', 'c_53_x', 'c_54_x', 'c_55_x', 'c_56_x',
       'c_57_x', 'c_58_x', 'c_59_x'],
      dtype='object')

2nd embedding (nb = 60)
Index(['c_0_y', 'c_1_y', 'c_2_y', 'c_3_y', 'c_4_y', 'c_5_y', 'c_6_y', 'c_7_y',
       'c_8_y', 'c_9_y', 'c_10_y', 'c_

##### Columns selection

In [None]:
col = ["User-ID"] + ["c_" + str(i) + "_x" for i in range(60)] + ["c_" + str(i) + "_y" for i in range(60)] + ["c_" + str(i) for i in range(60)]
col

['User-ID',
 'c_0_x',
 'c_1_x',
 'c_2_x',
 'c_3_x',
 'c_4_x',
 'c_5_x',
 'c_6_x',
 'c_7_x',
 'c_8_x',
 'c_9_x',
 'c_10_x',
 'c_11_x',
 'c_12_x',
 'c_13_x',
 'c_14_x',
 'c_15_x',
 'c_16_x',
 'c_17_x',
 'c_18_x',
 'c_19_x',
 'c_20_x',
 'c_21_x',
 'c_22_x',
 'c_23_x',
 'c_24_x',
 'c_25_x',
 'c_26_x',
 'c_27_x',
 'c_28_x',
 'c_29_x',
 'c_30_x',
 'c_31_x',
 'c_32_x',
 'c_33_x',
 'c_34_x',
 'c_35_x',
 'c_36_x',
 'c_37_x',
 'c_38_x',
 'c_39_x',
 'c_40_x',
 'c_41_x',
 'c_42_x',
 'c_43_x',
 'c_44_x',
 'c_45_x',
 'c_46_x',
 'c_47_x',
 'c_48_x',
 'c_49_x',
 'c_50_x',
 'c_51_x',
 'c_52_x',
 'c_53_x',
 'c_54_x',
 'c_55_x',
 'c_56_x',
 'c_57_x',
 'c_58_x',
 'c_59_x',
 'c_0_y',
 'c_1_y',
 'c_2_y',
 'c_3_y',
 'c_4_y',
 'c_5_y',
 'c_6_y',
 'c_7_y',
 'c_8_y',
 'c_9_y',
 'c_10_y',
 'c_11_y',
 'c_12_y',
 'c_13_y',
 'c_14_y',
 'c_15_y',
 'c_16_y',
 'c_17_y',
 'c_18_y',
 'c_19_y',
 'c_20_y',
 'c_21_y',
 'c_22_y',
 'c_23_y',
 'c_24_y',
 'c_25_y',
 'c_26_y',
 'c_27_y',
 'c_28_y',
 'c_29_y',
 'c_30_y',
 'c_31_

In [None]:
  #I just remove the redundant and useless columns after the merge
  X_data = df_embed[col]
  print(X_data.shape)
  X_data.head(1)

(80770, 181)


Unnamed: 0,User-ID,c_0_x,c_1_x,c_2_x,c_3_x,c_4_x,c_5_x,c_6_x,c_7_x,c_8_x,c_9_x,c_10_x,c_11_x,c_12_x,c_13_x,c_14_x,c_15_x,c_16_x,c_17_x,c_18_x,c_19_x,c_20_x,c_21_x,c_22_x,c_23_x,c_24_x,c_25_x,c_26_x,c_27_x,c_28_x,c_29_x,c_30_x,c_31_x,c_32_x,c_33_x,c_34_x,c_35_x,c_36_x,c_37_x,c_38_x,...,c_20,c_21,c_22,c_23,c_24,c_25,c_26,c_27,c_28,c_29,c_30,c_31,c_32,c_33,c_34,c_35,c_36,c_37,c_38,c_39,c_40,c_41,c_42,c_43,c_44,c_45,c_46,c_47,c_48,c_49,c_50,c_51,c_52,c_53,c_54,c_55,c_56,c_57,c_58,c_59
0,276725,-1.130464,0.148696,-1.306536,0.889705,-0.267956,-0.773415,0.114773,0.623024,-1.089605,-0.403646,-0.149804,1.213949,0.255087,0.140718,-0.329618,0.422306,0.329065,0.11583,-0.307084,0.32585,0.111062,-0.618446,-0.978312,0.047283,0.505623,-1.034488,-0.517935,0.479253,-0.094448,0.124497,0.804977,0.840808,-0.493627,-0.268002,0.861804,0.085292,0.841204,0.376636,0.940479,...,0.156207,0.930938,-0.266026,0.435993,-0.340965,0.015139,0.113814,-0.166438,0.510091,0.12864,-0.385744,-0.152725,0.025504,0.024193,-0.07108,-0.34001,0.375992,0.312524,0.203655,0.147381,-0.155797,0.500863,0.408706,0.462043,-0.123094,0.224132,-0.228706,0.204171,-0.62786,0.254188,0.027756,0.242627,0.177675,0.073415,-0.137022,0.481935,-0.378283,0.319737,-0.019637,0.476078


##### Creation of mat_X_data by concatenation 

In [None]:
#In order to normalize those User-ID...even if bad idea to keep them
MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])
print(MIN, MAX)

8 278851


In [None]:
#embeddings are already saved as float inside dataframe columns
#here a columns concatenation is performed

def vectfastText(l, s1, s2, s3):
  """ 
    Turn one line of the merged data into a flat vector (concatenation of the 3 embeddings).

    Inputs :
      l is the line with embeddings: position 0 is the User-ID, 
        followed by the s1 + s2 + s3 embedding values
      s1 is the size of the 1st embedding
      s2 is the size of the 2nd embedding
      s3 is the size of the 3rd embedding
    Output : np.ndarray of float, of length s1 + s2 + s3 + 1 :
      [0]  User-ID min-max normalized with the notebook-level MIN and MAX
      [1:] the 3 embeddings, in the same order as in l
    Raises IndexError if l holds fewer than s1 + s2 + s3 + 1 values
  """

  #size of the vector returned
  #for example fastText embedding of s1 = 60 
  #          + fastText embedding of s2 = 60
  #          + fastText embedding of s3 = 60
  #          + User-ID
  size = s1 + s2 + s3 + 1

  #Values are read by position through NumPy: l[i] with an integer on a pandas line
  #labelled by column names relies on a deprecated positional fallback
  values = np.asarray(l)
  if values.shape[0] < size:
    raise IndexError("line holds {0} values, {1} are needed".format(values.shape[0], size))

  #The 3 embeddings are contiguous in l, so one slice copies them all (astype returns a copy)
  b = values[:size].astype(float)

  #User-ID normalisation
  b[0] = (b[0]-MIN)/(MAX-MIN)

  return b

In [None]:
#treatment of all lines of previously merged in X_data 

def matfastText(l, s1, s2, s3, df):
  """
  Build the features matrix, one line per line of df.

  Inputs :
    l is the number of df lines to convert (the first l lines are used)
    s1 is the size of the 1st embedding
    s2 is the size of the 2nd embedding
    s3 is the size of the 3rd embedding
    df is the merged dataframe (User-ID & embeddings)
  Output : np.ndarray of shape (l, s1 + s2 + s3 + 1) ;
    line k is vectfastText applied to the k-th line of df
  """

  #one column for User-ID, then one column per value of the 3 embeddings
  #(for example 1 + 60 + 60 + 60)
  width = s1 + s2 + s3 + 1
  out = np.zeros((l, width))

  #each line of df is converted on its own and stored at the same position
  for row in range(l):
      out[row] = vectfastText(df.iloc[row], s1, s2, s3)

  return out

In [None]:
mat_X_data = matfastText(len(X_data), 60, 60, 60, X_data)
mat_X_data.shape

(80770, 181)

In [None]:
mat_X_data.shape

(80770, 181)

##### Creation of mat_X_data by addition 

In [None]:
#In order to normalize those User-ID...even if bad idea to keep them
MIN = min(df_embed["User-ID"])
MAX = max(df_embed["User-ID"])
print(MIN, MAX)

8 278851


In [None]:
#embeddings are already saved as float inside dataframe columns
#here the 3 embeddings are added element by element (no concatenation)

def vectfastText(l, s):
  """ 
    Turn one line of the merged data into a flat vector (addition of the 3 embeddings).

    Inputs :
      l is the line with embeddings: position 0 is the User-ID, 
        followed by 3 embeddings of s values each
      s is the common size of the 3 embeddings
    Output : np.ndarray of float, of length s + 1 :
      [0]  User-ID min-max normalized with the notebook-level MIN and MAX
      [1:] element-wise sum of the 3 embeddings
    Raises IndexError if l holds fewer than 3*s + 1 values
  """

  size = 3*s + 1

  #Values are read by position through NumPy: l[i] with an integer on a pandas line
  #labelled by column names relies on a deprecated positional fallback
  values = np.asarray(l)
  if values.shape[0] < size:
    raise IndexError("line holds {0} values, {1} are needed".format(values.shape[0], size))
  values = values[:size].astype(float)

  #vector returned, of size fastText embedding of s = 60 + size of User-ID
  b = np.zeros(s + 1)

  #1st copy of User-ID & normalisation
  b[0] = (values[0]-MIN)/(MAX-MIN)

  emb1 = values[1:(s + 1)]
  emb2 = values[(s + 1):(2*s + 1)]
  emb3 = values[(2*s + 1):(3*s + 1)]

  #Real addition (and no more concatenation) of the 3 embeddings
  #the order (3rd + 1st) + 2nd is the historical one, kept so the floats stay identical
  b[1:] = emb3 + emb1 + emb2

  return b

In [None]:
#treatment of all lines of previously merged in X_data 

def matfastText(l, s, df):
  """
  Build the features matrix, one line per line of df.

  Inputs :
    l is the number of df lines to convert (the first l lines are used)
    s is the size of the 3 embeddings
    df is the merged dataframe (User-ID & embeddings)
  Output : np.ndarray of shape (l, s + 1) ;
    line k is vectfastText applied to the k-th line of df
  """

  #one column for User-ID, then s columns for the embeddings once added together
  #(for example 1 + 60)
  width = s + 1
  out = np.zeros((l, width))

  #each line of df is converted on its own and stored at the same position
  for row in range(l):
      out[row] = vectfastText(df.iloc[row], s)

  return out

In [None]:
mat_X_data = matfastText(len(X_data), 60, X_data)
mat_X_data.shape

(80770, 61)

##### Labels selection

In [None]:
#I use again the merged dataset according to ISBN (embeddings and ratings), created before columns selection 
Y_data = df_embed["Book-Rating"]
print(len(Y_data))
Y_data.head()

80770


0    0
1    5
2    0
3    5
4    9
Name: Book-Rating, dtype: int64

In [None]:
#Put on np.ndarray format
def vectLabels(l, df):
  """
  Copy the first l values of df into a column vector.

  Inputs :
    l is the number of values to copy
    df is the labels container, read by position through .iloc
  Output : np.ndarray of int, of shape (l, 1)
  """
  column = np.zeros((l, 1), dtype = int)

  #the position is the same on both sides: k-th value of df -> k-th row
  for pos in range(l):
      column[pos] = df.iloc[pos]

  return column

In [None]:
vect_Y_data = vectLabels(len(Y_data), Y_data)
vect_Y_data.shape

(80770, 1)

### Selection of corresponding labels

In [None]:
#I use again the merged dataset according to ISBN (embeddings and ratings), created before columns selection 
Y_data = df_embed["Book-Rating"]
print(len(Y_data))
Y_data.head()

64148


0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Book-Rating, dtype: float64

In [None]:
#treatment of all corresponding lines of Y_data
#=> same lines as the one used to create mat_X_data

def vectLabels(l, df):
  """
  Pick one value of df per entry of l.index and stack them into a column vector.

  Inputs :
    l is an object with a length and an .index ; one value is picked per index entry
    df is the labels container, read through .iloc
  Output : np.ndarray of int, of shape (len(l), 1)
  """
  picked = np.zeros((len(l), 1), dtype = int)

  #NOTE(review): the entries of l.index are given to .iloc, so they are used as 
  #positions inside df -- presumably df has a default 0..n-1 index; confirm against the caller
  for row, pos in enumerate(l.index):
      picked[row] = df.iloc[pos]

  return picked

In [None]:
vect_Y_data = vectLabels(lineOK, Y_data)
vect_Y_data.shape

(36009, 1)

# Train / Test split

### Classical split

In [None]:
#mat_X_data: matrix obtained after 
#               ratings & embeddings merge -> embeddings concatenation 
#vect_Y_data: labels transformed in matrix format     
#random_state is fixed so that a re-run gives the same split (and the same class counts)
(X_train, X_test, y_train, y_test) = train_test_split(mat_X_data, vect_Y_data,          
                                           test_size=.2, stratify = vect_Y_data,
                                           random_state = 42)

In [None]:
print("X_train.shape", X_train.shape)
print("X_test.shape", X_test.shape, "\n")

print("y_train.shape", y_train.shape)
print("y_test.shape", y_test.shape)

X_train.shape (64616, 61)
X_test.shape (16154, 61) 

y_train.shape (64616, 1)
y_test.shape (16154, 1)


### imblearn

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import RandomUnderSampler



In [None]:
Counter(np.squeeze(y_train))

Counter({0: 40023,
         1: 93,
         2: 158,
         3: 350,
         4: 486,
         5: 2681,
         6: 2018,
         7: 4244,
         8: 5913,
         9: 3740,
         10: 4910})

In [None]:
#Test 1 : I try to manually put the new classes size

#New number of elements per classes: only over sampled classes
strategy = {1:3000, 2:3000, 3:3000, 4:3000, 5:3000, 6:3000}

over_under_sample = SMOTE(sampling_strategy = strategy)
#labels are flattened to 1-D: the (n, 1) column makes scikit-learn emit a column_or_1d warning
X_smo, y_smo = over_under_sample.fit_resample(X_train, np.ravel(y_train))

#New number of elements per classes: only under sampled classes
strategy = {0:3000, 7:3000, 8:3000, 9: 3000, 10:3000}
undersample = RandomUnderSampler(sampling_strategy = strategy)
X_smo_c, y_smo_c = undersample.fit_resample(X_smo, y_smo)

  y = column_or_1d(y, warn=True)


In [None]:
Counter(np.squeeze(y_smo_c))

Counter({0: 3000,
         1: 3000,
         2: 3000,
         3: 3000,
         4: 3000,
         5: 3000,
         6: 3000,
         7: 3000,
         8: 3000,
         9: 3000,
         10: 3000})

# Ordinal regression

### Loading
2 models are tested :
*   LogisticAT : ordinal regression all-threshold  
*   OrdinalRidge : ridge regression (least squares with L2 penalization) whose predictions are rounded to the nearest integer class; its score is minus the mean absolute error 

In [None]:
!pip install mord
from mord import LogisticAT, OrdinalRidge



### Training / Test

In [None]:
Counter(np.squeeze(y_train))

Counter({0: 40023,
         1: 94,
         2: 157,
         3: 350,
         4: 486,
         5: 2681,
         6: 2018,
         7: 4244,
         8: 5913,
         9: 3740,
         10: 4910})

In [None]:
#Model TRAINING

#It takes about 35 min for desc_bert_768 || AuthorContext_FastText_60
#X_smo_c (imblearn(X_train)), y_smo_c (imblearn(y_train)) are the 1st test to balance classes

LAT = LogisticAT(alpha=0., max_iter = 10000)
LAT.fit(X_smo_c, np.squeeze(y_smo_c))

ORD = OrdinalRidge(alpha=0., max_iter = 10000)
ORD.fit(X_smo_c, np.squeeze(y_smo_c))

OrdinalRidge(alpha=0.0, copy_X=True, fit_intercept=True, max_iter=10000,
             normalize=False, random_state=None, solver='auto', tol=0.001)

In [None]:
Counter(np.squeeze(y_test))

Counter({0: 10006,
         1: 24,
         2: 39,
         3: 87,
         4: 122,
         5: 670,
         6: 505,
         7: 1061,
         8: 1478,
         9: 935,
         10: 1227})

In [None]:
#Model VALIDATION

y_pred_lat = LAT.predict(X_test)
y_pred_ord = ORD.predict(X_test)

### Unbalanced ratings

In [None]:
def ConfMat(y_pred, y_true, n_classes = 11):
  """ 
  Print the misclassified elements for each classes.

  Inputs :
    y_pred are the predicted classes (list or array, any shape with one value per element)
    y_true are the correct classes, integers expected in 0..n_classes-1 ; (n,) or (n, 1)
    n_classes is the number of classes displayed (11 for the ratings 0..10)
  Output : nothing returned ; a table is printed with, for each class, 
    the number of misclassified elements and the percentage it represents inside the class
    (nan when the class has no element), then a TOTAL line
  """

  #Both inputs are flattened, so that a (n, 1) column of labels 
  #and a (n,) vector of predictions are compared element by element
  truth = np.ravel(y_true).astype(int)
  wrong = (np.ravel(y_pred) != truth).astype(float)

  #Total number of element inside each classes
  p = np.bincount(truth, minlength = n_classes)

  #array with the number of misclassified elements for each classes (float, as displayed before)
  compResTab = np.bincount(truth, weights = wrong, minlength = n_classes)

  #Display of the misclassified number / percentage for each classes
  print("Class \tMisclassified \t    Percentage")
  for i in range(n_classes):
     #a class without any element has no percentage: nan is displayed, without dividing by 0
     pct = round(100*(compResTab[i]/p[i]), 2) if p[i] else float("nan")
     print("{0}\t{1}\t\t\t{2}".format(i, compResTab[i], pct))

  total_pct = round(100*compResTab.sum()/truth.size,2) if truth.size else float("nan")
  print("\nTOTAL\t{0}\t\t\t{1}".format(compResTab.sum(), total_pct))

In [None]:
#LogisticAT model

print(Counter(y_pred_lat))

#indexes of misclassified elements
compRes_lat = [i for i in range(len(y_test)) if y_pred_lat[i] != y_test[i]]
print("Percentage of misclassified (LogisticAT): ", (len(compRes_lat) / len(y_test))*100)

#Percentage of misclassified elements inside validation set, for each classes
ConfMat(y_pred_lat, y_test)

Counter({5: 7562, 6: 4033, 4: 3521, 7: 784, 3: 227, 8: 25, 2: 2})
Percentage of misclassified (LogisticAT):  96.71288844868144
Class 	Misclassified 	    Percentage
0	10006.0			100.0
1	24.0			100.0
2	39.0			100.0
3	87.0			100.0
4	96.0			78.69
5	365.0			54.48
6	353.0			69.9
7	1015.0			95.66
8	1476.0			99.86
9	935.0			100.0
10	1227.0			100.0

TOTAL	15623.0			96.71


In [None]:
#OrdinalRidge model

print(Counter(y_pred_ord))

#indexes of misclassified elements
compRes_ord = [i for i in range(len(y_test)) if y_pred_ord[i] != y_test[i]]
print("Percentage of misclassified (OrdinalRidge): ", (len(compRes_ord) / len(y_test))*100)

#Percentage of misclassified elements inside validation set, for each classes
ConfMat(y_pred_ord, y_test)

Counter({5.0: 10508, 6.0: 3198, 4.0: 2339, 7.0: 96, 3.0: 13})
Percentage of misclassified (OrdinalRidge):  96.44670050761421
Class 	Misclassified 	    Percentage
0	10006.0			100.0
1	24.0			100.0
2	39.0			100.0
3	87.0			100.0
4	103.0			84.43
5	237.0			35.37
6	389.0			77.03
7	1055.0			99.43
8	1478.0			100.0
9	935.0			100.0
10	1227.0			100.0

TOTAL	15580.0			96.45


### Binary classes

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
#LogisticAT model

#Confusion matrix
#scikit-learn expects (y_true, y_pred): lines are the correct classes, columns the predicted ones
confusion_matrix_lat = confusion_matrix( y_test, y_pred_lat )
print(confusion_matrix_lat)

#confusion matrix, as percentages of the test set
print("Confusion matrix (LogisticAT) :", 100*(confusion_matrix_lat/len(y_test)), sep = "\n")

#misclassified = everything outside the diagonal (same as [0,1] + [1,0] with 2 classes,
#but still correct with more classes)
print("Percentage of misclassified (LogisticAT): ", ((confusion_matrix_lat.sum() - np.trace(confusion_matrix_lat)) / len(y_test))*100)

[[1891 1466]
 [1670 2175]]
Confusion matrix (LogisticAT) :
[[26.25659539 20.35545682]
 [23.18800333 30.19994446]]
Percentage of misclassified (LogisticAT):  43.54346014995835


In [None]:
#OrdinalRidge model

#Confusion matrix
#scikit-learn expects (y_true, y_pred): lines are the correct classes, columns the predicted ones
confusion_matrix_ord = confusion_matrix( y_test, y_pred_ord )
print(confusion_matrix_ord)

#confusion matrix, as percentages of the test set
print("Confusion matrix (OrdinalRidge) :", 100*(confusion_matrix_ord/len(y_test)), sep = "\n")

#misclassified = everything outside the diagonal (same as [0,1] + [1,0] with 2 classes,
#but still correct with more classes)
print("Percentage of misclassified (OrdinalRidge): ", ((confusion_matrix_ord.sum() - np.trace(confusion_matrix_ord)) / len(y_test))*100)

[[1892 1460]
 [1669 2181]]
Confusion matrix (OrdinalRidge) :
[[26.27048042 20.27214663]
 [23.1741183  30.28325465]]
Percentage of misclassified (OrdinalRidge):  43.44626492640933


# Classical regression

In [None]:
from sklearn.linear_model import LinearRegression

### Unbalanced ratings

In [None]:
#Custom round function to find the closest int from the provided float
def MyRound(x):
  """
  Closest int from the provided float.

  Input : x, the number to round
  Output : math.ceil(x) when x is strictly closer to it than to math.floor(x), 
    math.floor(x) else (so an exact half like 2.5 goes down, to 2)
  """
  lower = math.floor(x)
  upper = math.ceil(x)

  #strict comparison: in case of equal distances the lower int is kept
  if upper - x < x - lower:
    return upper
  return lower

In [None]:
#Model TRAINING

#X_smo_c (imblearn(X_train)), y_smo_c (imblearn(y_train)) are the 1st test to balance classes
model_LinReg = LinearRegression()
model_LinReg.fit(X_smo_c, np.squeeze(y_smo_c))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [None]:
#Model VALIDATION

y_lin_pred = model_LinReg.predict(X_test)

In [None]:
#Regression model prediction values are rounded 
y_lin_pred = list(map(MyRound, y_lin_pred))
print(Counter(y_lin_pred))

#indexes of misclassified elements
compRes_lin = [i for i in range(len(y_test)) if y_lin_pred[i] != y_test[i]]
print("Percentage of misclassified (LinearRegression): ", (len(compRes_lin) / len(y_test))*100)

#Percentage of misclassified elements inside validation set, for each classes
ConfMat(y_lin_pred, y_test)

Counter({5: 10508, 6: 3198, 4: 2339, 7: 96, 3: 13})
Percentage of misclassified (LinearRegression):  96.44670050761421
Class 	Misclassified 	    Percentage
0	10006.0			100.0
1	24.0			100.0
2	39.0			100.0
3	87.0			100.0
4	103.0			84.43
5	237.0			35.37
6	389.0			77.03
7	1055.0			99.43
8	1478.0			100.0
9	935.0			100.0
10	1227.0			100.0

TOTAL	15580.0			96.45


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

### Training / Test

In [None]:
#Model TRAINING

#It takes about 3 min for desc_bert_768 || AuthorContext_FastText_60
#X_smo_c (imblearn(X_train)), y_smo_c (imblearn(y_train)) are the 1st test to balance classes
modele_rf = RandomForestClassifier()
modele_rf.fit(X_smo_c, np.squeeze(y_smo_c))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
#Model VALIDATION

y_predict_rf = modele_rf.predict(X_test)

### Unbalanced ratings

In [None]:
print(Counter(y_predict_rf))

#indexes of misclassified elements
compRes_rf = [i for i in range(len(y_test)) if y_predict_rf[i] != y_test[i]]
print("Percentage of misclassified (RandomForestClassifier): ", (len(compRes_rf) / len(y_test))*100)

#Percentage of misclassified elements inside validation set, for each classes
ConfMat(y_predict_rf, y_test)

Counter({0: 2447, 8: 2136, 7: 2097, 5: 2018, 10: 2000, 9: 1999, 6: 1745, 4: 749, 3: 579, 2: 256, 1: 128})
Percentage of misclassified (RandomForestClassifier):  84.2763402253312
Class 	Misclassified 	    Percentage
0	8416.0			84.11
1	22.0			91.67
2	37.0			94.87
3	85.0			97.7
4	117.0			95.9
5	556.0			82.99
6	450.0			89.11
7	912.0			85.96
8	1251.0			84.64
9	790.0			84.49
10	978.0			79.71

TOTAL	13614.0			84.28


### Binary classes

In [None]:
#confusion matrix
#scikit-learn expects (y_true, y_pred): lines are the correct classes, columns the predicted ones
confusion_matrix_rf = confusion_matrix( y_test, y_predict_rf)
print(confusion_matrix_rf)

#confusion matrix, as percentages of the test set
print("Confusion matrix (RandomForestClassifier) :", 100*(confusion_matrix_rf/len(y_test)), sep = "\n")

#misclassified = everything outside the diagonal (same as [0,1] + [1,0] with 2 classes,
#but still correct with more classes)
print("Percentage of misclassified (RandomForestClassifier): ", ((confusion_matrix_rf.sum() - np.trace(confusion_matrix_rf)) / len(y_test))*100)

[[1799 1579]
 [1762 2062]]
Confusion matrix (RandomForestClassifier) :
[[24.97917245 21.92446543]
 [24.46542627 28.63093585]]
Percentage of misclassified (RandomForestClassifier):  46.3898916967509


# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
#no convergence with default solver ’lbfgs’ (even with max_iter = 10 000)
model_LogReg = LogisticRegression(max_iter = 10000, solver = "liblinear")

In [None]:
#Model TRAINING

model_LogReg.fit(X_train, np.squeeze(y_train))

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=10000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [None]:
#Model VALIDATION
y_predict_LogReg = model_LogReg.predict(X_test)

In [None]:
#confusion matrix
#scikit-learn expects (y_true, y_pred): lines are the correct classes, columns the predicted ones
confusion_matrix_LogReg = confusion_matrix( y_test, y_predict_LogReg)
print(confusion_matrix_LogReg)

#confusion matrix, as percentages of the test set
print("Confusion matrix (LogisticRegression) :", 100*(confusion_matrix_LogReg/len(y_test)), sep = "\n")

#misclassified = everything outside the diagonal (same as [0,1] + [1,0] with 2 classes,
#but still correct with more classes)
print("Percentage of misclassified (LogisticRegression): ", ((confusion_matrix_LogReg.sum() - np.trace(confusion_matrix_LogReg)) / len(y_test))*100)

[[1842 1445]
 [1719 2196]]
Confusion matrix (LogisticRegression) :
[[25.57622883 20.06387115]
 [23.8683699  30.49153013]]
Percentage of misclassified (LogisticRegression):  43.9322410441544
