**Proyecto**

### 1. Análisis Exploratorio

In [66]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomTreesEmbedding
#from sklearn.neural_network import BernoulliRBM

In [None]:
# Load the books catalogue. Malformed rows are skipped instead of aborting the load.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
df0 = pd.read_csv("books.csv", on_bad_lines="skip")
df0.head()

In [None]:
# Display summary statistics of the raw frame.
df0.describe()

In [74]:
# Strip surrounding whitespace from every column name.
df0.columns = [nombre.strip() for nombre in df0.columns]

In [87]:
# Keep only the rows we treat as real books: at least 5 pages and a publisher
# whose name does not contain "audio" (case-insensitive).
# `na=False` makes a missing publisher count as "not audio"; without it the mask
# would contain NaN and the `~` negation / boolean filter would fail.
es_audio = df0['publisher'].str.contains("Audio", case=False, na=False)
df = df0.loc[(df0['num_pages'] >= 5) & ~es_audio]
# reset_index() without drop=True keeps the old row labels in an "index" column.
df = df.reset_index()

In [102]:
# Add the publication-year column ("año").
# publication_date is split on "/" and its third component is stored
# (presumably the dates are month/day/year -- confirm against the dataset).
# The vectorised .str accessor replaces the row-by-row Python loop and no longer
# relies on the frame having a 0..n-1 index. Values remain strings, as before.
df["año"] = df["publication_date"].str.split("/").str[2]

In [None]:
# Show the distinct language codes (sorted).
np.unique(df["language_code"].to_numpy())

In [None]:
# Fold the regional English codes into the single code "eng", then show the
# distinct codes that remain.
variantes_ingles = ["en-CA", "en-GB", "en-US"]
df.loc[df["language_code"].isin(variantes_ingles), "language_code"] = "eng"
np.unique(df["language_code"])

In [None]:
# Preview the first rows of the filtered frame, now including the "año" column.
df.head()

In [None]:
# Display summary statistics of the filtered frame.
df.describe()

In [None]:
# Bar chart of the number of titles per language code, with each bar's count
# annotated just above it.
sns.set_context('paper')
plt.figure(figsize=(15,10))
libros_por_idioma = df.groupby('language_code')['title'].count()
ax = libros_por_idioma.plot.bar()
plt.title('Language Code')
plt.xticks(fontsize = 15)
for barra in ax.patches:
    altura = barra.get_height()
    ax.annotate(str(altura), (barra.get_x()-0.3, altura+100))

In [None]:
# Joint plot of average rating (x) against number of pages (y).
# jointplot is a figure-level function that creates its own figure, so the former
# plt.figure(figsize=(15,10)) call had no effect on it and only left an empty
# figure behind. The size is now set through `height` (joint plots are square).
# Note: `ax` holds the object returned by jointplot, not a matplotlib Axes.
sns.set_context('paper')
ax = sns.jointplot(x="average_rating", y="num_pages", data = df, color = 'crimson', height=10)
ax.set_axis_labels("Average Rating", "Number of Pages")

In [95]:
# Helper that picks out the first author of an authors string
def primer_autor(text):
    """Return the part of ``text`` before its first '/' (all of it if there is no '/')."""
    primero, _, _ = text.partition('/')
    return primero

In [None]:
# Add the first-author column, then sum average_rating per first author,
# counting each (first author, title) pair only once. Highest totals first.
df['primer_autor'] = df['authors'].apply(primer_autor)
total_rating = (
    df.drop_duplicates(subset=['primer_autor', 'title'], keep='first')
    .groupby('primer_autor')['average_rating']
    .sum()
    .rename('total_rating')
    .reset_index()
    .sort_values(by='total_rating', ascending=False)
)
total_rating

In [None]:
# Number of distinct titles per first author, in descending order.
total_book = (
    df.groupby('primer_autor')['title']
    .nunique()
    .rename('total_book')
    .reset_index()
    .sort_values(by='total_book', ascending=False)
)
total_book

In [None]:
# "adjusted_rating": a more balanced per-author score that takes into account both
# the number of books and their mean rating.
avg_author = pd.merge(total_book, total_rating, on='primer_autor', how='outer')
avg_author['average_rating'] = round(avg_author['total_rating'] / avg_author['total_book'], 2)
# .copy() so the column assignment below writes to an independent frame instead
# of a filtered selection (avoids SettingWithCopyWarning).
avg_author = avg_author[avg_author['total_book'] > 0].copy()
# Fix (operator precedence): `x / 20*mean` is evaluated as `(x / 20) * mean`.
# The divisor is presumably meant to be the whole product 20 * mean -- confirm the
# intended formula. Both forms differ only by a constant factor, so the ranking
# produced below is the same either way.
peso = 20 * avg_author['total_book'].mean()
avg_author['adjusted_rating'] = avg_author['average_rating'] * (avg_author['total_book'] + peso) / peso
avg_author = avg_author.sort_values(by=['adjusted_rating'], ascending=False)
avg_author.head(10)

In [None]:
# Bar chart of the first 20 rows of avg_author (sorted by adjusted_rating, descending).
top_autores = avg_author.head(20)
sns.barplot(data=top_autores, x="adjusted_rating", y="primer_autor")

In [None]:
# Per-year frame: for each "año", the count of non-null "authors" entries.
aux = df.groupby("año")
df_años = aux[["authors"]].count().reset_index()
df_años

In [None]:
# Horizontal bar chart: count per publication year (years on y, counts on x).
fig = plt.figure(figsize=(30, 60))
sns.barplot(data=df_años, x="authors", y="año")
plt.xlabel("cantidad", fontsize=42)
plt.ylabel("año", fontsize=42)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.show()

### 2.1 Preprocesamiento

In [None]:
# Take the numeric columns and standardise them with StandardScaler.
columnas_numericas = ["average_rating","año","num_pages","text_reviews_count","ratings_count"]
scaler = StandardScaler()
# The scaled array carries no row labels. Re-attach df's own index so the later
# column-wise concat with the language dummies (which keep df's index) lines up
# row for row even when df's index is not exactly 0..n-1.
df_numerico = pd.DataFrame(
    scaler.fit_transform(df[columnas_numericas]),
    columns=columnas_numericas,
    index=df.index,
)
df_numerico.head()

In [None]:
# One-hot encode the language column.
df_cat = pd.get_dummies(df[["language_code"]])
df_cat

In [None]:
# Put the standardised numeric columns and the language dummy columns side by side.
bloques = [df_numerico, df_cat]
df_procesado = pd.concat(bloques, axis="columns")
df_procesado

### 2.2 NearestNeighbors 

In [None]:
# Nearest-neighbours model. Six neighbours are requested, and the model is queried
# with the same rows it was fitted on; the recommender cell skips the entry equal
# to the queried row when printing suggestions.
model = NearestNeighbors(n_neighbors=6)
fit_model = model.fit(df_procesado)
distance, indices = fit_model.kneighbors(X=df_procesado, return_distance=True)
indices

In [125]:
# Recommender: print the title of the chosen book, then the titles of its
# nearest neighbours (skipping the book itself).
# `book_id` replaces the former variable `id`, which shadowed the builtin id().
# `indices` contains row positions, so titles are looked up by position (.iloc)
# instead of by index label.
book_id = 0
print(df["title"].iloc[book_id])
print("Se recomienda leer: ")
for vecino in indices[book_id]:
  if vecino != book_id:
    print(df["title"].iloc[vecino])

Harry Potter and the Half-Blood Prince (Harry Potter  #6)
Se recomienda leer: 
Harry Potter and the Order of the Phoenix (Harry Potter  #5)
Animal Farm
Lord of the Flies
Harry Potter and the Chamber of Secrets (Harry Potter  #2)
Harry Potter and the Prisoner of Azkaban (Harry Potter  #3)


## **4**. **CÓDIGO**

In [29]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import GridSearchCV

from sklearn.neighbors import NearestNeighbors
from sklearn.ensemble import RandomTreesEmbedding
#from sklearn.neural_network import BernoulliRBM
#.

In [None]:
# Load the books catalogue. Malformed rows are skipped instead of aborting the load.
# `on_bad_lines="skip"` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.

df = pd.read_csv("books.csv", on_bad_lines="skip")
df.head()
#.

In [None]:
# Show the distinct language codes of the books (sorted).

np.unique(df["language_code"].to_numpy())
#.

In [None]:
# Collapse the regional English codes into "eng", then show the remaining codes.

variantes_ingles = ["en-CA", "en-GB", "en-US"]
df.loc[df["language_code"].isin(variantes_ingles), "language_code"] = "eng"
np.unique(df["language_code"])
#.

In [None]:
# Bar chart of the number of titles per language code, with each bar's count
# annotated just above it.

sns.set_context('paper')
plt.figure(figsize=(15,10))
libros_por_idioma = df.groupby('language_code')['title'].count()
ax = libros_por_idioma.plot.bar()
plt.title('Language Code')
plt.xticks(fontsize = 15)
for barra in ax.patches:
    altura = barra.get_height()
    ax.annotate(str(altura), (barra.get_x()-0.3, altura+100))
#.

In [None]:
# Display summary statistics of the loaded frame.
df.describe()

In [34]:
# Strip surrounding whitespace from every column name.
df.columns = [nombre.strip() for nombre in df.columns]
#.

In [35]:
# Rows whose authors field is exactly "NOT A BOOK".

es_no_libro = df['authors'].eq("NOT A BOOK")
df_not_a_book = df[es_no_libro]
#.

In [None]:
# Count the distinct publishers among the "NOT A BOOK" rows.

a = np.unique(df_not_a_book["publisher"].to_numpy())
a.shape
#.

In [None]:
# Rows with fewer than 5 pages, and the distinct publishers found among them.
# Tip: check how many rows fall below the 5-page threshold.

df_libro_chiquito = df[df['num_pages'].lt(5)]
np.unique(df_libro_chiquito["publisher"])
#.

In [38]:
# Frame with the entries that count as books under our criteria: at least 5 pages
# and a publisher whose name does not contain "audio" (case-insensitive).
# `na=False` makes a missing publisher count as "not audio"; without it the mask
# would contain NaN and the `~` negation / boolean filter would fail.
# (A commented-out, non-working earlier attempt at this filter was removed.)
es_audio = df['publisher'].str.contains("Audio", case=False, na=False)
df_libros = df.loc[(df['num_pages'] >= 5) & ~es_audio]
# reset_index() without drop=True keeps the old row labels in an "index" column.
df_libros = df_libros.reset_index()
#.

In [None]:
# Joint plot of average rating (x) against number of pages (y).
# jointplot is a figure-level function that creates its own figure, so the former
# plt.figure(figsize=(15,10)) call had no effect on it and only left an empty
# figure behind. The size is now set through `height` (joint plots are square).
# Note: `ax` holds the object returned by jointplot, not a matplotlib Axes.

sns.set_context('paper')
ax = sns.jointplot(x="average_rating", y="num_pages", data = df, color = 'crimson', height=10)
ax.set_axis_labels("Average Rating", "Number of Pages")
#.

In [None]:
# Helper that identifies the first author of an authors string

def primer_autor(text):
    """Return everything in ``text`` up to its first '/' (all of it if there is no '/')."""
    return text.split('/', 1)[0]
# Create the first-author column, then sum average_rating per first author,
# counting each (first author, title) pair only once. Highest totals first.
df['primer_autor'] = df['authors'].apply(primer_autor)

total_rating = (
    df.drop_duplicates(subset=['primer_autor', 'title'], keep='first')
    .groupby('primer_autor')['average_rating']
    .sum()
    .rename('total_rating')
    .reset_index()
    .sort_values(by='total_rating', ascending=False)
)
total_rating
#.

In [None]:
# Number of distinct titles per first author, in descending order.
total_book = (
    df.groupby('primer_autor')['title']
    .nunique()
    .rename('total_book')
    .reset_index()
    .sort_values(by='total_book', ascending=False)
)
total_book
#.

In [None]:
# Per-author score combining the number of books with their mean rating.
avg_author = pd.merge(total_book, total_rating, on='primer_autor', how='outer')
avg_author['average_rating'] = round(avg_author['total_rating'] / avg_author['total_book'], 2)
# .copy() so the column assignment below writes to an independent frame instead
# of a filtered selection (avoids SettingWithCopyWarning).
avg_author = avg_author[avg_author['total_book'] > 0].copy()
# Fix (operator precedence): `x / 20*mean` is evaluated as `(x / 20) * mean`.
# The divisor is presumably meant to be the whole product 20 * mean -- confirm the
# intended formula. Both forms differ only by a constant factor, so the ranking
# produced below is the same either way.
peso = 20 * avg_author['total_book'].mean()
avg_author['adjusted_rating'] = avg_author['average_rating'] * (avg_author['total_book'] + peso) / peso
avg_author = avg_author.sort_values(by=['adjusted_rating'], ascending=False)
avg_author.head(10)
#.

In [None]:
# Bar chart of the first 20 rows of avg_author (sorted by adjusted_rating, descending).
top_autores = avg_author.head(20)
sns.barplot(data=top_autores, x="adjusted_rating", y="primer_autor")
#.

In [None]:
# Add the publication-year column ("año"): publication_date is split on "/" and
# its third component is stored (presumably month/day/year -- confirm against the
# dataset). Values remain strings.
# The former loop also zero-padded the first two date components, but only the
# third one was ever stored, so that padding was dead code and has been dropped.
# The vectorised .str accessor also removes the dependence on a 0..n-1 index.
df["año"] = df["publication_date"].str.split("/").str[2]
#.

In [None]:
# Preview the first rows, now including the "año" column.
df.head()


In [None]:
# Show the distinct publication years (sorted).
np.unique(df["año"].to_numpy())

In [None]:
# Per-year frame: for each "año", the count of non-null "authors" entries.
grupo = df.groupby("año")
anios = grupo[["authors"]].count().reset_index()
anios
#.

In [None]:
# Horizontal bar chart: count per publication year (years on y, counts on x).
fig = plt.figure( figsize= (30,60))
sns.barplot(
    y = "año",
    x = "authors",
    data = anios
)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
# Fix: the axis labels were swapped. The x axis shows the counts ("cantidad")
# and the y axis shows the years ("año").
plt.xlabel("cantidad",fontsize = 42)
plt.ylabel("año",fontsize = 42)
plt.show()
#.

HASTA AQUÍ ESTAMOS BIEN

- Ahora, propongo usar el k-means para crear un modelo de clasificación piola y después vemos qué hacemos con los otros modelos de clasificación y qué tanto nos sirve cada uno.

In [None]:
# Take the numeric columns and standardise them with StandardScaler.
columnas_numericas = ["average_rating","año","num_pages","text_reviews_count","ratings_count"]
scaler = StandardScaler()
# The scaled array carries no row labels. Re-attach df's own index so the later
# column-wise concat with the language dummies (which keep df's index) lines up
# row for row even when df's index is not exactly 0..n-1.
df_numerico = pd.DataFrame(
    scaler.fit_transform(df[columnas_numericas]),
    columns=columnas_numericas,
    index=df.index,
)
df_numerico.head()
#.

In [None]:
# One-hot encode the language column.
df_cat = pd.get_dummies(df[["language_code"]])
df_cat
#.

In [None]:
# Put the standardised numeric columns and the language dummy columns side by side.
bloques = [df_numerico, df_cat]
df_procesado = pd.concat(bloques, axis="columns")
df_procesado
#.

In [118]:
# Fit a 6-neighbour model on the processed features and query it with the same
# rows; `indices` holds the neighbour row positions for every row.
model = NearestNeighbors(n_neighbors=6)
fit_model = model.fit(df_procesado)
distance, indices = fit_model.kneighbors(X=df_procesado, return_distance=True)
indices
#.

array([[    0,     1,  2081,  2083,  4339,     3],
       [    1,     0,  4339,  2081,     3,  2083],
       [    2,  9764,  6763,  3048,  3053,  8733],
       ...,
       [10891,  3084,  2308,  4976,  2706,  1824],
       [10892,  6612,  3728,  5848,  6726,  7358],
       [10893,   350,  3426,  4416, 10846,  1427]])

In [None]:
# Shape of the neighbour array: one row per queried book, one column per neighbour.
indices.shape

(11123, 6)

In [119]:
# Spot check: title stored under label 10893, the last row number visible in the
# `indices` output above.
df["title"][10893]

'Las aventuras de Tom Sawyer'