# CLEANING AND FORMATTING DATA

In [1]:
import pandas as pd

# Loading the five top lists, one CSV file per category.
# The files are read in the order listed and bound to the names on the left.

(
    top_ficcion,
    top_no_ficcion,
    top_comics,
    top_juveniles,
    top_ficcion_mas_leido,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'top_ficcion.csv',
        'top_no_ficcion.csv',
        'top_comics.csv',
        'top_juveniles.csv',
        'top_ficcion_mas_leido.csv',
    )
)


In [2]:
# FICTION category: previewing the first 10 rows of the DataFrame

top_ficcion.head(10)

Unnamed: 0,title,author,version,price
0,NOVIA,ALI HAZELWOOD,Tapa blanda,1800
1,TRES ENIGMAS PARA LA ORGANIZACIÓN,EDUARDO MENDOZA,Tapa blanda,2080
2,EL HIJO OLVIDADO,MIKEL SANTIAGO,Tapa blanda,2175
3,LA PACIENTE SILENCIOSA,ALEX MICHAELIDES,Tapa blanda,1040
4,EL BARRACON DE LAS MUJERES,FERMINA CAÑAVERAS,Tapa blanda,1985
5,PARÍS DESPERTABA TARDE,MAXIM HUERTA,Tapa dura,1985
6,LAS HIJAS DE LA CRIADA (PREMIO PLANETA 2023),SONSOLES ONEGA,Tapa dura,2175
7,LA RED PÚRPURA (EDICIÓN LIMITADA) (LA NOVIA GI...,CARMEN MOLA,Tapa blanda,755
8,EL ANGEL DE LA CIUDAD ((EJEMPLAR FIRMADO POR L...,EVA GARCIA SAENZ DE URTURI,Tapa dura,2080
9,SCARRED: UNA HISTORIA DE NUNCA JAMAS,EMILY MCINTIRE,Tapa blanda,1990


In [3]:
# Checking the number of rows and columns in the DataFrame (shape is a (rows, columns) tuple)

top_ficcion.shape

(10, 4)

In [4]:
# Checking the DataFrame's columns, their data types, non-null counts, and memory usage.
# NOTE(review): `price` is reported with dtype object rather than a numeric dtype —
# confirm whether it needs converting before any numeric analysis.

top_ficcion.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    10 non-null     object
 1   author   10 non-null     object
 2   version  10 non-null     object
 3   price    10 non-null     object
dtypes: object(4)
memory usage: 452.0+ bytes


In [5]:
# Checking for NaNs: counting the missing values in each column

top_ficcion.isna().sum()

title      0
author     0
version    0
price      0
dtype: int64

In [6]:
# Checking for duplicates: duplicated() returns one boolean per row,
# True where the row repeats an earlier row

top_ficcion.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8    False
9    False
dtype: bool

In [7]:
# Upper-casing every column name

top_ficcion.columns = list(map(str.upper, top_ficcion.columns))
top_ficcion.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
0,NOVIA,ALI HAZELWOOD,Tapa blanda,1800
1,TRES ENIGMAS PARA LA ORGANIZACIÓN,EDUARDO MENDOZA,Tapa blanda,2080
2,EL HIJO OLVIDADO,MIKEL SANTIAGO,Tapa blanda,2175
3,LA PACIENTE SILENCIOSA,ALEX MICHAELIDES,Tapa blanda,1040
4,EL BARRACON DE LAS MUJERES,FERMINA CAÑAVERAS,Tapa blanda,1985


In [8]:
# Changing the text in the TITLE column with capitalize().
# capitalize() upper-cases only the first character and lower-cases everything else,
# so proper nouns and words after a full stop inside a title end up in lower case.

top_ficcion["TITLE"] = top_ficcion["TITLE"].str.capitalize()
top_ficcion.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
0,Novia,ALI HAZELWOOD,Tapa blanda,1800
1,Tres enigmas para la organización,EDUARDO MENDOZA,Tapa blanda,2080
2,El hijo olvidado,MIKEL SANTIAGO,Tapa blanda,2175
3,La paciente silenciosa,ALEX MICHAELIDES,Tapa blanda,1040
4,El barracon de las mujeres,FERMINA CAÑAVERAS,Tapa blanda,1985


In [9]:
# Changing the text in the AUTHOR column with title().
# title() upper-cases the first letter of every word and lower-cases the rest.

top_ficcion["AUTHOR"] = top_ficcion["AUTHOR"].str.title()
top_ficcion.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
0,Novia,Ali Hazelwood,Tapa blanda,1800
1,Tres enigmas para la organización,Eduardo Mendoza,Tapa blanda,2080
2,El hijo olvidado,Mikel Santiago,Tapa blanda,2175
3,La paciente silenciosa,Alex Michaelides,Tapa blanda,1040
4,El barracon de las mujeres,Fermina Cañaveras,Tapa blanda,1985


In [10]:
# Changing the text in the VERSION column to all lower case with lower()

top_ficcion["VERSION"] = top_ficcion["VERSION"].str.lower()
top_ficcion.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
0,Novia,Ali Hazelwood,tapa blanda,1800
1,Tres enigmas para la organización,Eduardo Mendoza,tapa blanda,2080
2,El hijo olvidado,Mikel Santiago,tapa blanda,2175
3,La paciente silenciosa,Alex Michaelides,tapa blanda,1040
4,El barracon de las mujeres,Fermina Cañaveras,tapa blanda,1985


In [11]:
# Renumbering the DataFrame index from 1 so it reads as the position in the top 10.
# An explicit 1..n RangeIndex is used instead of `index + 1` so the cell is idempotent:
# running it a second time no longer pushes the positions to 2..11.

top_ficcion.index = pd.RangeIndex(start=1, stop=len(top_ficcion) + 1)
top_ficcion.head()


Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
1,Novia,Ali Hazelwood,tapa blanda,1800
2,Tres enigmas para la organización,Eduardo Mendoza,tapa blanda,2080
3,El hijo olvidado,Mikel Santiago,tapa blanda,2175
4,La paciente silenciosa,Alex Michaelides,tapa blanda,1040
5,El barracon de las mujeres,Fermina Cañaveras,tapa blanda,1985


In [17]:
#Applying the preprocessing to the other DataFrames

def preprocess_dataframe(df):
    """Normalise the text formatting of a top-list DataFrame in place.

    The steps are:

    1. upper-case every column name;
    2. ``TITLE``: first character upper case, everything else lower case;
    3. ``AUTHOR``: title case (first letter of each word upper case);
    4. ``VERSION``: lower case;
    5. renumber the index as 1..len(df).

    The function is idempotent: calling it again on an already processed
    frame leaves the frame unchanged (the index is set to 1..n rather than
    shifted by one on every call).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are strings and which contains, in any
        letter case, the columns ``title``, ``author`` and ``version``.
        It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, after modification.

    Raises
    ------
    KeyError
        If one of the required columns is missing. The check happens before
        anything is changed, so ``df`` is left untouched in that case.
    """
    upper_names = [col.upper() for col in df.columns]

    # Validate first so a bad frame is never left with renamed columns only.
    missing = [name for name in ("TITLE", "AUTHOR", "VERSION") if name not in upper_names]
    if missing:
        raise KeyError(f"missing required column(s): {missing}")

    # Convert column names to uppercase
    df.columns = upper_names

    # Capitalize the 'TITLE' column
    df["TITLE"] = df["TITLE"].str.capitalize()

    # Convert the 'AUTHOR' column to title case
    df["AUTHOR"] = df["AUTHOR"].str.title()

    # Convert the 'VERSION' column to lowercase
    df["VERSION"] = df["VERSION"].str.lower()

    # Reset the index to run from 1 to len(df); assigning a fresh RangeIndex
    # (rather than adding 1 to the current index) makes repeated calls safe.
    df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

    return df


# The fiction frame was already cleaned step by step above, so only the
# four remaining categories are collected here.
tops = [top_no_ficcion, top_comics, top_juveniles, top_ficcion_mas_leido]


# Run the preprocessing function over each DataFrame in the list.
# The function edits each frame in place and returns it, so the original
# variables (top_no_ficcion, top_comics, ...) are updated as well.
preprocessed_dfs = list(map(preprocess_dataframe, tops))

In [18]:
# Checking that the preprocessing was applied to the NON-FICTION category

top_no_ficcion.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
1,Algo que sirva como luz,Fernando Navarro Cano,tapa blanda,1890
2,Tu cerebro tiene hambre,Boticaria Garcia,tapa blanda,1890
3,Deja de ser tu. la mente crea la realidad,Joe Dispenza,otros,1710
4,Carne gobernada,Fernando Savater,tapa blanda,1985
5,Habitos atomicos,James Clear,tapa blanda,1890


In [19]:
# Checking that the preprocessing was applied to the COMICS category

top_comics.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
1,El pequeño libro del amor,72 Kilos,tapa dura,1700
2,Heartstopper 5. creciendo contigo,Alice Oseman,tapa blanda,1515
3,El abismo del olvido,Paco Roca Y Rodrigo Terrasa,tapa dura,2375
4,Focus - alvaro martínez bueno: the nice house ...,James Tynion Iv,tapa dura,3277
5,El viaje de shuna,Hayao Miyazaki,tapa dura,2280


In [20]:
# Checking that the preprocessing was applied to the JUVENILES category

top_juveniles.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
1,Alas de hierro (empireo 2),Rebecca Yarros,tapa dura,2270
2,Todos los lugares que mantuvimos en secreto (e...,Inma Rubiales,tapa dura,1795
3,Un fuego en la carne (saga de carne y fuego 3),Jennifer L. Armentrout,otros,1890
4,Casa de llama y sombra (ciudad medialuna 3),Sarah J. Maas,tapa blanda,2085
5,Una corte de rosas y espinas. edición especial,Sarah J. Maas,tapa dura,2180


In [21]:
# Checking that the preprocessing was applied to the TOP FICTION MOST READ category

top_ficcion_mas_leido.head()

Unnamed: 0,TITLE,AUTHOR,VERSION,PRICE
1,La chica del verano (novela),La Vecina Rubia,tapa blanda,1890
2,El problema final,Arturo Perez Reverte,tapa dura,2080
3,Alas de sangre (empireo 1),Rebecca Yarros,tapa dura,2175
4,El cuco de cristal,Javier Castillo,tapa blanda,1985
5,Las hijas de la criada (premio planeta 2023),Sonsoles Onega,tapa dura,2175


In [22]:
# Writing each cleaned DataFrame to its own CSV file for further work.
# index=False keeps the row index out of the written files.

for frame, csv_path in (
    (top_ficcion, 'top_ficcion_clean.csv'),
    (top_no_ficcion, 'top_no_ficcion_clean.csv'),
    (top_comics, 'top_comics_clean.csv'),
    (top_juveniles, 'top_juveniles_clean.csv'),
    (top_ficcion_mas_leido, 'top_ficcion_mas_leido_clean.csv'),
):
    frame.to_csv(csv_path, index=False)