In [None]:
!pip pandas
!pip os 

<span style="font-family: 'Bahnschrift Light'; font-size: 22px">Cleaning up data</span>

In [54]:
import os 

# All locations are derived from the directory the notebook is started in.
path_current = os.getcwd()
# Raw CSV: one level above the working directory.
path_data = os.path.abspath(os.path.join(path_current, os.pardir, 'google_books_1299.csv'))
# Destination folder for the exported files.
path_data_clean = os.path.abspath(os.path.join(path_current, 'src', 'assets'))

In [55]:
import pandas as pd
# CSV whose 'link' column is later copied into df['images'].
# The default is the original absolute path; set the LINKS_CSV_PATH environment
# variable to run the notebook on another machine without editing this cell.
path_links = os.environ.get(
    'LINKS_CSV_PATH',
    'C:/Users/jemss/OneDrive/Escritorio/react/links_output.csv',
)
union_ = pd.read_csv(path_links, sep=';')  # the file is ';'-separated

In [57]:
import pandas as pd

try:
    df = pd.read_csv(path_data, encoding="utf-8") # Uploading data
except UnicodeDecodeError as e:
    # Report the decoding problem and stop here: without a freshly loaded frame
    # the steps below would hit a NameError or silently reuse a stale `df`.
    print(f"Error: {e}")
    raise

df = df.head(50).reset_index(drop=True) # Limiting to 50 rows
df = df.drop_duplicates(subset='title').reset_index(drop=True) # Dropping duplicates
try:
    # Series assignment aligns on the index: row i receives union_['link'][i],
    # and rows with no matching label in union_ get NaN.
    df['images'] = union_['link'] # Adding images
except ValueError as e:
    print(f"Error: {e}")

# Dropping unnecessary columns
df = df.drop(columns=['currency']) 
df = df.rename(columns={'Unnamed: 0': 'index'})

# Converting columns to appropriate data types
# price: everything from the first '.' onward becomes '.000' (e.g. '43.0' -> '43.000');
# the column stays a string.
df['price'] = df['price'].astype(str).str.replace(r'\..*', '.000', regex=True)
# voters: strip thousands separators, treat missing values ('nan') as 0, then cast to int.
df['voters'] = df['voters'].astype(str).str.replace(',', '', regex=False).replace('nan', '0').astype(int)
# rating: non-numeric values become 0.0.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0).astype(float)
# published_date: every ',' is replaced by ' de' (a literal comma, so no regex is needed).
df['published_date'] = df['published_date'].astype(str).str.strip().str.replace(',', ' de', regex=False)
# generes: the placeholder 'none' is shown as 'Undefined'.
df['generes'] = df['generes'].replace('none', 'Undefined') 

In [58]:
# Show the column labels of the cleaned frame
df.columns

Index(['index', 'title', 'author', 'rating', 'voters', 'price', 'description',
       'publisher', 'page_count', 'generes', 'ISBN', 'language',
       'published_date', 'images'],
      dtype='object')

In [59]:
# Cleaning the writing of genres
# Normalise the genre text one step at a time
genres_text = df['generes'].str.replace('&amp', '', regex=False)         # drop '&amp' fragments
genres_text = genres_text.str.replace('&', 'and', regex=False)           # spell out remaining '&'
genres_text = genres_text.str.replace(r'\s*\([^)]*\)', '', regex=True)   # Remove text in parentheses
df['generes'] = genres_text.str.strip()

In [60]:
# Dtypes and non-null counts per column, to check the result of the cleaning steps
df.info() # Displaying DataFrame information

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           45 non-null     int64  
 1   title           45 non-null     object 
 2   author          45 non-null     object 
 3   rating          45 non-null     float64
 4   voters          45 non-null     int64  
 5   price           45 non-null     object 
 6   description     45 non-null     object 
 7   publisher       45 non-null     object 
 8   page_count      45 non-null     int64  
 9   generes         45 non-null     object 
 10  ISBN            45 non-null     object 
 11  language        45 non-null     object 
 12  published_date  45 non-null     object 
 13  images          45 non-null     object 
dtypes: float64(1), int64(3), object(10)
memory usage: 5.1+ KB


In [123]:
# Median over the distinct vote counts (each value is counted once)
u = pd.Series(sorted(df['voters'].unique()))
print(u.median())

135.0


In [125]:
# Average rating, followed by the number of books rated strictly above it
media = df['rating'].mean()
above_mean = df.loc[df['rating'] > media]
print(len(above_mean))
print(media)

32
4.24888888888889


In [61]:
def change_format_to_json(df, route, kwargs): # Function to convert DataFrame to JSON
    """Write ``df`` to ``<route>/<kwargs['path_or_buf']>.json``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to serialise.
    route : str
        Directory that receives the JSON file.
    kwargs : dict
        Keyword arguments for ``DataFrame.to_json``. The ``'path_or_buf'``
        entry holds the output file name without extension; every other entry
        is forwarded unchanged. The dictionary itself is not modified.

    Returns
    -------
    None
        Any failure is reported with ``print`` instead of being raised.
    """
    try:
        # Work on a copy so the caller's dict keeps 'path_or_buf' and can be reused.
        options = dict(kwargs)
        name = options.pop('path_or_buf')
        # The name is looked up outside the f-string: reusing the enclosing quote
        # type inside an f-string is a SyntaxError before Python 3.12.
        filename = os.path.abspath(f'{route}/{name}.json')
        df.to_json(filename, **options)
    except Exception as e:
        print(f"Error: {e}")
        
# Export settings: output file name (without extension) plus DataFrame.to_json options
params = dict(
    path_or_buf='google_books_1299_clean_1',
    orient='records',
    force_ascii=False,
)
change_format_to_json(df, path_data_clean, params)

In [62]:
# Preview the first five rows of the cleaned frame
df.head(5)

Unnamed: 0,index,title,author,rating,voters,price,description,publisher,page_count,generes,ISBN,language,published_date,images
0,0,Attack on Titan: Volume 13,Hajime Isayama,4.6,428,43.0,NO SAFE PLACE LEFT At great cost to the Garris...,Kodansha Comics,192,Undefined,9781612626864,English,Jul 31 de 2014,https://panamericana.vtexassets.com/arquivos/i...
1,1,Antiques Roadkill: A Trash 'n' Treasures Mystery,Barbara Allan,3.3,23,26.0,Determined to make a new start in her quaint h...,Kensington Publishing Corp.,288,"Fiction , Mystery , Detective , Cozy , General",9780758272799,English,Jul 1 de 2007,https://m.media-amazon.com/images/I/71Gvt7mZZc...
2,2,The Art of Super Mario Odyssey,Nintendo,3.9,9,133.0,Take a globetrotting journey all over the worl...,Dark Horse Comics,368,"Games , Activities , Video , Electronic",9781506713816,English,Nov 5 de 2019,https://images.cdn3.buscalibre.com/fit-in/360x...
3,3,Getting Away Is Deadly: An Ellie Avery Mystery,Sara Rosett,4.0,10,26.0,"With swollen feet and swelling belly, pregnant...",Kensington Publishing Corp.,320,Undefined,9781617734076,English,Mar 1 de 2009,https://m.media-amazon.com/images/I/91BGvVjl+v...
4,4,"The Painted Man (The Demon Cycle, Book 1)",Peter V. Brett,4.5,577,28.0,The stunning debut fantasy novel from author P...,HarperCollins UK,544,"Fiction , Fantasy , Dark Fantasy",9780007287758,English,Jan 8 de 2009,https://m.media-amazon.com/images/I/81dzr3Bumy...


<span style="font-family: 'Bahnschrift Light'; font-size: 22px">Using One Hot Encoding for book categories</span>

In [205]:
# Work on a copy so the per-genre columns added below do not end up in `df`
df_cop = df.copy() # Creating a copy of the DataFrame

In [206]:
# Obtaining unique genres
unique_gen = set()
for i in df_cop['generes']:
    for gen in str(i).split(','): 
        gen = gen.strip() # Removing leading/trailing spaces
        unique_gen.add(gen)

for gen in unique_gen:
    df_cop[gen] = df_cop['generes'].apply(lambda x: gen in str(x)) # Creating new columns for each genre

In [198]:
# Number of distinct genres, then the genres themselves on the next line
print(len(unique_gen))
print('', list(unique_gen))

45
 ['Women Sleuths', 'Juvenile Fiction', 'Comics &amp', 'Activities', 'Baking', 'Dragons &amp', 'Detective', 'Adventure', 'Dark Fantasy', 'Electronic', 'Business &amp', 'Cooking', 'Comics & Graphic Novels', 'Science Fiction', 'Games &amp', 'Methods', 'Graphic Novels', 'Superheroes', 'Humorous Stories', 'Accounting', 'Undefined', 'Social Science', 'Fiction', 'Biography &amp', 'Classics', 'Action &amp', 'Mythical Creatures', 'Financial', 'General', 'Military', 'Economics', 'Mystery &amp', 'Video &amp', 'Fantasy', 'Cozy', 'Media Tie-In', 'Noir', 'Sports', 'Epic', 'Autobiography', 'Literary Criticism', 'Leadership', 'Women', 'Motivational', 'Medical (incl. Patients)']


In [None]:
# Export the frame that carries the per-genre columns
change_format_to_json(
    df_cop,
    path_data_clean,
    dict(path_or_buf='google_books_1299_clean_copy', orient='records', force_ascii=False),
)

In [161]:
# Hard-coded list of 45 book titles (presumably a snapshot of the de-duplicated
# df['title'] values — TODO confirm and derive it from `df` instead).
# NOTE(review): 'How to Win Friends and Influence People' and
# 'How To Win Friends and Influence People' differ only in letter case, so an
# exact-match de-duplication on the title keeps both.
c = ['Attack on Titan: Volume 13',
       "Antiques Roadkill: A Trash 'n' Treasures Mystery",
       'The Art of Super Mario Odyssey',
       'Getting Away Is Deadly: An Ellie Avery Mystery',
       'The Painted Man (The Demon Cycle, Book 1)',
       'A Feast for Crows (A Song of Ice and Fire, Book 4)',
       'God of War: The Official Novelization',
       'Edgedancer: From the Stormlight Archive',
       'Blood, Sweat, and Pixels: The Triumphant, Turbulent Stories Behind How Video Games Are Made',
       'Twas The Nightshift Before Christmas: Festive hospital diaries from the author of million-copy hit This is Going to Hurt',
       'Sword of Destiny: Witcher 2: Tales of the Witcher',
       'The Mysterious Affair at Styles (Poirot)',
       'Riley Paige Mystery Bundle: Once Gone (#1) and Once Taken (#2)',
       'Deadpool Kills the Marvel Universe',
       'The Red Signal: An Agatha Christie Short Story',
       "Crossroads of Twilight: Book Ten of 'The Wheel of Time'",
       'My Little Pony: Friendship is Magic #83',
       'That Time I got Reincarnated as a Slime 11', 'Ask A Footballer',
       'Sonic the Hedgehog #3', 'Ultimate Spider-Man Vol. 11: Carnage',
       'The Last Wife: An absolutely gripping and emotional page turner with a brilliant twist',
       "The Hitchhiker's Guide to the Galaxy: 42nd Anniversary Edition",
       'Prince of Thorns (The Broken Empire, Book 1)',
       'Influence: The Psychology of Persuasion',
       'Critical Role Vox Machina: Origins: Volume 1',
       'Introducing Critical Theory: A Graphic Guide', 'Morning Star',
       'Tall Tales and Wee Stories: The Best of Billy Connolly',
       'After Anna', 'How to Win Friends and Influence People',
       'The Empty Nest: An unputdownably gripping psychological thriller',
       'W is for Wasted: A Kinsey Millhone Novel', 'Salvaged',
       'The Art of Thinking Clearly',
       'The Complete Works of William Shakespeare: All 213 Plays, Poems, Sonnets, Apocryphal Plays + The Biography: The Life of William Shakespeare by Sidney Lee: Hamlet - Romeo and Juliet - King Lear - A Midsummer Night’s Dream - Macbeth - The Tempest - Othello and many more',
       'The Daylight War: Book Three of The Demon Cycle',
       'The Ultimate Secrets of Total Self-Confidence',
       'Ready Player One',
       'A Forge of Valor (Kings and Sorcerers--Book 4)',
       'How To Win Friends and Influence People',
       'Financial Statements, Revised and Expanded Edition: A Step-by-Step Guide to Understanding and Creating Financial Reports',
       'Death of a Bachelorette', 'Batman: The Man Who Laughs',
       "Martha Stewart's Cookie Perfection: 100+ Recipes to Take Your Sweet Treats to the Next Level: A Baking Book"]

# Display the last 22 titles of the list
c[-22:]

['Prince of Thorns (The Broken Empire, Book 1)',
 'Influence: The Psychology of Persuasion',
 'Critical Role Vox Machina: Origins: Volume 1',
 'Introducing Critical Theory: A Graphic Guide',
 'Morning Star',
 'Tall Tales and Wee Stories: The Best of Billy Connolly',
 'After Anna',
 'How to Win Friends and Influence People',
 'The Empty Nest: An unputdownably gripping psychological thriller',
 'W is for Wasted: A Kinsey Millhone Novel',
 'Salvaged',
 'The Art of Thinking Clearly',
 'The Complete Works of William Shakespeare: All 213 Plays, Poems, Sonnets, Apocryphal Plays + The Biography: The Life of William Shakespeare by Sidney Lee: Hamlet - Romeo and Juliet - King Lear - A Midsummer Night’s Dream - Macbeth - The Tempest - Othello and many more',
 'The Daylight War: Book Three of The Demon Cycle',
 'The Ultimate Secrets of Total Self-Confidence',
 'Ready Player One',
 'A Forge of Valor (Kings and Sorcerers--Book 4)',
 'How To Win Friends and Influence People',
 'Financial Statements, 