In [None]:
!pip pandas
!pip os 

<span style="font-family: 'Bahnschrift Light'; font-size: 22px">Cleaning up data</span>

In [7]:
import os 

# Every location below is resolved from the kernel's current working directory.
path_current = os.getcwd()

# Raw CSV: looked up one level above the working directory.
_raw_csv = os.path.join(path_current, '..', 'google_books_1299.csv')
path_data = os.path.abspath(_raw_csv)

# Directory that receives the cleaned JSON exports.
path_data_clean = os.path.abspath(f'{path_current}/src/assets')

In [8]:
import pandas as pd

# Load the raw CSV. A decoding failure is reported and then re-raised: carrying
# on would leave `df` undefined (or stale from an earlier run) for the
# statements below and for every later cell.
try:
    df = pd.read_csv(path_data, encoding="utf-8")
except UnicodeDecodeError as e:
    print(f"Error: {e}")
    raise

df = df.head(50).reset_index(drop=True)  # Limiting to 50 rows

# Dropping unnecessary columns
df = df.drop(columns=['currency'])
df = df.rename(columns={'Unnamed: 0': 'index'})

# Converting columns to appropriate data types.
# regex=False is passed explicitly so that ',' and '$' are always treated as
# literal characters, regardless of the installed pandas version's default
# ('$' is an end-of-string anchor when interpreted as a regular expression).
df['price'] = (
    df['price'].astype(str)
    .str.replace(',', '', regex=False)
    .str.replace('$', '', regex=False)
    .astype(float)
)
df['voters'] = (
    df['voters'].astype(str)
    .str.replace(',', '', regex=False)
    .replace('nan', '0')  # Series.replace: whole-value match, i.e. missing counts become 0
    .astype(int)
)
# Values that cannot be parsed as numbers (and missing values) become 0.
# NOTE(review): those zeros lower any mean rating computed later — confirm intended.
df['rating'] = pd.to_numeric(df['rating'], errors='coerce').fillna(0).astype(float)
df['published_date'] = (
    df['published_date'].astype(str)
    .str.strip()
    .str.replace(',', ' de', regex=False)
)
df['generes'] = df['generes'].replace('none', 'Undefined')

In [9]:
# Normalize how genres are written. The substitutions run in this exact order:
# the '&amp' residue has to be removed before the bare '&' is spelled out,
# otherwise '&amp' would turn into 'andamp'.
genre_text = df['generes']
for pattern, replacement, is_regex in (
    ('&amp', '', False),
    ('&', 'and', False),
    (r'\s*\([^)]*\)', '', True),  # Remove text in parentheses
):
    genre_text = genre_text.str.replace(pattern, replacement, regex=is_regex)

df['generes'] = genre_text.str.strip()

In [32]:
# Print the column names, non-null counts and dtypes of the cleaned frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   index           50 non-null     int64  
 1   title           50 non-null     object 
 2   author          50 non-null     object 
 3   rating          50 non-null     float64
 4   voters          50 non-null     int64  
 5   price           50 non-null     float64
 6   description     50 non-null     object 
 7   publisher       50 non-null     object 
 8   page_count      50 non-null     int64  
 9   generes         50 non-null     object 
 10  ISBN            50 non-null     object 
 11  language        50 non-null     object 
 12  published_date  50 non-null     object 
dtypes: float64(2), int64(3), object(8)
memory usage: 5.2+ KB


In [None]:
# Distinct vote counts in ascending order, wrapped in a Series so that Series
# methods are available on the result.
u = pd.Series(sorted(df['voters'].unique()))

In [35]:
# Show the sorted distinct vote counts.
print(u) 

[    0     1     3     4     6     7     9    10    13    15    23    42
    45    47    57    64    72    94   100   135   200   206   221   281
   288   383   408   427   428   522   577   832   861   886  1382  1633
  6615 17719 38526]


In [39]:
# `u` holds the distinct vote counts, so this is the median of the distinct
# values, not the median of the `voters` column (repeated counts are ignored).
# NOTE(review): use df['voters'].median() if the per-book median is what is wanted.
print(u.median())

135.0


In [None]:
# Mean rating over the loaded rows ("media" is Spanish for "mean").
# NOTE(review): the cleaning cell fills missing or unparseable ratings with 0,
# so any such rows pull this mean down — confirm that is intended.
media = df['rating'].mean()
print(media)

4.274


In [15]:
# Number of books rated strictly above the mean rating. A row count does not
# depend on row order, so the filtered frame is not sorted first.
len(df[df['rating'] > media])

36

In [None]:
def change_format_to_json(df, route, kwargs):
    """Write ``df`` as JSON to ``<route>/<kwargs['path_or_buf']>.json``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export.
    route : str
        Directory that receives the file.
    kwargs : dict
        Must contain ``'path_or_buf'``, the output file name without the
        ``.json`` extension. All other entries are forwarded unchanged to
        ``DataFrame.to_json``. The dict is not modified, so the same dict can
        be passed again.

    Returns
    -------
    None
        Any exception (including a missing ``'path_or_buf'`` key) is reported
        with ``print`` and not re-raised.
    """
    try:
        # Work on a copy: popping from the caller's dict would make a second
        # call with the same dict fail on the missing key.
        options = dict(kwargs)
        stem = options.pop('path_or_buf')
        # The file name is pulled out first because reusing the same quote
        # character inside an f-string is a SyntaxError before Python 3.12.
        filename = os.path.abspath(f'{route}/{stem}.json')
        df.to_json(filename, **options)
    except Exception as e:
        print(f"Error: {e}")
        
# Export settings: 'path_or_buf' is the output file name (without extension);
# the remaining entries are forwarded to DataFrame.to_json.
params = dict(
    path_or_buf='google_books_1299_clean_1',
    orient='records',
    force_ascii=False,
)
change_format_to_json(df, path_data_clean, params)

<span style="font-family: 'Bahnschrift Light'; font-size: 22px">Using One Hot Encoding for book categories</span>

In [205]:
# Work on a copy so the per-genre columns added below do not end up in `df`.
df_cop = df.copy()

In [206]:
# Obtaining unique genres.
# Each row's comma-separated genre string is split once into a set of trimmed
# tokens; empty tokens (e.g. produced by a trailing comma) are dropped.
genre_tokens = df_cop['generes'].apply(
    lambda value: {part.strip() for part in str(value).split(',')} - {''}
)
unique_gen = set().union(*genre_tokens)

# Creating one boolean column per genre. Membership is tested against the row's
# tokens rather than the raw string: a substring test would flag
# 'Science Fiction' rows as 'Fiction' and 'Women Sleuths' rows as 'Women'.
# Iterating in sorted order keeps the column order stable between runs.
for gen in sorted(unique_gen):
    df_cop[gen] = genre_tokens.apply(lambda tokens, gen=gen: gen in tokens)

In [198]:
# Print how many distinct genres were collected, followed by the genres themselves.
# A set has no guaranteed iteration order, so the listing order can differ between kernel sessions.
print(f'{len(unique_gen)}\n', list(unique_gen)) 

45
 ['Women Sleuths', 'Juvenile Fiction', 'Comics &amp', 'Activities', 'Baking', 'Dragons &amp', 'Detective', 'Adventure', 'Dark Fantasy', 'Electronic', 'Business &amp', 'Cooking', 'Comics & Graphic Novels', 'Science Fiction', 'Games &amp', 'Methods', 'Graphic Novels', 'Superheroes', 'Humorous Stories', 'Accounting', 'Undefined', 'Social Science', 'Fiction', 'Biography &amp', 'Classics', 'Action &amp', 'Mythical Creatures', 'Financial', 'General', 'Military', 'Economics', 'Mystery &amp', 'Video &amp', 'Fantasy', 'Cozy', 'Media Tie-In', 'Noir', 'Sports', 'Epic', 'Autobiography', 'Literary Criticism', 'Leadership', 'Women', 'Motivational', 'Medical (incl. Patients)']


In [None]:
# Export the copy that carries the per-genre columns.
change_format_to_json(
    df_cop,
    path_data_clean,
    dict(
        path_or_buf='google_books_1299_clean_copy',
        orient='records',
        force_ascii=False,
    ),
)