## Checking Bias in the data ##

### Results: ###
### Western / US authors have the highest share among authors in the dataset. ###
### Most readers report that they are located in Western countries / USA ###

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Run the import_data script through IPython's %run magic; names it defines
# become available in this notebook's namespace.
# NOTE(review): presumably this performs the raw data import/cleaning that
# produces data/all_cleaned.csv -- confirm against import_data itself.
%run import_data

In [None]:
# Load the cleaned dataset from disk and preview its first five rows.
all_cleaned = pd.read_csv(filepath_or_buffer='data/all_cleaned.csv')
all_cleaned.head(n=5)

#### Country names appear in several languages and a few unclean entries remain in the list; nevertheless, most readers are located in Western countries ####

In [None]:
# Full (untruncated) frequency table of the reader-reported countries.
print(
    all_cleaned['country']
    .value_counts()
    .to_string()
)

In [None]:
# Full (untruncated) frequency table of the raw author strings.
print(
    all_cleaned['mod_book_author']
    .value_counts()
    .to_string()
)

#### The most prevalent book authors are US authors, and the most prevalent country is the US ####

In [None]:
# Bar chart of the ten most frequent raw author strings.
# Note: `top_authors` ends up holding the matplotlib Axes returned by .plot().
top_authors = (
    all_cleaned['mod_book_author']
    .value_counts()
    .iloc[:10]
    .plot(kind='bar')
)

In [None]:
# Show every row attributed to this exact author string.
all_cleaned.loc[all_cleaned['mod_book_author'] == 'zev ben shimon halevi']

In [None]:
# cleaning data: extracting surname, name from author and removing additional abbreviations 
!pip install nameparser


In [9]:
from nameparser import HumanName

# Function to extract surname and given name
def extract_name_surname(full_name):
    """Return an author's given name and surname as one ``"first last"`` string.

    The raw author string is parsed with ``nameparser.HumanName`` and only its
    ``first`` and ``last`` components are kept.

    Args:
        full_name: Raw author name. Values that are not text (for example the
            float ``NaN`` pandas uses for missing cells, or ``None``) are
            treated as "no name".

    Returns:
        The first and last name joined by a single space. When only one of
        the two parts is present it is returned without padding; when neither
        is present, or the input is not text, an empty string is returned.
    """
    # Missing cells would otherwise make the parser raise instead of
    # producing an empty label.
    if not isinstance(full_name, (str, bytes)):
        return ""
    name = HumanName(full_name)
    # Join only the non-empty parts so a single-word name does not pick up a
    # leading or trailing space (which would split otherwise equal labels).
    return " ".join(part for part in (name.first, name.last) if part)

In [None]:
# Build the `name_surname` column by parsing each raw author string, then
# preview the first rows of the frame.
all_cleaned['name_surname'] = [
    extract_name_surname(author) for author in all_cleaned['mod_book_author']
]
all_cleaned.head(n=5)

In [None]:
# Full (untruncated) frequency table of the parsed "first last" author labels.
print(
    all_cleaned['name_surname']
    .value_counts()
    .to_string()
)

In [None]:
# Raw author frequencies again, for comparison with the parsed labels.
print(
    all_cleaned['mod_book_author']
    .value_counts()
    .to_string()
)

In [None]:
# Rows stored under the spelling "dean r koontz".
all_cleaned.loc[all_cleaned['mod_book_author'] == 'dean r koontz']

In [None]:
# Rows stored under the spelling "dean koontz".
all_cleaned.loc[all_cleaned['mod_book_author'] == 'dean koontz']

In [None]:
# Rows stored under the spelling "dr koontz".
all_cleaned.loc[all_cleaned['mod_book_author'] == 'dr koontz']

In [None]:
# Check every spelling variation of "koontz" in the raw author column.
# The \b anchors restrict the match to the whole word, not a substring.
pattern = r'\bkoontz\b'

# Filter the DataFrame. na=False makes rows with a missing author count as
# "no match"; without it the mask would contain NaN and boolean indexing
# would raise a ValueError.
koontz_authors = all_cleaned[
    all_cleaned['mod_book_author'].str.contains(pattern, case=False, regex=True, na=False)
]

print(koontz_authors)

In [None]:
# Check every spelling variation of "rowling" in the raw author column.
# The \b anchors restrict the match to the whole word, not a substring.
pattern = r'\browling\b'

# Filter the DataFrame. na=False makes rows with a missing author count as
# "no match"; without it the mask would contain NaN and boolean indexing
# would raise a ValueError.
rowling_authors = all_cleaned[
    all_cleaned['mod_book_author'].str.contains(pattern, case=False, regex=True, na=False)
]

print(rowling_authors)

In [None]:
# Number of distinct parsed author labels (missing values are not counted).
all_cleaned['name_surname'].nunique(dropna=True)

In [None]:
# Bar chart of the ten most frequent parsed "first last" author labels.
# Note: `top_authors` ends up holding the matplotlib Axes returned by .plot().
top_authors = (
    all_cleaned['name_surname']
    .value_counts()
    .iloc[:10]
    .plot(kind='bar')
)

In [None]:
# Look for author-name variants that share the same surname.

# The surname is the last whitespace-separated token of the parsed label;
# blank labels get an empty surname.
all_cleaned['surname'] = [
    (tokens[-1] if tokens else '')
    for tokens in (label.split() for label in all_cleaned['name_surname'])
]

# Within each surname, count the rows carrying each `name_surname` variant
# and list the (surname, name) pairs from most to least frequent.
surname_counts = (
    all_cleaned
    .groupby('surname')['name_surname']
    .value_counts()
    .sort_values(ascending=False)
)
print(surname_counts)

In [21]:
# Write the frame (without the index) to data/all_cleaned.csv.
# This is the same path the notebook reads from, so the file is overwritten.
all_cleaned.to_csv(path_or_buf='data/all_cleaned.csv', index=False)