In [529]:
#Import packages
import pandas as pd
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from pathlib import Path

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop-word list; preprocess() filters tokens against it.
stop = stopwords.words('english')

# WordNet resources required by the lemmatizer
# (omw-1.4 is the Open Multilingual WordNet).
for nltk_package in ('omw-1.4', 'wordnet'):
    nltk.download(nltk_package)
wnl = nltk.WordNetLemmatizer()


# Folder the input CSV is read from; created on first run if it is missing.
fp_data = Path.cwd() / "data"
fp_data.mkdir(exist_ok=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/annanielsen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/annanielsen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/annanielsen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [530]:
# Load the raw book data, discard the 'Unnamed: 0' column (presumably a
# leftover saved index -- confirm), then remove incomplete rows and exact
# duplicates and renumber the rows 0..n-1.
df = (
    pd.read_csv(fp_data / 'bookssave100.csv')
    .drop(columns='Unnamed: 0')
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

In [531]:
def preprocess(column):
    """Turn one raw text value into a list of lemmatised tokens.

    Despite the parameter name, ``column`` is a single string (one cell of a
    text column); the function is meant to be passed to ``Series.apply``.

    Steps, in order:
      1. lower-case the text,
      2. replace HTML tags with a space,
      3. delete every character that is neither a word character nor
         whitespace (so "fifth-most" becomes "fifthmost"),
      4. split on whitespace and drop tokens found in the module-level
         ``stop`` list,
      5. lemmatise the remaining tokens with the module-level ``wnl``.

    Returns:
        list[str]: the processed tokens, in their original order.
    """
    text = re.sub(r'<[^>]*>', ' ', column.lower())
    text = re.sub(r'[^\w\s]', '', text)
    return [wnl.lemmatize(token) for token in text.split() if token not in stop]

In [532]:
# Tokenise and lemmatise the description and the title of every book.
df_be = df['Description'].apply(preprocess)
df_ti = df['Title'].apply(preprocess)

# Work on a copy of df that carries the processed token lists as two extra
# columns; the original text columns are left untouched at this point.
df_fin = df.assign(proc_de=df_be, proc_ti=df_ti)

In [533]:
# Drop books whose author string contains one of these markers
# (case-insensitive) -- presumably institutions rather than persons.
for institution_marker in ('Academy', 'Science Association'):
    is_institution = df_fin['Author'].str.contains(institution_marker, case=False, na=False)
    df_fin = df_fin.drop(df_fin.index[is_institution])

In [534]:
# Split the comma-separated author string into a list with one entry per
# listed contributor. Entries may still carry surrounding whitespace and a
# parenthesised role such as "(Editor)" at this point.
MAX_AUTHORS = 4
df_fin['names_column'] = df_fin['Author'].str.split(',')

# Keep only books with at most MAX_AUTHORS listed contributors.
df_fin = df_fin[df_fin['names_column'].apply(len) <= MAX_AUTHORS]
df_fin = df_fin.reset_index(drop=True)

# Spread the list over one column per position (Author1 .. Author4); books
# with fewer contributors get missing values in the remaining columns.
# reindex() guarantees exactly MAX_AUTHORS columns even when no remaining
# book has that many contributors -- without it the multi-column assignment
# raises "Columns must be same length as key" on such data.
author_cols = [f'Author{position}' for position in range(1, MAX_AUTHORS + 1)]
df_fin[author_cols] = (
    pd.DataFrame(df_fin['names_column'].tolist(), index=df_fin.index)
    .reindex(columns=range(MAX_AUTHORS))
)

In [535]:
# Set up the gender detector used by process_name (defined below), which
# discards names that carry a parenthesis (i.e. contributors that are not
# authors) and looks up a gender label for the first word of every other name.
import gender_guesser.detector as gender
d = gender.Detector()

def process_name(name):
    """Return the gender label guessed for one contributor name.

    Args:
        name: a single name string, or a missing value (None / NaN).

    Returns:
        None when ``name`` is missing or contains a parenthesis; otherwise
        whatever ``d.get_gender`` returns for the first space-separated word
        of the whitespace-stripped name.
    """
    if pd.isna(name):
        return None
    if any(bracket in name for bracket in '()'):
        return None

    first_name = name.strip().split(' ', 1)[0]

    # `d` is the module-level gender_guesser Detector instance.
    return d.get_gender(first_name)

# Replace every contributor name by its guessed gender label (or None).
for author_col in ('Author1', 'Author2', 'Author3', 'Author4'):
    df_fin[author_col] = df_fin[author_col].apply(process_name)

In [536]:
# Drop every book for which at least one contributor column holds a label
# containing 'unknown' (case-insensitive); missing labels do not match.
for author_col in ('Author1', 'Author2', 'Author3', 'Author4'):
    is_unknown = df_fin[author_col].str.contains('unknown', case=False, na=False)
    df_fin = df_fin.drop(df_fin.index[is_unknown])

In [537]:
# Encode the gender labels as numbers:
#   1 = male / mostly_male, 2 = female / mostly_female, 3 = unknown.
gender_mapping = {'male': 1, 'female': 2, 'unknown': 3, 'mostly_male': 1, 'mostly_female': 2}

# Empty cells and any label that is not a key of gender_mapping become NaN
# under .map() and are then encoded as 0.
# NOTE(review): gender_guesser may also return 'andy' (androgynous) -- confirm;
# such a label would end up as 0 here as well.
#
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and no longer modifies df_fin under
# Copy-on-Write, which would leave NaN where the next steps expect 0.
for author_col in ('Author1', 'Author2', 'Author3', 'Author4'):
    df_fin[author_col] = df_fin[author_col].map(gender_mapping).fillna(0)

In [538]:
# Drop books whose publication year lies outside 1001..2023 (inclusive).
year_is_outlier = (df_fin['Published_year'] < 1001) | (df_fin['Published_year'] > 2023)
df_fin = df_fin.drop(df_fin.index[year_is_outlier])

# Drop books where all four author columns are 0, i.e. no contributor
# received a gender code.
mask = df_fin[['Author1', 'Author2', 'Author3', 'Author4']].eq(0).all(axis=1)
df_fin = df_fin.drop(df_fin.index[mask])
df_fin

Unnamed: 0,Title,Author,Avg_ratings,nratings,nreviews,Description,Pages,Published_year,proc_de,proc_ti,names_column,Author1,Author2,Author3,Author4
0,Sanctorum Communio: A Theological Study of the...,"Dietrich Bonhoeffer, Clifford J. Green",3.96,2716,31,Sanctorum Communio was Bonhoeffer's dissertati...,392,1930,"[sanctorum, communio, bonhoeffer, dissertation...","[sanctorum, communio, theological, study, soci...","[Dietrich Bonhoeffer, Clifford J. Green]",1.0,1.0,0.0,0.0
1,The Social Construction of Reality: A Treatise...,"Peter L. Berger, Thomas Luckmann",4.12,3876,217,"Called the ""fifth-most important sociological ...",219,1966,"[called, fifthmost, important, sociological, b...","[social, construction, reality, treatise, soci...","[Peter L. Berger, Thomas Luckmann]",1.0,1.0,0.0,0.0
3,Economy and Society: An Outline of Interpretiv...,"Max Weber, Guenther Roth (Editor), Claus Witti...",4.12,1439,31,Max Weber's Economy and Society is the greates...,1469,1922,"[max, weber, economy, society, greatest, socio...","[economy, society, outline, interpretive, soci...","[Max Weber, Guenther Roth (Editor), Claus Wi...",1.0,0.0,0.0,0.0
4,Socialism An Economic and Sociological Analysi...,"Ludwig von Mises, Friedrich A. Hayek (Foreword...",4.32,1269,59,This book must rank as the most devastating an...,596,1922,"[book, must, rank, devastating, analysis, soci...","[socialism, economic, sociological, analysis, ...","[Ludwig von Mises, Friedrich A. Hayek (Forewo...",1.0,0.0,0.0,0.0
5,I Am Woman: A Native Perspective on Sociology ...,Lee Maracle,4.33,617,54,I Am Woman represents my personal struggle wit...,146,1988,"[woman, represents, personal, struggle, womanh...","[woman, native, perspective, sociology, feminism]",[Lee Maracle],1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5444,Northern Australia: Task For A Nation,The Australian Institute of Political Science,0.00,0,0,In 1954 the empty spaces of Northern Australia...,247,1954,"[1954, empty, space, northern, australia, repr...","[northern, australia, task, nation]",[The Australian Institute of Political Science],2.0,0.0,0.0,0.0
5460,Hard Thinking: The Fusion of Politics & Science,Herbert E. Meyer,4.67,3,0,Presenting a culture of politics based on the ...,71,1993,"[presenting, culture, politics, based, latest,...","[hard, thinking, fusion, politics, science]",[Herbert E. Meyer],1.0,0.0,0.0,0.0
5462,Critical Political Ecology: The Politics of En...,Tim Forsyth,4.04,27,1,Critical Political Ecology brings political de...,336,2003,"[critical, political, ecology, brings, politic...","[critical, political, ecology, politics, envir...",[Tim Forsyth],1.0,0.0,0.0,0.0
5464,Trotskyism and the Dilemma of Socialism:,"Christopher Z. Hobson, Ronald D. Tabor",4.00,1,0,"Written by two long-time scholar/activists, th...",551,1988,"[written, two, longtime, scholaractivists, boo...","[trotskyism, dilemma, socialism]","[Christopher Z. Hobson, Ronald D. Tabor]",1.0,1.0,0.0,0.0


In [539]:
# The row drops above leave gaps in the index labels; renumber the rows
# 0..n-1 (the old labels are discarded) and display the result.
df_fin = df_fin.reset_index(drop=True)
df_fin

Unnamed: 0,Title,Author,Avg_ratings,nratings,nreviews,Description,Pages,Published_year,proc_de,proc_ti,names_column,Author1,Author2,Author3,Author4
0,Sanctorum Communio: A Theological Study of the...,"Dietrich Bonhoeffer, Clifford J. Green",3.96,2716,31,Sanctorum Communio was Bonhoeffer's dissertati...,392,1930,"[sanctorum, communio, bonhoeffer, dissertation...","[sanctorum, communio, theological, study, soci...","[Dietrich Bonhoeffer, Clifford J. Green]",1.0,1.0,0.0,0.0
1,The Social Construction of Reality: A Treatise...,"Peter L. Berger, Thomas Luckmann",4.12,3876,217,"Called the ""fifth-most important sociological ...",219,1966,"[called, fifthmost, important, sociological, b...","[social, construction, reality, treatise, soci...","[Peter L. Berger, Thomas Luckmann]",1.0,1.0,0.0,0.0
2,Economy and Society: An Outline of Interpretiv...,"Max Weber, Guenther Roth (Editor), Claus Witti...",4.12,1439,31,Max Weber's Economy and Society is the greates...,1469,1922,"[max, weber, economy, society, greatest, socio...","[economy, society, outline, interpretive, soci...","[Max Weber, Guenther Roth (Editor), Claus Wi...",1.0,0.0,0.0,0.0
3,Socialism An Economic and Sociological Analysi...,"Ludwig von Mises, Friedrich A. Hayek (Foreword...",4.32,1269,59,This book must rank as the most devastating an...,596,1922,"[book, must, rank, devastating, analysis, soci...","[socialism, economic, sociological, analysis, ...","[Ludwig von Mises, Friedrich A. Hayek (Forewo...",1.0,0.0,0.0,0.0
4,I Am Woman: A Native Perspective on Sociology ...,Lee Maracle,4.33,617,54,I Am Woman represents my personal struggle wit...,146,1988,"[woman, represents, personal, struggle, womanh...","[woman, native, perspective, sociology, feminism]",[Lee Maracle],1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3975,Northern Australia: Task For A Nation,The Australian Institute of Political Science,0.00,0,0,In 1954 the empty spaces of Northern Australia...,247,1954,"[1954, empty, space, northern, australia, repr...","[northern, australia, task, nation]",[The Australian Institute of Political Science],2.0,0.0,0.0,0.0
3976,Hard Thinking: The Fusion of Politics & Science,Herbert E. Meyer,4.67,3,0,Presenting a culture of politics based on the ...,71,1993,"[presenting, culture, politics, based, latest,...","[hard, thinking, fusion, politics, science]",[Herbert E. Meyer],1.0,0.0,0.0,0.0
3977,Critical Political Ecology: The Politics of En...,Tim Forsyth,4.04,27,1,Critical Political Ecology brings political de...,336,2003,"[critical, political, ecology, brings, politic...","[critical, political, ecology, politics, envir...",[Tim Forsyth],1.0,0.0,0.0,0.0
3978,Trotskyism and the Dilemma of Socialism:,"Christopher Z. Hobson, Ronald D. Tabor",4.00,1,0,"Written by two long-time scholar/activists, th...",551,1988,"[written, two, longtime, scholaractivists, boo...","[trotskyism, dilemma, socialism]","[Christopher Z. Hobson, Ronald D. Tabor]",1.0,1.0,0.0,0.0


In [543]:
# Remove the raw text columns and the helper list of names; only the
# processed token lists (proc_de, proc_ti), the encoded Author1..Author4
# columns and the remaining original columns are kept.
df_fin = df_fin.drop(columns=['Title', 'Author', 'Description', 'names_column'])

In [545]:
# Save the processed data as a CSV file without the index column. The path is
# relative, so the file lands in the current working directory rather than in
# fp_data. NOTE(review): proc_de / proc_ti hold Python lists, which are
# presumably written as their string representation -- a reader of this file
# will need to parse them back into lists.
df_fin.to_csv('data_processed.csv', index=False)