## Get all the Sociologists and Anthropologist from Wiki

In [19]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import regex as re 
import json

In [20]:
def get_wiki(_page, get_txt = False):
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = f"titles={_page}"
    content = "prop=revisions&rvprop=content&rvslots=*"
    dataformat ="format=json"
    
    query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
    print(query)
    
    if get_txt == True:
        resp = requests.get(query).json()
        page_id = [i for i in resp['query']['pages'].keys()][0] # get page id
        txt = resp['query']['pages'][page_id]['revisions'][0]['slots']['main']['*']
        return txt
    
    else:
        return requests.get(query).json()

In [21]:
resp_soc = get_wiki("List_of_sociologists")
resp_ant = get_wiki("List_of_anthropologists")
resp_pol = get_wiki("List_of_political_scientists")
resp_psy = get_wiki("List_of_psychologists")
resp_eco = get_wiki("List_of_economists")

In [22]:
''' Soc '''
txt = resp_soc['query']['pages']['254243']['revisions'][0]['slots']['main']['*']
txt = txt.split('==A==')[1] # drop intro tekst
txt = txt.split('==References==')[0] # drop trailing stuff
sociologists = re.findall('\n\*(?: )?\[\[(.*?)(?:\|.*?)?\]\]*',txt)
print(f'Sociologists: {len(sociologists)}')

''' Ant '''
txt = resp_ant['query']['pages']['728']['revisions'][0]['slots']['main']['*']
txt = txt.split('==A==')[1] # drop intro tekst
txt = txt.split('==Fictional anthropologists==')[0] # drop trailing stuff
anthropologists = re.findall('\n\*(?: )?\[\[(.*?)(?:\|.*?)?\]\]*',txt)
print(f'Anthropologist: {len(anthropologists)}')

''' Eco '''
txt = resp_eco["query"]['pages']['10231']['revisions'][0]['slots']['main']['*']
txt = txt.split('==A==')[1] # drop intro tekst
txt = txt.split('==See also==')[0] # drop trailing stuff
economists = re.findall('\n\*(?: )?\[\[(.*?)(?:\|.*?)?\]\]*',txt)
print(f'economists: {len(economists)}')

''' Psy '''
txt = resp_psy['query']['pages']['199877']['revisions'][0]['slots']['main']['*']
txt = txt.split('== A ==')[1] # drop intro tekst
txt = txt.split('==See also==')[0] # drop trailing stuff
psychologists = re.findall('\n\*(?: )?\[\[(.*?)(?:\|.*?)?\]\]*',txt)
print(f'Psychologists: {len(psychologists)}')

''' Pol '''
txt = resp_pol['query']['pages']['37559']['revisions'][0]['slots']['main']['*']
txt = txt.split('== A ==')[1] # drop intro tekst
txt = txt.split('== See also ==')[0] # drop trailing stuff
political_scientists = re.findall('\n\*(?: )?\[\[(.*?)(?:\|.*?)?\]\]*',txt)
print(f'Political_scientists: {len(political_scientists)}')

# To dict
science_dict = {'soc':sociologists,
                'anth':anthropologists,
                'eco': economists,
                'psy': psychologists,
                'pol': political_scientists}

with open('science_name_dict.json', 'w', encoding = 'utf-8') as f:
    json.dump(science_dict,f)

Sociologists: 782
Anthropologist: 277
economists: 1113
Psychologists: 465
Political_scientists: 564


## Get all the content from the Socs's/Anths's wiki pages

In [23]:
with open('science_name_dict.json', 'r', encoding = 'utf-8') as f:
    science_dict = json.load(f)

# Copy dict 
new_science_dict = science_dict.copy()

    
''' Scrape the pages '''
for field, nodes in science_dict.items():
    for node in nodes:
        # hvis der bliver henvist til en section på en side fx 'Karen_Horney#Ten_neurotic_needs'
        # også med spmtegn for den kan ikke gemme med spørgsmålstegn --> filnavnet er invalidt 
        if '#' in node or '?' in node: 
            print('Skipping: ', node)
            continue
            
        node = node.replace(' ', '_')
        try:
            txt = get_wiki(node, get_txt = True)
            
            if "#REDIRECT" in txt: # if it is a redirect
                new_science_dict[field].remove(node.replace('_', ' '))  # delete the old value 
                node = [re.findall('\[\[(.*?)(?:\|.*?)?\]\]', txt)[0]] # redirected link 
                print(f"## Redirect! {node}")
                new_science_dict[field] += node # append the new node to the science_dict 
                node = node[0].replace(' ', '_') # extract from the list and replace 
                
                if '#' in node or '?' in node: 
                    print('Skipping: ', node)
                    continue                    
                    
                txt = get_wiki(node, get_txt=True)
                
        except KeyError as e: # if the page is incomplete (red hyperlinks)
            print(e, node)
            continue
        
        with open(f'wiki_content/{node}.txt', 'w', encoding = 'utf-8') as f:
            f.write(txt)
            
# Gem den opdaterede dict 
with open('science_name_dict.json', 'w', encoding = 'utf-8') as f:
    json.dump(new_science_dict,f)

## Redirect! ['Theodor W. Adorno']
## Redirect! ['Aristotle']
## Redirect! ['Robert Bartholomew']
## Redirect! ['Frank D. Bean']
## Redirect! ['Charles Booth (social reformer)']
## Redirect! ['Ronald Stuart Burt']
## Redirect! ['Antonio Caso Andrade']
## Redirect! ['F. Stuart Chapin']
## Redirect! ['Nicholas Christakis']
## Redirect! ['Marquis de Condorcet']
## Redirect! ['Raewyn Connell']
## Redirect! ['Anna J. Cooper']
## Redirect! ['Gøsta Esping-Andersen']
## Redirect! ['Claude S. Fischer']
## Redirect! ['Herbert J. Gans']
## Redirect! ['David W. Garland']
## Redirect! ['John Goldthorpe']
## Redirect! ['Andrew Greeley']
## Redirect! ['David Harvey']
## Redirect! ['Leonard Hobhouse']
## Redirect! ['José Ingenieros']
## Redirect! ['Aleksandr Kapto']
## Redirect! ['A. L. Kroeber']
## Redirect! ['Thomas Kuhn']
## Redirect! ['Michèle Lamont']
## Redirect! ['Pyotr Lavrov']
## Redirect! ['Paul_Lazarsfeld']
## Redirect! ['Pierre Guillaume Frédéric le Play']
## Redirect! ['Henry James Sumner

In [18]:
for field, nodes in new_science_dict.items():
    for node in nodes:
        node = node.replace(' ', '_')
        try:
            with open(f'wiki_txt/{node}.txt', 'r', encoding = 'utf-8') as f:
                f.write(txt)
        except:
            print(node)

Peter_Abell
Mark_Abrams
Janet_Abu-Lughod
Jane_Addams
Richard_Alba
Francesco_Alberoni
Martin_Albrow
Jeffrey_C._Alexander
Edwin_Amenta
Nancy_Ammerman
Eric_Anderson_(sociologist)
Elijah_Anderson
Stanislav_Andreski
Aaron_Antonovsky
Arjun_Appadurai
Andrew_Arato
Margaret_Archer
Hannah_Arendt
Alcira_Argumedo
Raymond_Aron
Stanley_Aronowitz
Giovanni_Arrighi
Johan_Asplund
Vilhelm_Aubert
Francisco_Ayala_(novelist)
Élisabeth_Badinter
Patrick_Baert
Sergio_Bagú
Kenneth_D._Bailey_(sociologist)
Georges_Balandier
Emily_Greene_Balch
Robert_Balch
E._Digby_Baltzell
Eileen_Barker
S._Barry_Barnes
Liberty_Barnes
Roland_Barthes
Roger_Bastide
Gregory_Bateson
Jean_Baubérot
Jean_Baudrillard
Zygmunt_Bauman
Peter_Bearman
Ulrich_Beck
Gary_Becker
Howard_P._Becker
Howard_S._Becker
Jens_Beckert
Richard_Fritz_Behrendt
Daniel_Bell
Robert_N._Bellah
Walden_Bello
Reinhard_Bendix
Walter_Benjamin
Albert_Benschop
Joseph_Berger_(sociologist)
Peter_L._Berger
Pierre_L._van_den_Berghe
Henri_Bergson
Jessie_Bernard
Eduard_Bernstein

John_Rex
James_Mahmud_Rice
Sam_Richards_(sociologist)
David_Riesman
George_Ritzer
Roland_Robertson
William_I._Robinson
Terje_Rød-Larsen
Jesús_M._de_Miguel_Rodríguez
Arnold_Marshall_Rose
Gillian_Rose
Nikolas_Rose
Paul_Rosenfels
Eugen_Rosenstock-Huessy
Jean-Jacques_Rousseau
Rubén_G._Rumbaut
Arne_Runeberg
Harvey_Sacks
Renaud_Sainsaulieu
Robert_J._Sampson
Pierre_Sansot
Boaventura_de_Sousa_Santos
Giovanni_Sartori
Saskia_Sassen
Peter_Robert_Saunders
Ferdinand_de_Saussure
Albert_Schäffle
Thomas_J._Scheff
Emanuel_Schegloff
Max_Scheler
Helmut_Schelsky
Juraj_Schenk
Herbert_Schiller
Kurt_C._Schlichting
Wolfgang_Schluchter
Paul_Schnabel
Allan_Schnaiberg
Juliet_Schor
Alfred_Schütz
Michael_Schwartz_(sociologist)
John_Scott_(sociologist)
Jean_Séguy
Steven_Seidman
Pınar_Selek
Philip_Selznick
Amartya_Sen
Richard_Sennett
William_H._Sewell
Steven_Shapin
Jeremy_J._Shapiro
Ali_Shariati
Tamotsu_Shibutani
Edward_Shils
Volkmar_Sigusch
Charles_E._Silberman
François_Simiand
Georg_Simmel
Herbert_A._Simon
Theda_S

Alfred_Radcliffe-Brown
Benjamin_Lee_Whorf
Edith_Abbott
Daron_Acemoglu
Nicola_Acocella
Zoltan_Acs
Henry_Carter_Adams
Walter_Adams_(economist)
Philippe_Aghion
Montek_Singh_Ahluwalia
Qazi_Kholiquzzaman_Ahmad
George_Akerlof
Armen_Alchian
Alberto_Alesina
Sidney_S._Alexander
Maurice_Allais
Franklin_Allen
R._G._D._Allen
Gar_Alperovitz
Lee_J._Alston
Elisabeth_Altmann-Gottheiner
Fernando_Alvarez_(economist)
B._R._Ambedkar
Takeshi_Amemiya
Georges_Anderla
Donald_Andrews
George-Marios_Angeletos
Norman_Angell
Joshua_Angrist
Kofi_Annan
Masahiko_Aoki
Thomas_Aquinas
Luis_Arce
Pérsio_Arida
Dan_Ariely
Heinz_Arndt
Kenneth_Arrow
Gloria_Macapagal_Arroyo
Enrique_R._Arzac
Orley_Ashenfelter
William_Ashley_(economic_historian)
Cliff_Asness
Jeremy_Atack
Susan_Athey
Orazio_Attanasio
Thomas_Attwood_(economist)
David_B._Audretsch
Leonardo_Auernheimer
Robert_Aumann
George_Ayittey
Ali_Babacan
Roger_Babson
Louis_Bachelier
Roger_Backhouse_(economist)
Walter_Bagehot
Nikolai_Baibakov
Dean_Baker
E._Wight_Bakke
Mikhail_Ba

David_Laibson
David_Laidler
John_A._Laitner
Naomi_Lamoreaux
Steven_Landsburg
Philip_R._Lane
Oskar_Lange
Serge_Latouche
John_Law_(economist)
Edward_Lazear
Edward_E._Leamer
Stanley_Lebergott
Lewis_Lehrman
Frederic_Sterling_Lee
Peter_Leeson
Axel_Leijonhufvud
Manuela_Ferreira_Leite
Leonard_Liggio
Wassily_Leontief
Abba_P._Lerner
Leonardus_Lessius
David_K._Levine
Lars_Lefgren
Arthur_Lewbel
Tracy_R._Lewis
Kevin_Leyton-Brown
Evsei_Liberman
Justin_Yifu_Lin
Michael_Lind
Erik_Lindahl
Assar_Lindbeck
Friedrich_List
John_A._List
Andrew_Lo
John_Locke
William_Forster_Lloyd
Bernard_Lonergan
Frédéric_Lordon
Max_O._Lorenz
Pascal_Lorot
Andreas_Löschel
John_Lott
Stephen_J._Luczo
Rosa_Luxemburg
Gerard_Lyons
Donald_MacDougall
Mark_J._Machina
Carlos_Manuel_Urzúa_Macías
Henry_Dunning_Macleod
Adil_Abdul-Mahdi
Edmond_Malinvaud
Burton_Malkiel
Gerard_de_Malynes
Henry_Manne
Alan_Manning
Edwin_Mansfield
Charles_Manski
Mao_Yushi
Harry_Markowitz
Karl_Marlo
Jacob_Marschak
Alfred_Marshall
Marsh_Marshall
Xavier_Sala-i-Ma

Xenophon
Menahem_Yaari
Basil_Yamey
Xiaokai_Yang
Janet_Yellen
Allyn_Abbott_Young
Alwyn_Young
Peyton_Young
Yu_Guangyuan
Linda_Yueh
Muhammad_Yunus
Richard_Zeckhauser
Zeine_Ould_Zeidane
Milan_Zeleny
Arnold_Zellner
Yves_Zenou
Zhang_Weiying
Zhou_Xiaochuan
Jeffrey_Zients
Fabrizio_Zilibotti
Luigi_Zingales
Xenophon_Zolotas
Sadie_Tanner_Mossell_Alexander
Tony_Atkinson
Clarence_Edwin_Ayres
Joe_Bain
Eugen_von_Böhm-Bawerk
Albert_Bergstrom
Peter_Boettke
George_J._Borjas
William_L._Breit
Edward_Chamberlin
Alfred_D._Chandler_Jr.
Alfred_H._Conrad
William_A._Darity_Jr.
Hernando_de_Soto_(economist)
Simeon_Dyankov
Steven_Durlauf
Sebastián_Edwards
Ragnar_Frisch
Lev_Gatovsky
María_del_Carmen_Guisán
Steve_Hanke
J._A._Hobson
Sri_Mulyani
Timothy_Kehoe
Target_page_name
Larry_Kudlow
Domingo_Laíno
Larry_Hsien_Ping_Lang
Richard_Layard,_Baron_Layard
Rick_Levin
Steven_Levitt
W._Arthur_Lewis
Robert_Lucas_Jr.
Thomas_Robert_Malthus
Greg_Mankiw
Richard_J._Maybury
John_Ramsay_McCulloch
David_McWilliams_(economist)
Valery

Vaira_Vīķe-Freiberga
Hedwig_von_Restorff
Lev_Vygotsky
Stuart_Vyse
Joan_Scott_Wallace
Henri_Wallon_(psychologist)
Hans-Jürgen_Walter
Margaret_Floy_Washburn
John_B._Watson
Paul_Watzlawick
Ernst_Heinrich_Weber
David_Wechsler
Nicole_Weekes
Karl_E._Weick
Robert_Weimar
Max_Wertheimer
Drew_Westen
Michael_White_(psychotherapist)
Ken_Wilber
Glenn_Wilson_(psychologist)
Richard_Wiseman
Władysław_Witwicki
Gustav_Adolf_Wohlgemuth
Robert_S._Woodworth
Helen_Thompson_Woolley
Wilhelm_Wundt
Karen_Wynn
Fei_Xu
Robert_Yerkes
Irvin_D._Yalom
Robert_Zajonc
Oliver_Zangwill
René_Zazzo
Bluma_Zeigarnik
Philip_Zimbardo
Kenneth_Zucker
Adelbert_Ames_Jr.
Renée_Baillargeon
Kelly_D._Brownell
Catharine_Cox_Miles
Daniel_David
Dietrich_Dörner
Albert_Ellis
László_Garai
Kenneth_J._Gergen
James_J._Gibson
Irving_Gottesman
Donald_O._Hebb
Ásgeir_Helgason
Alan_E._Kazdin
Elisabeth_Kübler-Ross
Bibb_Latané
Masters_and_Johnson
David_McNeill
Paul_E._Meehl
Andrew_N._Meltzoff
John_Morton_(cognitive_scientist)
Don_Norman
Lorine_Pruette


Donna_Shalala
Jim_Sidanius
Yekaterina_Shulman
Herbert_A._Simon
Valeria_Sinclair-Chapman
Theda_Skocpol
Stephen_Skowronek
Anne-Marie_Slaughter
Jean_Edward_Smith
Rogers_Smith
Steven_S._Smith
Peverill_Squire
Allison_Stanger
Michael_Steed
Alfred_Stepan
Zeev_Sternhell
John_G._Stoessinger
Donald_E._Stokes
Susan_Stokes
Herbert_Storing
Susan_Strange
Dara_Strolovitch
Rein_Taagepera
Colin_Talbot
Marco_Tarchi
Katherine_Tate
Sally_Terry
Marianne_Thyrring
J._Ann_Tickner
Virginia_Tilley
Charles_Tilly
Herbert_Tingsten
Jeanne_Theoharis
Reeta_Chowdhari_Tremblay
George_Tsebelis
Patrick_Utomi
Stephen_Van_Evera
Tatu_Vanhanen
Sarojini_Varadappan
Sidney_Verba
Mieke_Verloo
Eric_Voegelin
Margaret_Vogt
Leah_Vosko
Helen_Wallace
Denise_Walsh
Stephen_Walt
Michael_Walzer
John_Wanna
Georgina_Waylen
Linda_Weiss
Patricia_A._Weitsman
S._Laurel_Weldon
Alexander_Wendt
Darrell_M._West
John_Henry_Whyte
Aaron_Wildavsky
Bruce_A._Williams
Danny_Williams_(politician)
James_Q._Wilson
Woodrow_Wilson
William_Wohlforth
Arnold_Wolf

In [16]:
l = []
for i,x in new_science_dict.items():
    l.append(len(x))
sum(l)

3201

## Scrape af Sociological theories (Asger rod)

In [15]:
resp_soc_t = get_wiki("Category:Sociological_theories")

https://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&rvslots=*&titles=Category:Sociological_theories&format=json


In [12]:
''' soc_t '''
txt = resp_soc_t['query']['pages']['6220146']['revisions'][0]['slots']['main']['*']
txt = txt.split('==A==')[1] # drop intro tekst
txt = txt.split('==Fictional anthropologists==')[0] # drop trailing stuff
soc_theories = re.findall('\[\[(.*?)(?:\|.*?)?\]\]',txt)
print(f'soc_theories: {len(soc_theories)}')

IndexError: list index out of range

In [None]:
baseurl = "https://en.wikipedia.org/w/api.php?"
action = "action=query"
title = f"titles={_page}"
content = "prop=revisions&rvprop=content&rvslots=*"
dataformat ="format=json"

query = "{}{}&{}&{}&{}".format(baseurl, action, content, title, dataformat)
print(query)

if get_txt == True:
    resp = requests.get(query).json()
    page_id = [i for i in resp['query']['pages'].keys()][0] # get page id
    txt = resp['query']['pages'][page_id]['revisions'][0]['slots']['main']['*']
    return txt

else:
    return requests.get(query).json()