In [1]:
!which python

/home/aikochou/.conda/envs/knowledge_gaps/bin/python


In [2]:
from wmfdata.spark import get_session
spark = get_session(type='regular')

%load_ext autoreload
%autoreload 2

repo='/home/aikochou/research-ml'
import sys
sys.path.append(f"{repo}/research-transform")

PySpark executors will use /usr/lib/anaconda-wmf/bin/python3.


In [3]:
from research_transform.knowledge_gaps import content_gaps
import pyspark.sql.functions as F

mediawiki_snapshot = '2021-09'
wikidata_snapshot = '2021-10-04'

In [9]:
def add_label(spark, input_df, col, wikidata_snapshot):
    input_df.createOrReplaceTempView('input_df')
    query = f"""
        SELECT input_df.*, label_val AS gender_label
        FROM input_df
        LEFT JOIN wmf.wikidata_entity we
        ON input_df.{col} = we.id
        LATERAL VIEW explode(labels) t AS label_lang, label_val
        WHERE t.label_lang='en'
        AND typ='item'
        AND we.snapshot='{wikidata_snapshot}'
    """
    df = spark.sql(query)
    return df

In [5]:
wikidata_qitems = content_gaps.wikidata_qitems_df(spark, wikidata_snapshot) # around ~94M items
wikidata_properties = content_gaps.wikidata_properties(spark, wikidata_snapshot) # around ~1338M property-value pairs

In [6]:
wikidata_properties.cache()

DataFrame[qitem_id: string, property: string, value: string]

In [7]:
wikidata_qitems = content_gaps.append_is_human(wikidata_qitems, wikidata_properties)
wikidata_qitems = content_gaps.append_gender(wikidata_qitems, wikidata_properties)
wikidata_qitems = content_gaps.append_sexual_orientation(wikidata_qitems, wikidata_properties)
wikidata_qitems.printSchema()

root
 |-- qitem_id: string (nullable = true)
 |-- is_human: boolean (nullable = false)
 |-- gender: string (nullable = true)
 |-- sexual_orientation: string (nullable = true)



In [8]:
wikidata_qitems.cache()

DataFrame[qitem_id: string, is_human: boolean, gender: string, sexual_orientation: string]

In [10]:
biographies = wikidata_qitems.where(F.col('is_human') == True)

In [13]:
biographies.count() # ~9M

9372956

In [12]:
biographies.cache()

DataFrame[qitem_id: string, is_human: boolean, gender: string, sexual_orientation: string]

In [11]:
biographies.where(F.col('gender').isNull()).count() # ~1.9M, most of human qtiems have gender

1954873

In [14]:
biographies.where(F.col('sexual_orientation').isNull()).count() # ~9M, only ~3k have sexual_orientation

9369809

In [15]:
gender_categories = biographies.groupby('gender').count()
gender_categories = add_label(spark, gender_categories, 'gender', wikidata_snapshot)
gender_categories.cache()

DataFrame[gender: string, count: bigint, gender_label: string]

In [16]:
gender_categories = gender_categories.toPandas()

In [17]:
gender_categories.sort_values('count', ascending=False) # human qitems

Unnamed: 0,gender,count,gender_label
21,Q6581097,5659589,"""male"""
27,Q6581072,1756388,"""female"""
26,Q1052281,1046,"""transgender female"""
23,Q48270,428,"""non-binary"""
36,Q2449503,265,"""transgender male"""
2,Q179294,129,"""eunuch"""
34,Q1097630,72,"""intersex"""
9,Q18116794,32,"""genderfluid"""
19,Q12964198,27,"""genderqueer"""
32,Q15145779,14,"""cisgender female"""


In [19]:
sexual_orientation_categories = biographies.groupby('sexual_orientation').count()
sexual_orientation_categories = add_label(spark, sexual_orientation_categories, 'sexual_orientation', wikidata_snapshot)
sexual_orientation_categories.cache()

DataFrame[sexual_orientation: string, count: bigint, gender_label: string]

In [20]:
sexual_orientation_categories = sexual_orientation_categories.toPandas()
sexual_orientation_categories.sort_values('count', ascending=False)

Unnamed: 0,sexual_orientation,count,gender_label
11,Q6636,1010,"""homosexuality"""
12,Q6649,801,"""lesbianism"""
9,Q43200,576,"""bisexuality"""
7,Q592,347,"""gay"""
1,Q339014,193,"""non-heterosexuality"""
10,Q1035954,111,"""heterosexuality"""
13,Q271534,60,"""pansexuality"""
2,Q724351,33,"""asexuality"""
0,Q23912283,7,"""demisexuality"""
14,Q20011275,3,"""sapiosexuality"""
