In [1]:
import wikipediaapi
import pandas as pd
import os
import openai
import docx

## 1.1. Downloading Articles from Wiki regarding Tasks of Natural Language Processing

In [2]:


# wiki_wiki = wikipediaapi.Wikipedia(
#         language='en',
#         extract_format=wikipediaapi.ExtractFormat.WIKI
# )

# def get_category_text(categorymembers):
#     terms = []
#     sections = []
#     texts = []
#     for c in categorymembers.values():
#         term = c.title
#         try:
#             page = wiki_wiki.page(c.title)
#             for sec in page.sections:
#                 section_title = sec.title
#                 section_text = sec.text
#                 terms.append(term)
#                 sections.append(section_title)
#                 texts.append(section_text)
#         except:
#             continue
#         if c.ns == wikipediaapi.Namespace.CATEGORY:
#             nested_terms, nested_sections, nested_texts = get_category_text(c.categorymembers)
#             terms.extend(nested_terms)
#             sections.extend(nested_sections)
#             texts.extend(nested_texts)
#     return terms, sections, texts

# cat = wiki_wiki.page("Category:Tasks of natural language processing")
# terms, sections, texts = get_category_text(cat.categorymembers)
# df = pd.DataFrame({'article': terms, 'section': sections, 'text': texts})


In [3]:
wiki_wiki = wikipediaapi.Wikipedia(
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI
)

def get_category_text(categorymembers):
    terms = []
    sections = []
    texts = []
    for c in categorymembers.values():
        term = c.title
        try:
            page = wiki_wiki.page(c.title)
            intro_text = page.text[0:page.text.index("\n\n")]
            terms.append(term)
            sections.append("intro")
            texts.append(intro_text)
            for sec in page.sections:
                section_title = sec.title
                section_text = sec.text
                terms.append(term)
                sections.append(section_title)
                texts.append(section_text)
        except:
            continue
        if c.ns == wikipediaapi.Namespace.CATEGORY:
            nested_terms, nested_sections, nested_texts = get_category_text(c.categorymembers)
            terms.extend(nested_terms)
            sections.extend(nested_sections)
            texts.extend(nested_texts)
    return terms, sections, texts

cat = wiki_wiki.page("Category:Tasks of natural language processing")
terms, sections, texts = get_category_text(cat.categorymembers)
df = pd.DataFrame({'article': terms, 'section': sections, 'text': texts})


In [4]:
df

Unnamed: 0,article,section,text
0,Anaphora (linguistics),intro,"In linguistics, anaphora () is the use of an e..."
1,Anaphora (linguistics),Nomenclature and examples,The term anaphora is actually used in two ways...
2,Anaphora (linguistics),In generative grammar,The term anaphor is used in a special way in t...
3,Anaphora (linguistics),Complement anaphora,"In some cases, anaphora may refer not to its u..."
4,Anaphora (linguistics),Anaphora resolution – centering theory,There are many theories that attempt to prove ...
...,...,...,...
188,Textual entailment,External links,Textual Entailment Resource Pool
189,Truecasing,intro,"Truecasing, also called capitalization recover..."
190,Truecasing,Techniques,Neural network models that operate at the word...
191,Truecasing,Applications,"Truecasing aids in other NLP tasks, such as na..."


In [5]:
df.article.unique()

array(['Anaphora (linguistics)', 'Automated essay scoring',
       'Automatic hyperlinking', 'Automatic summarization',
       'CLAWS (linguistics)', 'Collocation extraction', 'Entity linking',
       'Google Neural Machine Translation', 'Language identification',
       'Lemmatisation', 'Linguistic empathy', 'Machine translation',
       'Mobile translation',
       'Name resolution (semantics and text extraction)',
       'Named-entity recognition', 'Neural machine translation',
       'Open information extraction', 'Part-of-speech tagging',
       'Phrase chunking', 'Question answering', 'Relationship extraction',
       'Résumé parsing', 'Semantic parsing', 'Semantic role labeling',
       'Sentence boundary disambiguation', 'Shallow parsing', 'Stemming',
       'Terminology extraction', 'Text segmentation',
       'Text simplification', 'Textual entailment', 'Truecasing'],
      dtype=object)

In [6]:
df.loc[(df["article"]=="Anaphora (linguistics)")&(df["section"]=="intro")].text.values

array(['In linguistics, anaphora () is the use of an expression whose interpretation depends upon another expression in context (its antecedent or postcedent). In a narrower sense, anaphora is the use of an expression that depends specifically upon an antecedent expression and thus is contrasted with cataphora, which is the use of an expression that depends upon a postcedent expression. The anaphoric (referring) term is called an anaphor. For example, in the sentence Sally arrived, but nobody saw her, the pronoun her is an anaphor, referring back to the antecedent Sally. In the sentence Before her arrival, nobody saw Sally, the pronoun her refers forward to the postcedent Sally, so her is now a cataphor (and an anaphor in the broader, but not the narrower, sense). Usually, an anaphoric expression is a pro-form or some other kind of deictic (contextually dependent) expression. Both anaphora and cataphora are species of endophora, referring to something mentioned elsewhere in a dialog or

## 1.2. Removing Irrelevant Sections

In [7]:

discard_categories = ['See also', 'References', 'External links', 'Further reading', "Footnotes",
    "Bibliography", "Sources", "Citations", "Literature", "Footnotes", "Notes and references",
    "Photo gallery", "Works cited", "Photos", "Gallery", "Notes", "References and sources",
    "References and notes", "External links and suggested reading"]

df['token_count'] = df.text.str.replace(',','').str.split().str.len()

df = df[~df['section'].isin(discard_categories)]
df=df.loc[df["token_count"]>20]
df=df.loc[df["token_count"]<800] #Removing lengthy sections due to the the model's limitations
df['context'] = df.article + "\n" + df.section + "\n\n" + df.text

## 1.3. Logging to OpenAI's API

In [8]:
def read_api_credentials_from_word(file_path):
    # Open the Word document using the python-docx library
    doc = docx.Document(file_path)
    # Initialize empty dictionaries to store the credentials
    api_credentials = {}
    # Iterate through all the paragraphs in the document
    for para in doc.paragraphs:
        # Split the paragraph text into words
        words = para.text.split()
        # Check if the first word is "openai.organization"
        if words[0] == "openai.organization":
            # Store the organization name in the api_credentials dictionary
            api_credentials["organization"] = words[2]
        # Check if the first word is "openai.api_key"
        if words[0] == "openai.api_key":
            # Store the api key in the api_credentials dictionary
            api_credentials["api_key"] = words[2]
    # Return the api_credentials dictionary
    return api_credentials

# Example usage
file_path = "/Users/daneran/credentials.docx"
api_credentials = read_api_credentials_from_word(file_path)

openai.organization= api_credentials["organization"]
openai.api_key = api_credentials["api_key"]
openai.Model.list()

<OpenAIObject list at 0x7fc032807dd0> JSON: {
  "data": [
    {
      "created": 1649358449,
      "id": "babbage",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampling": true,
          "allow_search_indices": false,
          "allow_view": true,
          "created": 1669085501,
          "group": null,
          "id": "modelperm-49FUp5v084tBB49tC4z8LPH5",
          "is_blocking": false,
          "object": "model_permission",
          "organization": "*"
        }
      ],
      "root": "babbage"
    },
    {
      "created": 1649357491,
      "id": "ada",
      "object": "model",
      "owned_by": "openai",
      "parent": null,
      "permission": [
        {
          "allow_create_engine": false,
          "allow_fine_tuning": false,
          "allow_logprobs": true,
          "allow_sampli

## 1.4. Generating Questions

In [9]:
def get_questions(context):
    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write questions based on the text below\n\nText: {context}\n\nQuestions:\n1.",
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n\n"]
        )
        return response['choices'][0]['text']
    except:
        return ""


df['questions'] = df.context.apply(get_questions)
df['questions'] = "1." + df.questions

In [10]:
df.questions.values

array(['1. What is anaphora?\n2. What is the difference between anaphora and cataphora?\n3. What are some examples of anaphora?\n4. Why is anaphora important?',
       '1. What is an anaphor?\n2. What is the traditional binding theory?\n3. What is the syntactic relationship between a pro-form and its antecedent?',
       '1. What is anaphora?\n2. What is complement anaphora?\n3. What are the different referents in complement anaphora?\n4. What is the significance of resolving complement anaphora?',
       '1. What is anaphora resolution?\n2. What is centering theory?\n3. What are the different types of centers in centering theory?\n4. What are the constraints on antecedents in centering theory?',
       '1. What is automated essay scoring?\n2. What factors have contributed to the growing interest in automated essay scoring?\n3. What are the benefits of automated essay scoring?\n4. What are the drawbacks of automated essay scoring?',
       '1. What is the history of automated essay sco

## 1.5. Generating Answers 

In [11]:
def get_answers(row):
    try:
        response = openai.Completion.create(
            engine="davinci-instruct-beta-v3",
            prompt=f"Write answers based on the text below\n\nText: {row.context}\n\nQuestions:\n{row.questions}\n\nAnswers:\n1.",
            temperature=0,
            max_tokens=257,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0
        )
        return response['choices'][0]['text']
    except Exception as e:
        print (e)
        return ""


df['answers']= df.apply(get_answers, axis=1)
df['answers'] = "1." + df.answers
df = df.dropna().reset_index().drop('index',axis=1)


In [12]:
df.article.unique()

array(['Anaphora (linguistics)', 'Automated essay scoring',
       'Automatic hyperlinking', 'Automatic summarization',
       'CLAWS (linguistics)', 'Collocation extraction', 'Entity linking',
       'Google Neural Machine Translation', 'Language identification',
       'Lemmatisation', 'Linguistic empathy', 'Machine translation',
       'Mobile translation',
       'Name resolution (semantics and text extraction)',
       'Named-entity recognition', 'Neural machine translation',
       'Open information extraction', 'Part-of-speech tagging',
       'Phrase chunking', 'Question answering', 'Relationship extraction',
       'Résumé parsing', 'Semantic parsing', 'Semantic role labeling',
       'Sentence boundary disambiguation', 'Shallow parsing', 'Stemming',
       'Terminology extraction', 'Text segmentation',
       'Text simplification', 'Textual entailment', 'Truecasing'],
      dtype=object)

## 1.6. An Example

In [18]:
df.loc[(df["article"]=="Named-entity recognition")]

Unnamed: 0,article,section,text,token_count,context,questions,answers
52,Named-entity recognition,intro,Named-entity recognition (NER) (also known as ...,69,Named-entity recognition\nintro\n\nNamed-entit...,1. What is the purpose of named-entity recogni...,1. Named-entity recognition is a subtask of in...
53,Named-entity recognition,Named-entity recognition platforms,Notable NER platforms include:\n\nGATE support...,44,Named-entity recognition\nNamed-entity recogni...,1. What is a named-entity recognition platform...,1. A named-entity recognition platform is a so...
54,Named-entity recognition,Problem definition,"In the expression named entity, the word named...",445,Named-entity recognition\nProblem definition\n...,1. What is the difference between a name and a...,1. A name is a string of tokens that stands (f...
55,Named-entity recognition,Approaches,NER systems have been created that use linguis...,84,Named-entity recognition\nApproaches\n\nNER sy...,1. What are the different types of NER systems...,1. There are different types of NER systems th...
56,Named-entity recognition,Problem domains,"In 2001, research indicated that even state-of...",176,Named-entity recognition\nProblem domains\n\nI...,1. What is the main problem with NER systems?\...,1. The main problem with NER systems is that t...
57,Named-entity recognition,Current challenges and research,Despite high F1 numbers reported on the MUC-7 ...,238,Named-entity recognition\nCurrent challenges a...,1. What are the main efforts in named-entity r...,1. The main efforts in named-entity recognitio...


In [20]:
df.loc[(df["article"]=="Named-entity recognition")&(df["section"]=="Approaches")].text.values

array(['NER systems have been created that use linguistic grammar-based techniques as well as statistical models such as machine learning. Hand-crafted grammar-based systems typically obtain better precision, but at the cost of lower recall and months of work by experienced computational linguists. Statistical NER systems typically require a large amount of manually annotated training data. Semisupervised approaches have been suggested to avoid part of the annotation effort.Many different classifier types have been used to perform machine-learned NER, with conditional random fields being a typical choice.'],
      dtype=object)

In [21]:
df.loc[(df["article"]=="Named-entity recognition")&(df["section"]=="Approaches")].questions.values

array(['1. What are the different types of NER systems?\n2. What are the benefits of using a grammar-based system?\n3. What is the typical choice for a classifier type when performing machine-learned NER?'],
      dtype=object)

In [22]:
df.loc[(df["article"]=="Named-entity recognition")&(df["section"]=="Approaches")].answers.values

array(['1. There are different types of NER systems that use linguistic grammar-based techniques and statistical models.\n2. The benefits of using a grammar-based system are that they typically obtain better precision.\n3. The typical choice for a classifier type when performing machine-learned NER is conditional random fields.'],
      dtype=object)