In [1]:
import json
import os
import pysolr
import requests
import pandas as pd

### Commands to start, stop, and check the status of Solr

- sudo systemctl start solr
- sudo systemctl stop solr
- sudo systemctl status solr

### Reading the JSON file

In [2]:
# Decode explicitly as UTF-8: the summaries contain non-ASCII characters, so
# relying on the platform's default encoding can fail or garble text on
# systems where that default is not UTF-8.
with open("./wiki_documents_scraped_preprocessed.json", "r", encoding="utf-8") as json_file:
    # The cells below treat the result as a mapping from a key to a list of
    # document dicts.
    topic_doc_dict = json.load(json_file)

In [3]:
def is_alphanumeric(text):
    """Tell whether ``text`` is purely alphanumeric once whitespace is ignored.

    Args:
        text: The string to check.

    Returns:
        True if, after dropping every whitespace character, at least one
        character remains and all remaining characters are alphanumeric;
        False otherwise (including for empty or whitespace-only input).
    """
    # str.split() with no argument splits on any run of whitespace (spaces,
    # tabs, newlines, ...), so re-joining the pieces removes all of it rather
    # than just the plain space character.
    cleaned_text = "".join(text.split())

    return cleaned_text.isalnum()

In [4]:
def test_alphanumeric(topic_documents):
    alphanumeric_doc_count = 0
    count = 0
    for key in topic_documents.keys():
        for document_info in topic_documents[key]:
            if is_alphanumeric(document_info["summary"]):
                alphanumeric_doc_count += 1
            else:
                count += 1
                if count % 10 == 0:
                    print("******* BREAKING PREMPTIVELY ********")
    
    # Print the count of alphanumeric stats
    print(f"Number, Percentage of alphanumeric summaries: {alphanumeric_doc_count, (alphanumeric_doc_count/6000)*100}")

#### Test count for alphanumeric documents (Total docs: 6000)

In [5]:
# Run the alphanumeric check over every document in the loaded mapping and
# print the resulting count and percentage.
test_alphanumeric(topic_doc_dict)

Number, Percentage of alphanumeric summaries: (5999, 99.98333333333333)


### Creating a combined collection (a flat list of document dictionaries)

In [6]:
# Flatten the per-key document lists into a single list of document dicts,
# keeping the order in which the keys and their documents appear.
combined_collection = [
    doc_info
    for topic_docs in topic_doc_dict.values()
    for doc_info in topic_docs
]

print("Total documents in the collection: ", len(combined_collection))

Total documents in the collection:  6000


In [7]:
# Display the first document to see which fields each record carries.
combined_collection[0]

{'revision_id': 1166952709,
 'title': 'Common disease-common variant',
 'summary': 'The common disease common variant  often abbreviated CD CV  hypothesis predicts that common disease causing alleles  or variants  will be found in all human populations which manifest a given disease  Common variants  not necessarily disease causing  are known to exist in coding and regulatory sequences of genes  According to the CD CV hypothesis  some of those variants lead to susceptibility to complex polygenic diseases  Each variant at each gene influencing a complex disease will have a small additive or multiplicative effect on the disease phenotype  These diseases  or traits  are evolutionarily neutral in part because so many genes influence the traits  The hypothesis has held  in the case of putative causal variants in apolipoprotein E  including APOE ε4  associated with Alzheimer s disease  IL23R has been found to be associated with Crohn s disease  the at risk allele has a frequency of 93  in th

In [8]:
# Count the documents whose summary is longer than 200 characters.
valid_summary_count = sum(
    1 for doc_info in combined_collection if len(doc_info['summary']) > 200
)

print(valid_summary_count)

6000


## Code to index the documents into Solr

In [8]:
# Name of the Solr core that the cells below delete, re-create, and index into.
CORE_NAME = "IRF23P1"
# Host of the Solr server; Indexer uses it to build the base URL (port 8983).
VM_IP = "localhost"

In [9]:
def delete_core(core=CORE_NAME):
    """Delete the Solr core ``core`` by running the ``solr delete`` CLI as the ``solr`` user.

    Args:
        core: Name of the core to delete; defaults to ``CORE_NAME``.

    The value returned by ``os.system`` is printed, not checked, so a failed
    delete does not raise.
    """
    command = 'sudo su - solr -c "/opt/solr/bin/solr delete -c {core}"'.format(core=core)
    status = os.system(command)
    print(status)


def create_core(core=CORE_NAME):
    """Create the Solr core ``core`` by running the ``solr create`` CLI as the ``solr`` user.

    Args:
        core: Name of the core to create; defaults to ``CORE_NAME``.

    The core is created with the ``data_driven_schema_configs`` config set.
    The value returned by ``os.system`` is printed, not checked.
    """
    command = 'sudo su - solr -c "/opt/solr/bin/solr create -c {core} -n data_driven_schema_configs"'.format(
        core=core)
    status = os.system(command)
    print(status)

In [10]:
class Indexer:
    """Helper that (re)builds the Solr core, declares its fields, and indexes documents.

    Attributes:
        solr_url: Base URL of the Solr server, built from ``VM_IP`` on port 8983.
        connection: ``pysolr.Solr`` client bound to the ``CORE_NAME`` core.
    """

    def __init__(self):
        self.solr_url = f'http://{VM_IP}:8983/solr/'
        core_url = self.solr_url + CORE_NAME
        # Client is configured with always_commit=True and a very large timeout.
        self.connection = pysolr.Solr(core_url, always_commit=True, timeout=5000000)

    def do_initial_setup(self):
        """Delete the default core and create it again, in that order."""
        delete_core()
        create_core()

    def create_documents(self, docs):
        """Add ``docs`` to the core through the pysolr connection and print the response."""
        response = self.connection.add(docs)
        print(response)

    def add_fields(self):
        """POST an ``add-field`` command to the core's ``/schema`` endpoint and print the JSON reply.

        Declares five single-valued fields: ``revision_id``, ``title``,
        ``url`` and ``topic`` as ``string``, and ``summary`` as ``text_en``.
        Only ``revision_id`` sets ``indexed`` explicitly.
        """
        # revision_id is the only field that spells out "indexed".
        field_definitions = [
            {"name": "revision_id", "type": "string", "indexed": True, "multiValued": False},
        ]
        remaining_fields = (
            ("title", "string"),
            ("summary", "text_en"),
            ("url", "string"),
            ("topic", "string"),
        )
        for field_name, field_type in remaining_fields:
            field_definitions.append(
                {"name": field_name, "type": field_type, "multiValued": False}
            )

        data = {"add-field": field_definitions}
        schema_endpoint = self.solr_url + CORE_NAME + "/schema"
        response = requests.post(schema_endpoint, json=data)
        print(response.json())

In [11]:
# Rebuild the core from scratch: do_initial_setup() deletes and then
# re-creates it, so anything previously indexed in the core is dropped.
# add_fields() then declares the schema fields on the fresh core.
i = Indexer()
i.do_initial_setup()
i.add_fields()


Deleting core 'IRF23P1' using command:
http://localhost:8983/solr/admin/cores?action=UNLOAD&core=IRF23P1&deleteIndex=true&deleteDataDir=true&deleteInstanceDir=true

0

Created new core 'IRF23P1'
0
{'responseHeader': {'status': 0, 'QTime': 410}}


In [12]:
# Send every document dict in combined_collection to the core for indexing.
i.create_documents(combined_collection)

{
  "responseHeader":{
    "status":0,
    "QTime":4118}}

