In [1]:
import os
from elasticsearch import Elasticsearch, helpers
import json

In [2]:
# Connection settings are read from the environment when present so that
# credentials do not have to live in the notebook. The fallbacks reproduce the
# values this notebook used before, so an unconfigured local run behaves the same.
# NOTE(review): drop the password fallback before sharing or committing this file.
ES_URL = os.environ.get("ES_URL", "http://localhost:9200")
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD", "sD5F3szZ")

# Shared client used by every cell below.
client = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASSWORD))

In [3]:

# with open('answers/q1.json', 'w') as f:
#     json.dump(dict(response), f, indent=4)

In [4]:
#q1
# Save the first five key/value pairs of the cluster health response.
health = client.cluster.health()

# Keep the entries in the order the response yields them, then take the first five.
nested_lists = [[key, value] for key, value in health.items()]
pairs = nested_lists[:5]

# This is the first cell that writes into answers/; create the folder up front
# so a fresh checkout does not fail with FileNotFoundError.
os.makedirs("answers", exist_ok=True)

with open('answers/q1.json', 'w') as f:
    json.dump(dict(pairs), f, indent=4)

In [6]:
#q2
# Rebuild the "madmap" index from every JSON file in data/jsons/ (except
# places.json) and save the resulting index mapping to answers/q2.json.

index_name = "madmap"
data_folder = "data/jsons/"

# Start from a clean index so re-running this cell never duplicates documents.
# ignore_unavailable=True turns the delete into a no-op when the index does not
# exist yet, while other failures (connection, authentication, ...) still raise
# here instead of being printed and then failing later in a less obvious place.
client.indices.delete(index=index_name, ignore_unavailable=True)
client.indices.create(index=index_name)

# os.listdir() returns names in arbitrary order; sort them so the indexing
# order is the same on every run and machine.
json_files = sorted(
    f for f in os.listdir(data_folder) if f.endswith(".json") and f != "places.json"
)

operations = []
for json_file in json_files:
    file_path = os.path.join(data_folder, json_file)
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # The documents to index are the elements stored under the file's first
    # top-level key.
    source = list(data.keys())[0]
    operations.extend({"_index": index_name, "_source": doc} for doc in data[source])

helpers.bulk(client, operations)

# Bulk-indexed documents only become searchable after a refresh. Force one so
# the search cells that follow see every document even under "Run All".
client.indices.refresh(index=index_name)

response = client.indices.get_mapping(index=index_name)

with open("answers/q2.json", "w") as f:
    json.dump(dict(response), f, indent=2)

In [7]:
#q3
# Add a "wiki" text field to the index mapping, index every .txt file from
# data/text as a document holding only that field, and save the updated
# mapping to answers/q3.json.

# Set explicitly (as every later cell does) instead of relying on the value
# left behind by the q2 cell.
index_name = "madmap"

mapping_update = {
    "properties": {
        "wiki": {
            "type": "text"
        }
    }
}

client.indices.put_mapping(index=index_name, body=mapping_update)

txt_files_dir = "data/text"

# os.listdir() returns names in arbitrary order; sort for a reproducible
# indexing order.
txt_files = sorted(f for f in os.listdir(txt_files_dir) if f.endswith(".txt"))

# Bulk upload text files: one document per file, whole file content in "wiki".
operations = []
for txt_file in txt_files:
    file_path = os.path.join(txt_files_dir, txt_file)
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, "r", encoding="utf-8") as f:
        data = f.read()
    operations.append({
        "_index": index_name,
        "_source": {
            "wiki": data
        }
    })

helpers.bulk(client, operations)

# Bulk-indexed documents only become searchable after a refresh. Force one so
# the wiki searches in later cells see these documents even under "Run All".
client.indices.refresh(index=index_name)

mapping_response = client.indices.get_mapping(index=index_name)

with open("answers/q3.json", "w") as f:
    json.dump(dict(mapping_response), f, indent=2)


In [8]:
#q4
# Match query for "University" on formattedAddress, requesting up to 10000
# hits; the complete search response is written to answers/q4.json.
index_name = "madmap"

address_match = {"match": {"formattedAddress": "University"}}
query = {"query": address_match, "size": 10000}

response = client.search(index=index_name, body=query)

with open("answers/q4.json", "w") as out_file:
    json.dump(dict(response), out_file, indent=4)

In [9]:
#q5
# Fuzzy query for "Madson" on the title field with AUTO fuzziness. Only the
# title is requested in _source; the full response goes to answers/q5.json.
index_name = "madmap"

fuzzy_title = {"value": "Madson", "fuzziness": "AUTO"}
query_q5 = {
    "query": {"fuzzy": {"title": fuzzy_title}},
    "_source": ["title"],
    "size": 10000,
}

response_q5 = client.search(index=index_name, body=query_q5)

with open("answers/q5.json", "w") as out_file:
    json.dump(dict(response_q5), out_file, indent=4)

In [10]:
#q6
# Documents where the phrase "Wisconsin Badgers" appears in at least one of
# title, description or content; the full response goes to answers/q6.json.
index_name = "madmap"

# One match_phrase clause per searched field, in the order title, description,
# content.
phrase_clauses = [
    {"match_phrase": {field_name: "Wisconsin Badgers"}}
    for field_name in ("title", "description", "content")
]

query_q6 = {
    "query": {
        "bool": {"should": phrase_clauses, "minimum_should_match": 1},
    },
    "size": 10000,
}

response_q6 = client.search(index=index_name, body=query_q6)

with open("answers/q6.json", "w") as out_file:
    json.dump(dict(response_q6), out_file, indent=4)

In [11]:
#q7
# Documents that have a formattedAddress field which does not match the phrase
# "Madison". Only name and formattedAddress are requested in _source; the full
# response goes to answers/q7.json.
index_name = "madmap"

has_address = {"exists": {"field": "formattedAddress"}}
address_is_madison = {"match_phrase": {"formattedAddress": "Madison"}}

query_q7 = {
    "query": {
        "bool": {"must": [has_address], "must_not": [address_is_madison]},
    },
    "_source": ["name", "formattedAddress"],
    "size": 10000,
}

response_q7 = client.search(index=index_name, body=query_q7)

with open("answers/q7.json", "w") as out_file:
    json.dump(dict(response_q7), out_file, indent=4)

In [12]:
#q8
# simple_query_string search over the wiki field with AND as the default
# operator. Only wiki is requested in _source; the full response goes to
# answers/q8.json.
# NOTE(review): the query text uses `^3` / `^2`. Confirm that
# simple_query_string honors `^N` as a per-term boost; if it does not, the
# intended weighting is not applied.
index_name = "madmap"

wiki_query_string = {
    "query": "rivalry^3 football^2 badgers",
    "fields": ["wiki"],
    "default_operator": "AND",
}

query_q8 = {
    "query": {"simple_query_string": wiki_query_string},
    "_source": ["wiki"],
    "size": 10000,
}

response_q8 = client.search(index=index_name, body=query_q8)

with open("answers/q8.json", "w") as out_file:
    json.dump(dict(response_q8), out_file, indent=4)


In [13]:
#q9
# match_phrase search for "rivalry" in wiki with highlighting enabled on that
# field. Only the highlight section of the first hit is written to
# answers/q9.json.
index_name = "madmap"

wiki_highlight = {"fields": {"wiki": {}}}

query_q9 = {
    "query": {"match_phrase": {"wiki": "rivalry"}},
    "_source": ["wiki"],
    "highlight": wiki_highlight,
}

response_q9 = client.search(index=index_name, body=query_q9)

# The first entry of hits.hits is the top hit; this raises IndexError if the
# search returned nothing.
hits_q9 = response_q9["hits"]["hits"]
top_hit = hits_q9[0]["highlight"]

with open("answers/q9.json", "w") as out_file:
    json.dump(dict(top_hit), out_file, indent=4)

In [14]:
#q10
# Documents whose source.name.keyword is exactly "Nasa". Only title,
# source.name and publishedAt are requested in _source; the full response goes
# to answers/q10.json.
index_name = "madmap"

# The earlier version additionally required
#     {"term": {"_index": "news_madison"}}
# but this search targets the "madmap" index, and this notebook never creates
# an index or alias called "news_madison". That clause therefore could never
# match and the query always returned zero hits, so it has been removed.
query_q10 = {
    "query": {
        "bool": {
            "must": [
                {"term": {"source.name.keyword": "Nasa"}}
            ]
        }
    },
    "_source": ["title", "source.name", "publishedAt"],
    "size": 10000
}

response_q10 = client.search(index=index_name, body=query_q10)
with open("answers/q10.json", "w") as f:
    json.dump(dict(response_q10), f, indent=4)

In [15]:
#q11
# Sum aggregation over the arrests field (no hits requested); the result is
# converted to float and written to answers/q11.json.
index_name = "madmap"

arrests_sum_agg = {"sum": {"field": "arrests"}}
query_q11 = {"size": 0, "aggs": {"total_arrests": arrests_sum_agg}}

response_q11 = client.search(index=index_name, body=query_q11)

total_arrests_sum = answer = float(
    response_q11["aggregations"]["total_arrests"]["value"]
)

with open("answers/q11.json", "w") as json_file:
    json.dump(total_arrests_sum, json_file, indent=4)


In [16]:
#q12
# Terms aggregation on source.name.keyword: ten buckets ordered by descending
# document count (no hits requested). The bucket list is written to
# answers/q12.json.
index_name = "madmap"

source_terms = {
    "field": "source.name.keyword",
    "size": 10,
    "order": {"_count": "desc"},
}

query_q12 = {
    "size": 0,
    "aggs": {"source_count": {"terms": source_terms}},
}

response_q12 = client.search(index=index_name, body=query_q12)

source_count = response_q12["aggregations"]["source_count"]["buckets"]

with open("answers/q12.json", "w") as json_file:
    json.dump(source_count, json_file, indent=4)



In [17]:
#q13
# value_count aggregation on name.keyword (no hits requested); the resulting
# count is written to answers/q13.json.
index_name = "madmap"

name_value_count = {"value_count": {"field": "name.keyword"}}
query_q13 = {"size": 0, "aggs": {"location_name_count": name_value_count}}

response_q13 = client.search(index=index_name, body=query_q13)

aggregations_q13 = response_q13["aggregations"]
location_name_count = aggregations_q13["location_name_count"]["value"]

with open("answers/q13.json", "w") as json_file:
    json.dump(location_name_count, json_file, indent=4)


In [18]:
#q14
# cardinality aggregation on author.keyword (no hits requested); the value it
# reports is written to answers/q14.json.
index_name = "madmap"

author_cardinality = {"cardinality": {"field": "author.keyword"}}
query_q14 = {"size": 0, "aggs": {"unique_authors": author_cardinality}}

response_q14 = client.search(index=index_name, body=query_q14)

aggregations_q14 = response_q14["aggregations"]
unique_authors = aggregations_q14["unique_authors"]["value"]

with open("answers/q14.json", "w") as json_file:
    json.dump(unique_authors, json_file, indent=4)


In [19]:
#q15
# avg aggregation on the attended field (no hits requested); the resulting
# value is written to answers/q15.json.
index_name = "madmap"

attended_avg = {"avg": {"field": "attended"}}
query_q15 = {"size": 0, "aggs": {"avg_attended": attended_avg}}

response_q15 = client.search(index=index_name, body=query_q15)

aggregations_q15 = response_q15["aggregations"]
avg_attended = aggregations_q15["avg_attended"]["value"]

with open("answers/q15.json", "w") as json_file:
    json.dump(avg_attended, json_file, indent=4)
