## Establishing Connection


In [None]:
from arango import ArangoClient

ArangoClient = ArangoClient(hosts="http://127.0.0.1:8530")

db = ArangoClient.db("yelp_db", username="super", password="grantaccess")

collections = db.collections()


## Preprocess Json data for easy loading

In [None]:
import json

def preprocess_json(input_file, output_file):
    with open(input_file, 'r') as f:
        # Read the entire file content
        data = f.read()

        # Split the content by newline character to handle multiple JSON objects
        json_objects = data.strip().split('\n')

    # Process each JSON object separately
    processed_data = []
    for json_str in json_objects:
        try:
            # Load each JSON object separately
            obj = json.loads(json_str)
            processed_data.append(obj)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)

    # Write processed data to the output file
    with open(output_file, 'w') as f:
        json.dump(processed_data, f)

if __name__ == "__main__":
    input_file = "yelp_academic_dataset_user.json"  # Replace with the path to your input JSON file
    output_file = "yelp_academic_dataset_user_updated.json"  # Replace with the path to your output JSON file
    preprocess_json(input_file, output_file)


In [None]:
import json
collection_name = "user"
collection=db.collection(collection_name)
# Path to the JSON file
json_file = "yelp_academic_dataset_user_updated.json"  

# Read the JSON data from file
with open(json_file, 'r') as f:
    data = json.load(f)

# Bulk insert the data into the collection
collection.import_bulk(data)

## Defining Edge collection based on uploaded Collection

In [None]:
from arango import ArangoClient

# Connect to ArangoDB
client = ArangoClient(hosts='http://127.0.0.1:8530')
db = client.db('yelp_db', username='root', password='')



In [None]:
tipscollection=db.collection('tip')

cursor = tipscollection.all()

In [None]:
# Your function to retrieve records and create edge documents
def process_records(cursor):
    edge_documents = []
    for record in cursor:
        # Assuming each record is a dictionary
        # Generate edge document for business
        edge_documents.append({
            "_from": f"tips/{record['_key']}",  # Assuming '_key' is the document key
            "_to": f"business/{record['business_id']}"
        })
    return edge_documents

# Process the cursor to generate edge documents
edge_documents = process_records(cursor)

In [None]:
print(len(edge_documents))

In [None]:
# Function to insert records into an edge collection in batches
def insert_edges(collection_name, edge_documents, batch_size=1000):
    for i in range(0, len(edge_documents), batch_size):
        batch = edge_documents[i:i+batch_size]
        db[collection_name].import_bulk(batch)

# Split the edge documents into batches and insert them into the collection
# insert_edges('tip_business', edge_documents)


In [None]:
cursor = tipscollection.all()

In [None]:
def process_records_user(cursor):
    edge_documents = []
    for record in cursor:
        edge_documents.append({
            "_from": f"tips/{record['_key']}",  # Assuming '_key' is the document key
            "_to": f"user/{record['user_id']}"
        })
    return edge_documents


edge_documents_user = process_records_user(cursor)

In [None]:
print(len(edge_documents_user))

In [None]:
insert_edges('tip_user', edge_documents_user)

In [None]:
checkinscollection=db.collection('checkin')
checkin_records = checkinscollection.all()

In [None]:
print(len(checkin_records))

In [None]:
def process_records_checkin_business(cursor):
    edge_documents = []
    for record in cursor:
        edge_documents.append({
            "_from": f"checkin/{record['_key']}",  # Assuming '_key' is the document key
            "_to": f"business/{record['business_id']}"
        })
    return edge_documents

In [None]:
edge_documents_checkin = process_records_checkin_business(checkin_records)

In [None]:
insert_edges('checkin_business', edge_documents_checkin)

In [None]:
def process_records_review_business(cursor):
    edge_documents = []
    for record in cursor:
        edge_documents.append({
            "_from": f"review/{record['_key']}",  # Assuming '_key' is the document key
            "_to": f"business/{record['business_id']}"
        })
    return edge_documents
def process_records_review_user(cursor):
    edge_documents = []
    for record in cursor:
        edge_documents.append({
            "_from": f"review/{record['_key']}",  # Assuming '_key' is the document key
            "_to": f"user/{record['user_id']}"
        })
    return edge_documents

In [None]:
reviewsCollection=db.collection('review')

review_records = reviewsCollection.all()


In [None]:
print(len(review_records))

In [None]:
type(review_records)

In [None]:
edge_documents_review_business = process_records_review_business(review_records)

In [None]:
insert_edges('review_business', edge_documents_review_business)

In [None]:
edge_documents_review_user = process_records_review_user(review_records)

In [None]:
insert_edges('review_user', edge_documents_review_user)

## Analysis on Data


In [2]:
from arango import ArangoClient

ArangoClient = ArangoClient(hosts="http://127.0.0.1:8530")

db = ArangoClient.db("yelp_db", username="super", password="grantaccess")

In [10]:
#fetch and plot different categories of businesses

categories=db.aql.execute("FOR b IN business RETURN b.categories")

import matplotlib.pyplot as plt
import pandas as pd
categories_df = pd.DataFrame(categories)
categories_df.head()   

print(categories_df.columns)

RangeIndex(start=0, stop=1, step=1)


In [9]:
import pandas as pd
import matplotlib.pyplot as plt

# Assuming the DataFrame is already created and named 'categories_df'

# Split the comma-separated categories and create a list of categories
categories_df['category_list'] = categories_df.apply(lambda x: x.split(','))

# Create a new DataFrame to store individual categories
categories_list = [category for sublist in categories_df['category_list'] for category in sublist]
categories_count_df = pd.DataFrame(categories_list, columns=['category'])

# Count the occurrences of each category
category_counts = categories_count_df['category'].value_counts().reset_index()
category_counts.columns = ['category', 'count']

# Sort the categories by count in descending order
category_counts = category_counts.sort_values(by='count', ascending=False)

# Plot the top N categories (adjust 'N' as needed)
top_n = 10
top_categories = category_counts.head(top_n)

# Plot the bar chart
plt.figure(figsize=(10, 6))
plt.bar(top_categories['category'], top_categories['count'], color='skyblue')
plt.xlabel('Category')
plt.ylabel('Number of Businesses')
plt.title('Top {} Business Categories'.format(top_n))
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


AttributeError: 'Series' object has no attribute 'split'