In [10]:
from pymongo import MongoClient
import pymongo
import csv
import pandas as pd
import os # For handling file paths
from parameters import db_name, connection_link, collection_issues, data_out_folder

In [11]:
# Dependency note: this notebook needs pymongo. The install command below is
# kept commented out; run it once in the kernel's environment if the import
# in the first cell fails.
#!pip install pymongo
# Echo the settings imported from `parameters` so the saved cell output records
# which MongoDB instance and which output folder this run used.
# NOTE(review): `connection_link` is printed verbatim; if the URI ever embeds
# credentials they would end up in the saved notebook output -- confirm it
# never does, or redact before sharing.
print(f"Using MongoDB URL: {connection_link}")
print(f"Output data folder: {data_out_folder}")

Using MongoDB URL: mongodb://localhost:27017/
Output data folder: C:\Users\hp\Desktop\Module-3-Task-assigning\data\data_output


In [12]:
# Open the MongoDB client and verify the server is reachable right away.
# MongoClient connects lazily, so an explicit `ping` is what actually
# surfaces an unreachable server here instead of in a later cell.
client = pymongo.MongoClient(connection_link)
try:
    client.admin.command('ping')  # Round-trip to the server to check the connection
except pymongo.errors.ConnectionFailure as e:
    print(f"Could not connect to MongoDB: {e}")
    # Release the client's resources, then re-raise so "Run All" stops at this
    # cell with a traceback. (Calling exit() in a notebook shuts the kernel
    # down instead of reporting the failure.)
    client.close()
    raise
else:
    print("Successfully connected to MongoDB.")

Successfully connected to MongoDB.


In [13]:
# Resolve the database and collection handles named in `parameters`.
# Both lookups are lazy: nothing is sent to the server until a query runs.
db = client.get_database(db_name)
collection = db.get_collection(collection_issues)
print(f"Accessing database: '{db_name}', collection: '{collection_issues}'")

Accessing database: 'apache_jira_data', collection: 'issues'


In [14]:
# Query filter: keep only issues that actually carry an assignee, at least one
# label, a priority id and an issue-type id.
#
# Why each clause looks the way it does:
#   * `$exists: True` also matches fields stored as an explicit null, so
#     `$ne: None` is needed to drop those documents.
#   * `$size` only ever matches arrays. `$not: {$size: 0}` therefore rejects
#     empty arrays but is always true for scalar values, which makes it
#     meaningless on `priority.id` / `issuetype.id`; it is kept only on the
#     fields that may hold arrays.
mongo_filter = {
    '$and': [
        {'assignee': {'$exists': True, '$ne': None, '$not': {'$size': 0}}},
        {'labels': {'$exists': True, '$ne': None, '$not': {'$size': 0}}},
        {'priority.id': {'$exists': True, '$ne': None}},
        {'issuetype.id': {'$exists': True, '$ne': None}},
    ]
}

# Projection: fetch only the fields the flattening step reads, to keep the
# documents pulled into memory small.
mongo_projection = {
    '_id': 1,  # Document identifier, exported as a string later on
    'assignee': 1,
    'summary': 1,
    'description': 1,
    'issuetype': 1,
    'labels': 1,
    'priority': 1,
    'status': 1,
    'projectname': 1
}

In [15]:
# Run the query and materialise every matching document in memory.
# A failure is reported and then re-raised: swallowing it would leave
# `list_issues` undefined (or, worse, holding data from an earlier run of this
# cell) and the following cells would silently work on the wrong input.
print("Fetching issues from MongoDB with the provided filter and projection...")
try:
    issues_cursor = collection.find(filter=mongo_filter, projection=mongo_projection)
    list_issues = list(issues_cursor)  # Loads all matching documents into memory
except pymongo.errors.PyMongoError as e:
    print(f"Error fetching data from MongoDB: {e}")
    raise

print(f"Found {len(list_issues)} issues matching the criteria in '{db_name}.{collection_issues}'.")

if not list_issues:
    print("No issues found.")

Fetching issues from MongoDB with the provided filter and projection...
Found 144508 issues matching the criteria in 'apache_jira_data.issues'.


In [16]:
#polars

In [17]:
# size = len(list_issues)
# limit = 1000
# i = 0
# for issue in list_issues:
#     if i % 1000 == 0:
#         print(f"Processing issue {i} of {size}")
#     i += 1    
#     issue["type_id"] = issue["issuetype"]["id"]
#     issue["priority_id"] = issue["priority"]["id"]
#     issue["status_id"] = issue["status"]["id"]
#     issue["type_name"] = issue["issuetype"]["name"]
#     issue["priority_name"] = issue["priority"]["name"]
#     issue["status_name"] = issue["status"]["name"]
#     if i == limit:
#         print(f"Processed {i} issues, stopping at limit.")
#         break

# # Convert data to dataframe and save to csv file
# datapath = os.path.join(data_out_folder, "issues.csv")
# mongo_df = pd.DataFrame(list_issues, columns=['_id', 'projectname', 'assignee', 'summary', 'description', 'type_id',
#                   'type_name', 'labels', 'priority_id', 'priority_name', 'status_id', 'status_name'])
# mongo_df.to_csv(datapath, sep='\t', encoding='utf-8')

In [18]:
def _id_and_name(field_value):
    """Return ``(id, name)`` from a nested field dict, or ``(None, None)`` if it is not a dict."""
    if isinstance(field_value, dict):
        return field_value.get("id"), field_value.get("name")
    return None, None


def _person_name(person):
    """Return a printable name for one assignee entry (a string or a dict), else ``None``."""
    if isinstance(person, str):
        return person
    if isinstance(person, dict):
        # Fall back to 'name' when 'displayName' is missing *or* stored as null.
        display_name = person.get('displayName')
        return display_name if display_name is not None else person.get('name')
    return None


def _assignee_name(assignee_value):
    """Return the assignee as a string; for a list, only its first entry is used."""
    if isinstance(assignee_value, list):
        return _person_name(assignee_value[0]) if assignee_value else None
    return _person_name(assignee_value)


def _labels_text(labels_value):
    """Return labels joined with ``", "``; ``None`` when empty or missing."""
    if not labels_value:
        return None
    if isinstance(labels_value, str):
        # A bare string is a single label; joining it would split it into characters.
        return labels_value
    return ", ".join(map(str, labels_value))


def _flatten_issue(issue):
    """Turn one raw issue document (a dict) into a flat dict of scalar columns."""
    raw_id = issue.get('_id')
    type_id, type_name = _id_and_name(issue.get("issuetype"))
    priority_id, priority_name = _id_and_name(issue.get("priority"))
    status_id, status_name = _id_and_name(issue.get("status"))
    return {
        '_id': str(raw_id) if raw_id is not None else None,
        'projectname': issue.get('projectname'),
        'summary': issue.get('summary'),
        'description': issue.get('description'),
        'type_id': type_id,
        'type_name': type_name,
        'priority_id': priority_id,
        'priority_name': priority_name,
        'status_id': status_id,
        'status_name': status_name,
        'assignee': _assignee_name(issue.get('assignee')),
        'labels': _labels_text(issue.get('labels', [])),
    }


size = len(list_issues)
limit = None  # Set to a positive int to cap how many issues are processed; None (or 0) means all
planned = min(limit, size) if limit else size
print(f"Starting processing. Will process up to {planned} issues out of {size} total.")
processed_issues_for_df = []

# `i` is the 1-based count of issues processed so far; it stays 0 for empty input.
i = 0
for i, original_issue_data in enumerate(list_issues, start=1):
    if (i - 1) % 1000 == 0:
        print(f"Processing original issue {i} (to become processed item {i})...")

    processed_issues_for_df.append(_flatten_issue(original_issue_data))

    if limit and i >= limit:
        print(f"Processed {i} issues, stopping at limit.")
        break

print(f"Finished processing loop. {i} issues were processed and prepared for DataFrame.")

# Column order of the exported file.
df_columns = [
    '_id', 'projectname', 'assignee', 'summary', 'description', 'type_id',
    'type_name', 'labels', 'priority_id', 'priority_name', 'status_id', 'status_name'
]

if not data_out_folder or not os.path.isdir(data_out_folder):
    print(f"Output directory '{data_out_folder}' is not defined or does not exist. Please create it or set the 'data_out_folder' variable.")
else:
    datapath = os.path.join(data_out_folder, "issues_limited_user_cols.csv")
    # `columns=` both orders the columns and creates any that are absent
    # (e.g. when no issues were processed), filled with missing values.
    mongo_df = pd.DataFrame(processed_issues_for_df, columns=df_columns)

    print(f"Saving {len(mongo_df)} issues to CSV: {datapath}")
    try:
        # Tab-separated despite the .csv extension; the row index is not written.
        mongo_df.to_csv(datapath, sep='\t', encoding='utf-8', index=False)
        print(f"Successfully saved data to {datapath}")
    except Exception as e:
        print(f"Error saving CSV file: {e}")

Starting processing. Will process up to None issues out of 144508 total.
Processing original issue 1 (to become processed item 1)...
Processing original issue 1001 (to become processed item 1001)...
Processing original issue 2001 (to become processed item 2001)...
Processing original issue 3001 (to become processed item 3001)...
Processing original issue 4001 (to become processed item 4001)...
Processing original issue 5001 (to become processed item 5001)...
Processing original issue 6001 (to become processed item 6001)...
Processing original issue 7001 (to become processed item 7001)...
Processing original issue 8001 (to become processed item 8001)...
Processing original issue 9001 (to become processed item 9001)...
Processing original issue 10001 (to become processed item 10001)...
Processing original issue 11001 (to become processed item 11001)...
Processing original issue 12001 (to become processed item 12001)...
Processing original issue 13001 (to become processed item 13001)...
P