In [4]:
import numpy as np 
import pandas as pd
import random
import string
import time

In [5]:


def generate_support_tickets(num_tickets=200):
    # --- Configuration ---
    
    # Categories with associated weights (e.g., 'Login Issue' happens most often)
    categories = ['Login Issue', 'Billing Inquiry', 'System Crash', 'Feature Request']
    cat_weights = [45, 30, 15, 10] 
    
    priorities = ['Low', 'Medium', 'High', 'Critical']
    statuses = ['Open', 'In Progress', 'Resolved', 'Closed']
    
    tickets = []

    # --- Generation Logic ---
    for i in range(1, num_tickets + 1):
        
        # 1. Weighted Category Selection
        # random.choices returns a list, so we take the [0] index
        category = random.choices(categories, weights=cat_weights, k=1)[0]
        
        # 2. Priority Logic (Optional: make critical issues rarer)
        priority = random.choices(priorities, weights=[40, 40, 15, 5], k=1)[0]
        
        # 3. Dirty Data Simulation for 'resolution_minutes'
        # We generate a random float between 0 and 1 to decide the data quality
        risk_roll = random.random()
        
        if risk_roll < 0.05:
            # 5% chance of missing data
            resolution_min = None 
        elif risk_roll < 0.10:
            # 5% chance of malformed data (string instead of int)
            resolution_min = "TBD" 
        else:
            # 90% chance of valid data (integer)
            # Resolution time varies roughly by priority
            if priority == 'Critical':
                resolution_min = random.randint(60, 480)
            else:
                resolution_min = random.randint(5, 120)

        # 4. Construct the Dictionary
        ticket = {
            "ticket_id": f"TKT-{1000 + i}",
            "category": category,
            "priority": priority,
            "status": random.choice(statuses),
            "resolution_minutes": resolution_min
        }
        
        tickets.append(ticket)
        
    return tickets

# --- Execution ---

# Generate the list
ticket_data = generate_support_tickets(250)

# --- Validation ---

print(f"Total Tickets Generated: {len(ticket_data)}\n")
print("--- First 5 Entries ---")
for t in ticket_data[:5]:
    print(t)

Total Tickets Generated: 250

--- First 5 Entries ---
{'ticket_id': 'TKT-1001', 'category': 'Billing Inquiry', 'priority': 'Medium', 'status': 'In Progress', 'resolution_minutes': 72}
{'ticket_id': 'TKT-1002', 'category': 'Login Issue', 'priority': 'Low', 'status': 'In Progress', 'resolution_minutes': 55}
{'ticket_id': 'TKT-1003', 'category': 'System Crash', 'priority': 'Low', 'status': 'Open', 'resolution_minutes': 106}
{'ticket_id': 'TKT-1004', 'category': 'Login Issue', 'priority': 'Medium', 'status': 'Resolved', 'resolution_minutes': 82}
{'ticket_id': 'TKT-1005', 'category': 'Login Issue', 'priority': 'High', 'status': 'In Progress', 'resolution_minutes': 51}


In [6]:
def validate_keys(tickets, required_keys):
    """
    Checks if all tickets contain the specified keys.
    Returns a list of indices where keys are missing.
    """
    missing_key_indices = []
    
    for index, ticket in enumerate(tickets):
        # Check if any required key is not in the ticket's keys
        # We use set logic for efficiency: is the required set a subset of ticket keys?
        if not set(required_keys).issubset(ticket.keys()):
            missing_key_indices.append(index)
            
    return missing_key_indices

def find_invalid_resolutions(tickets):
    """
    Identifies tickets with non-integer resolution_minutes.
    Returns a list of dictionaries containing the index and the invalid value.
    """
    invalid_records = []
    
    for index, ticket in enumerate(tickets):
        val = ticket.get('resolution_minutes')
        
        # Logic: Valid if it IS an integer.
        # Invalid if it is None OR not an instance of int.
        # Note: boolean True/False are technically ints in Python, so we strictly check type(val) is int
        if val is None or not isinstance(val, int):
            invalid_records.append({
                "index": index,
                "ticket_id": ticket.get('ticket_id', 'UNKNOWN'),
                "invalid_value": val,
                "issue_type": "Missing" if val is None else "Wrong Type"
            })
            
    return invalid_records

# --- Execution & Explanation ---

# 1. Define what keys we expect every ticket to have
expected_keys = {'ticket_id', 'category', 'priority', 'status', 'resolution_minutes'}

# 2. Run Key Validation
missing_indices = validate_keys(ticket_data, expected_keys)

# 3. Run Data Validation
bad_data_report = find_invalid_resolutions(ticket_data)

# --- Validating the Output (Printing results outside the functions) ---

print(f"--- validation Report ---")
print(f"Records missing keys: {len(missing_indices)}")

print(f"\nRecords with invalid resolution times: {len(bad_data_report)}")
if bad_data_report:
    print(f"Sample of invalid entries (First 3):")
    for issue in bad_data_report[:3]:
        print(issue)

--- validation Report ---
Records missing keys: 0

Records with invalid resolution times: 28
Sample of invalid entries (First 3):
{'index': 8, 'ticket_id': 'TKT-1009', 'invalid_value': 'TBD', 'issue_type': 'Wrong Type'}
{'index': 11, 'ticket_id': 'TKT-1012', 'invalid_value': 'TBD', 'issue_type': 'Wrong Type'}
{'index': 19, 'ticket_id': 'TKT-1020', 'invalid_value': 'TBD', 'issue_type': 'Wrong Type'}


In [8]:
#task 3
def clean_ticket_data(raw_tickets):
    """
    Creates a new list of cleaned ticket dictionaries.
    - Normalizes strings (trim, title case).
    - Repairs 'None' resolution times (sets to 0).
    - Drops records with malformed resolution times (strings like 'TBD').
    """
    cleaned_list = []
    
    for ticket in raw_tickets:
        # 1. Create a COPY to avoid mutating the original raw data
        # This is crucial for comparing 'before' vs 'after' later
        clean_ticket = ticket.copy()
        
        # 2. Normalize Category (String Manipulation)
        # Fixes cases like " login Issue " -> "Login Issue"
        if 'category' in clean_ticket:
            clean_ticket['category'] = str(clean_ticket['category']).strip().title()
            
        # 3. Handle Resolution Minutes (Logic)
        res_time = clean_ticket.get('resolution_minutes')
        
        # LOGIC: Check types to decide fate of record
        if res_time is None:
            # REPAIR: Missing values become 0
            clean_ticket['resolution_minutes'] = 0
            cleaned_list.append(clean_ticket)
            
        elif isinstance(res_time, int):
            # KEEP: Valid data stays as is
            cleaned_list.append(clean_ticket)
            
        else:
            # DROP: Malformed strings (e.g., "TBD") are skipped
            continue
            
    return cleaned_list

# --- Execution & Validation ---
# Assuming 'ticket_data' exists from Task 1
cleaned_tickets = clean_ticket_data(ticket_data)

print(f"Original Count: {len(ticket_data)}")
print(f"Cleaned Count:  {len(cleaned_tickets)}")
print(f"Dropped Records: {len(ticket_data) - len(cleaned_tickets)}")

print("\n--- Sample Cleaned Record ---")
print(cleaned_tickets[0])

Original Count: 250
Cleaned Count:  229
Dropped Records: 21

--- Sample Cleaned Record ---
{'ticket_id': 'TKT-1001', 'category': 'Billing Inquiry', 'priority': 'Medium', 'status': 'In Progress', 'resolution_minutes': 72}


In [10]:
#task 4
def get_average_resolution_by_category(tickets):
    """
    Calculates average resolution time per category.
    Returns a dict: { 'Category': Average_Time }
    """
    # Helper dict to store sums: {'Login Issue': [total_time, count]}
    temp_stats = {}
    
    for t in tickets:
        cat = t['category']
        mins = t['resolution_minutes']
        
        if cat not in temp_stats:
            temp_stats[cat] = {'total': 0, 'count': 0}
            
        temp_stats[cat]['total'] += mins
        temp_stats[cat]['count'] += 1
    
    # Compute final averages
    averages = {}
    for cat, data in temp_stats.items():
        if data['count'] > 0:
            averages[cat] = round(data['total'] / data['count'], 2)
        else:
            averages[cat] = 0
            
    return averages

def get_escalation_rates(tickets):
    """
    Calculates the % of tickets marked as 'Critical' priority.
    Returns a dict with global rate and per-category rates.
    """
    category_counts = {} 
    total_tickets = 0
    total_critical = 0
    
    for t in tickets:
        cat = t['category']
        # Convert boolean check to 1 or 0 for easy summing
        is_critical = 1 if t['priority'] == 'Critical' else 0
        
        # Update Global
        total_tickets += 1
        total_critical += is_critical
        
        # Update Category
        if cat not in category_counts:
            category_counts[cat] = {'total': 0, 'critical': 0}
        
        category_counts[cat]['total'] += 1
        category_counts[cat]['critical'] += is_critical

    # Compute Rates
    rates = {
        'overall_rate': 0,
        'by_category': {}
    }
    
    if total_tickets > 0:
        rates['overall_rate'] = round((total_critical / total_tickets) * 100, 2)
    
    for cat, data in category_counts.items():
        if data['total'] > 0:
            pct = (data['critical'] / data['total']) * 100
            rates['by_category'][cat] = round(pct, 2)
        
    return rates

def validate_summaries(cleaned_data):
    """
    Simple sanity check to print the count.
    """
    print(f"[Validation Check] Processing {len(cleaned_data)} valid records.")
    return True

# --- Execution ---
validate_summaries(cleaned_tickets)
avg_times = get_average_resolution_by_category(cleaned_tickets)
esc_rates = get_escalation_rates(cleaned_tickets)

print("\n--- Averages Sample ---")
print(str(avg_times)[:100] + "...") # Print first 100 chars to keep it clean

[Validation Check] Processing 229 valid records.

--- Averages Sample ---
{'Billing Inquiry': 73.22, 'Login Issue': 69.69, 'System Crash': 67.14, 'Feature Request': 53.13}...


In [11]:
#task 5
import json

def generate_final_report(tickets):
    """
    Combines all summaries into one report dictionary.
    """
    report = {
        "meta": {
            "total_records": len(tickets),
            "status": "Success",
            "report_generated": "Standard"
        },
        "averages": get_average_resolution_by_category(tickets),
        "escalations": get_escalation_rates(tickets)
    }
    return report

# 1. Generate the report object
final_report = generate_final_report(cleaned_tickets)

# 2. Print nicely formatted JSON
print("--- FINAL REPORT ---")
print(json.dumps(final_report, indent=4))

# 3. Derive a textual insight
# Find the category with the highest resolution time
worst_category = max(final_report['averages'], key=final_report['averages'].get)
worst_time = final_report['averages'][worst_category]

print("\n--- ANALYST INSIGHT ---")
print(f"Observation: The category '{worst_category}' takes the longest to resolve "
      f"(average {worst_time} minutes). This correlates with our escalation data, "
      f"suggesting we need a dedicated tier-2 team for these specific issues.")

--- FINAL REPORT ---
{
    "meta": {
        "total_records": 229,
        "status": "Success",
        "report_generated": "Standard"
    },
    "averages": {
        "Billing Inquiry": 73.22,
        "Login Issue": 69.69,
        "System Crash": 67.14,
        "Feature Request": 53.13
    },
    "escalations": {
        "overall_rate": 6.99,
        "by_category": {
            "Billing Inquiry": 7.25,
            "Login Issue": 5.56,
            "System Crash": 10.81,
            "Feature Request": 6.67
        }
    }
}

--- ANALYST INSIGHT ---
Observation: The category 'Billing Inquiry' takes the longest to resolve (average 73.22 minutes). This correlates with our escalation data, suggesting we need a dedicated tier-2 team for these specific issues.
