In [250]:
import pandas as pd
import numpy as np
import re

# Read the raw technician-performance export into the working DataFrame.
raw_csv_path = '/content/technician_performance_multi_region_messy.csv'
df = pd.read_csv(raw_csv_path)

In [251]:
# ---------------------------------------------------------
# 1. Standardize NULL-like values
# ---------------------------------------------------------
# Spellings that stand for "no value". The capitalised variants are kept so
# that code doing a case-sensitive membership test against this set still works.
null_values = {"", " ", "nan", "none", "n/a", "na", "unknown", "N/A", "None", "Unknown"}

def clean_null(x):
    """Return NaN for missing or placeholder values, otherwise the trimmed text.

    Parameters
    ----------
    x : any scalar
        A single cell value.

    Returns
    -------
    float or str
        ``np.nan`` when ``x`` is already missing or when its trimmed text is
        one of the placeholders in ``null_values``; otherwise ``str(x)`` with
        surrounding whitespace removed.
    """
    if pd.isna(x):
        return np.nan
    x = str(x).strip()
    # Compare case-insensitively so "UNKNOWN", "NaN", "NA", "N/a" ... are
    # caught too, not only the exact spellings listed in null_values.
    return np.nan if x.lower() in null_values else x

# Pass every cell of every column through clean_null.
for column_name in df.columns:
    df[column_name] = df[column_name].map(clean_null)

In [252]:
# ---------------------------------------------------------
# 2. Trim whitespace / remove extra spaces
# ---------------------------------------------------------
# Strip surrounding whitespace from text cells only.
# Casting whole columns with astype(str) would turn every NaN into the literal
# string "nan", which pd.isna() no longer recognises as missing; non-string
# cells (including NaN) are therefore passed through untouched.
for col in df.columns:
    df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

In [253]:
# ---------------------------------------------------------
# 3. Fix inconsistent values in each column
# ---------------------------------------------------------
# Normalizing tech names
def normalize_tech(name):
    """Collapse spelling variants of a technician name onto one canonical name.

    The input has punctuation removed, is trimmed and title-cased. It is then
    compared, lower-cased and with spaces removed, against each known variant
    using substring matching; the first variant found decides the result.

    Parameters
    ----------
    name : str or missing
        Raw technician name.

    Returns
    -------
    str or float
        The canonical name for a recognised variant, the cleaned title-cased
        input when nothing matches, or ``np.nan`` for a missing input.
    """
    if pd.isna(name):
        return np.nan

    cleaned = re.sub(r"[^\w\s]", "", name).strip().title()

    # Ordered (variant, canonical) pairs - order matters because the first hit wins.
    variants = (
        ('John S.', 'John Smith'),
        ('J Smith', 'John Smith'),
        ('John S', 'John Smith'),
        ('J. Smith', 'John Smith'),
        ('Jess T.', 'Jessica Taylor'),
        ('J. Taylor', 'Jessica Taylor'),
        ('J Taylor', 'Jessica Taylor'),
        ('Jess T', 'Jessica Taylor'),
        ('Jessica T', 'Jessica Taylor'),
        ('Maria Gomez', 'Maria Gomez'),
        ('Maria G', 'Maria Gomez'),
        ('Mari Gomez', 'Maria Gomez'),
        ('M Gomez', 'Maria Gomez'),
        ('M. Gomez', 'Maria Gomez'),
        ('Liam P.', 'Liam Peterson'),
        ('Liam P', 'Liam Peterson'),
        ('L P', 'Liam Peterson'),
    )

    squashed = cleaned.lower().replace(" ", "")
    for variant, canonical in variants:
        if variant.lower().replace(" ", "") in squashed:
            return canonical

    return cleaned

# Canonicalise every technician name.
df["tech_name"] = df["tech_name"].map(normalize_tech)
#df

# Normalizing city names
def normalize_city(city):
    """Map a raw city label onto its canonical spelling.

    Punctuation is stripped, the text is trimmed and title-cased, and the
    space-free lower-case form is searched for each known city fragment.

    Parameters
    ----------
    city : str or missing
        Raw city value.

    Returns
    -------
    str or float
        The canonical city for the first fragment found, the cleaned
        title-cased input when none is found, or ``np.nan`` for missing input.
    """
    if pd.isna(city):
        return np.nan

    tidy = re.sub(r"[^\w\s]", "", city).strip().title()

    # Fragment -> canonical city; 'huntersvill' also catches the full spelling.
    fragments = {
        'concord': 'Concord',
        'charlotte': 'Charlotte',
        'huntersvill': 'Huntersville',
    }

    haystack = tidy.lower().replace(" ", "")
    found = next(
        (canonical for fragment, canonical in fragments.items()
         if fragment.lower().replace(" ", "") in haystack),
        None,
    )
    return tidy if found is None else found

# Canonicalise every city value.
df["city"] = df["city"].map(normalize_city)
#df

# Normalizing region names
def normalize_region(region):
    """Map a raw region label onto 'east', 'south', 'west' or 'north'.

    Punctuation is removed, the text is trimmed and title-cased, and its
    space-free lower-case form is searched for each known spelling
    (substring match, first hit wins).

    Parameters
    ----------
    region : str or missing
        Raw region value.

    Returns
    -------
    str or float
        The lower-case canonical region for a recognised spelling, the cleaned
        title-cased input when nothing matches, or ``np.nan`` for missing input.
    """
    if pd.isna(region):
        return np.nan
    region = re.sub(r"[^\w\s]", "", region)  # remove punctuation
    region = region.strip().title()

    # Map variations to region names
    mapping = {
        'east':'east',
        'East':'east',
        'Suth':'south',
        'South':'south',
        'WEST':'west',
        'West':'west',
        'Noth':'north',
        'North':'north'
    }

    compact = region.lower().replace(" ", "")
    for key, canonical in mapping.items():
        if key.lower().replace(" ", "") in compact:
            return canonical

    # Fall back to the cleaned region itself. The previous fallback returned
    # `city`, a name that is not defined inside this function.
    return region

# Canonicalise every region value.
df["region"] = df["region"].map(normalize_region)
#df

# Normalizing job_type names
def normalize_job(job_type):
    """Map a raw job-type label onto 'install', 'repair' or 'maintenance'.

    Punctuation is removed, the text is trimmed and title-cased, and its
    space-free lower-case form is searched for each known fragment
    (substring match, first hit wins).

    Parameters
    ----------
    job_type : str or missing
        Raw job-type value.

    Returns
    -------
    str or float
        The lower-case canonical job type for a recognised spelling, the
        cleaned title-cased input when nothing matches, or ``np.nan`` for
        missing input.
    """
    if pd.isna(job_type):
        return np.nan
    job_type = re.sub(r"[^\w\s]", "", job_type)  # remove punctuation
    job_type = job_type.strip().title()

    # Ordered (fragment, canonical) pairs. Fragments contain no punctuation
    # because the input has already had its punctuation stripped.
    # 'instal' also matches 'install'; 'maint' also matches 'maintenance'.
    # The canonical label is spelled 'maintenance' (not 'maintenence') so it
    # agrees with the 'maintenance' keys used when job types are looked up
    # later in the notebook.
    mapping = (
        ('instal', 'install'),
        ('reapir', 'repair'),
        ('repair', 'repair'),
        ('maint', 'maintenance'),
    )

    compact = job_type.lower().replace(" ", "")
    for fragment, canonical in mapping:
        if fragment in compact:
            return canonical

    return job_type

# Canonicalise every job type.
df["job_type"] = df["job_type"].map(normalize_job)
#df

# Normalizing scheduled_time
def normalize_time(scheduled_time):
    """Rewrite a known messy time spelling as a clean 12-hour string.

    Punctuation (including the colon) is removed, the text is trimmed and
    title-cased, and its space-free lower-case form is searched for each known
    spelling (substring match, first hit wins).

    Parameters
    ----------
    scheduled_time : str or missing
        Raw time value.

    Returns
    -------
    str or float
        A string such as ``'2:22pm'`` for a recognised spelling, the cleaned
        (punctuation-free, title-cased) input when nothing matches, or
        ``np.nan`` for missing input.
    """
    if pd.isna(scheduled_time):
        return np.nan
    scheduled_time = re.sub(r"[^\w\s]", "", scheduled_time)  # remove punctuation
    scheduled_time = scheduled_time.strip().title()

    # Ordered (spelling, canonical) pairs. Spellings are written without a
    # colon because the colon has already been stripped from the input, so a
    # key such as '14:22' could never match. '950Am' is included so that
    # "9:50 AM" is recognised here as well.
    mapping = (
        ('900Am', '9:00am'),
        ('1422', '2:22pm'),
        ('123 pm', '12:30pm'),
        ('750Am', '7:50am'),
        ('0750', '7:50am'),
        ('095 AM', '9:50am'),
        ('1640', '4:40pm'),
        ('815 AM', '8:15am'),
        ('1115 pm', '1:15pm'),
        ('115 Pm', '1:15pm'),
        ('1230', '12:30pm'),
        ('222Pm', '2:22pm'),
        ('0950Am', '9:50am'),
        ('440Pm', '4:40pm'),
        ('950Am', '9:50am'),
    )

    compact = scheduled_time.lower().replace(" ", "")
    for spelling, canonical in mapping:
        if spelling.lower().replace(" ", "") in compact:
            return canonical

    return scheduled_time

# Canonicalise every scheduled time.
df["scheduled_time"] = df["scheduled_time"].map(normalize_time)
#df

# Normalizing arrival_time
def normalize_time(arrival_time):
    """Rewrite a known messy arrival-time spelling as a clean 12-hour string.

    The input has punctuation removed (so any colon disappears), is trimmed
    and title-cased; its space-free lower-case form is then searched for each
    known spelling, and the first spelling found decides the result.

    Parameters
    ----------
    arrival_time : str or missing
        Raw time value.

    Returns
    -------
    str or float
        The canonical time for a recognised spelling, the cleaned input when
        nothing matches, or ``np.nan`` for missing input.
    """
    if pd.isna(arrival_time):
        return np.nan

    without_punct = re.sub(r"[^\w\s]", "", arrival_time)
    tidy = without_punct.strip().title()

    # Known spellings -> canonical time (insertion order is the match order).
    known_times = {
        '14:22': '2:22pm',
        '12:3 pm': '12:30pm',
        '07:50': '7:50am',
        '09:5 AM': '9:50am',
        '16:40': '4:40pm',
        '8:15 AM': '8:15am',
        '1:15 pm': '1:15pm',
        '900Am': '9:00am',
        '1422': '2:22pm',
        '123 pm': '12:30pm',
        '0750': '7:50am',
        '095 AM': '9:50am',
        '1640': '4:40pm',
        '815 AM': '8:15am',
        '1115 pm': '1:15pm',
        '115 Pm': '1:15pm',
        '1230': '12:30pm',
        '222Pm': '2:22pm',
        '0950Am': '9:50am',
        '440Pm': '4:40pm',
        '950Am': '9:50am',
        '750Am': '7:50am',
    }

    needle = tidy.lower().replace(" ", "")
    hit = next(
        (clock for raw, clock in known_times.items()
         if raw.lower().replace(" ", "") in needle),
        None,
    )
    return tidy if hit is None else hit

# Canonicalise every arrival time.
df["arrival_time"] = df["arrival_time"].map(normalize_time)
#df

# Normalizing success column
def normalize_success(success):
    """Normalise a yes/no flag to 'Yes' or 'No'.

    Punctuation is removed and the text is trimmed and title-cased. The
    space-free lower-case form must then equal one of 'yes', 'y', 'no' or 'n'.
    Matching is exact: a substring test on the single letters 'n' / 'y' would
    classify unrelated text such as "nan", "Unknown" or "Pending" as 'No'.

    Parameters
    ----------
    success : str or missing
        Raw flag value.

    Returns
    -------
    str or float
        'Yes' or 'No' for a recognised flag; ``np.nan`` for missing input, for
        an empty string, or for the literal text "nan"; otherwise the cleaned
        title-cased input.
    """
    if pd.isna(success):
        return np.nan
    success = re.sub(r"[^\w\s]", "", success)  # remove punctuation
    success = success.strip().title()

    token = success.lower().replace(" ", "")
    # A stringified NaN or an empty cell carries no information.
    if token in ("", "nan"):
        return np.nan

    # Map variations of the flag to its canonical spelling
    mapping = {'yes': 'Yes', 'y': 'Yes', 'no': 'No', 'n': 'No'}
    return mapping.get(token, success)

# Canonicalise every success flag.
df["success"] = df["success"].map(normalize_success)

# Normalizing repeat_visit column
def normalize_repeat_visit(repeat_visit):
    """Normalise a repeat-visit flag to 'Yes' or 'No'.

    Punctuation is removed and the text is trimmed and title-cased. The
    space-free lower-case form must then equal one of 'yes', 'y', 'no' or 'n'.
    Matching is exact: a substring test on the single letters 'n' / 'y' would
    classify unrelated text such as "nan", "Unknown" or "Pending" as 'No'.

    Parameters
    ----------
    repeat_visit : str or missing
        Raw flag value.

    Returns
    -------
    str or float
        'Yes' or 'No' for a recognised flag; ``np.nan`` for missing input, for
        an empty string, or for the literal text "nan"; otherwise the cleaned
        title-cased input.
    """
    if pd.isna(repeat_visit):
        return np.nan
    repeat_visit = re.sub(r"[^\w\s]", "", repeat_visit)  # remove punctuation
    repeat_visit = repeat_visit.strip().title()

    token = repeat_visit.lower().replace(" ", "")
    # A stringified NaN or an empty cell carries no information.
    if token in ("", "nan"):
        return np.nan

    # Map variations to repeat_visit
    mapping = {'yes': 'Yes', 'y': 'Yes', 'no': 'No', 'n': 'No'}
    return mapping.get(token, repeat_visit)

# Canonicalise every repeat-visit flag.
df["repeat_visit"] = df["repeat_visit"].map(normalize_repeat_visit)

In [254]:
# ---------------------------------------------------------
# 4. Fix date formats (including invalid ones)
# ---------------------------------------------------------
def parse_date(val):
    """Parse one date value, repairing year-day-month strings where possible.

    Parameters
    ----------
    val : any scalar
        Raw date value.

    Returns
    -------
    pandas.Timestamp, NaT or float
        The parsed timestamp; ``np.nan`` for missing input; ``NaT`` when the
        value cannot be parsed even after the repair attempt.
    """
    if pd.isna(val):
        return np.nan

    try:
        return pd.to_datetime(val, errors="raise", dayfirst=False)
    except Exception:
        # Parsing failed - fall through to the repair attempt. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate.
        pass

    # Attempt to fix invalid date like "2024-13-10": when the middle field
    # cannot be a month but the last one can, treat the string as YYYY-DD-MM.
    match = re.match(r"(\d{4})-(\d{2})-(\d{2})", str(val))
    if match:
        y, m, d = match.groups()
        if int(m) > 12 and int(d) <= 12:
            return pd.to_datetime(f"{y}-{d}-{m}", errors="coerce")

    return pd.to_datetime(val, errors="coerce")

# Parse every scheduled date.
df["scheduled_date"] = df["scheduled_date"].map(parse_date)
#df

In [255]:
# ---------------------------------------------------------
# 5. Parse and standardize time columns
# ---------------------------------------------------------
def parse_time(t):
    """Parse one time value into a ``datetime.time``.

    Parameters
    ----------
    t : any scalar
        Raw time value.

    Returns
    -------
    datetime.time or float
        The time-of-day of the parsed value, or ``np.nan`` when the input is
        missing or cannot be parsed.
    """
    if pd.isna(t):
        return np.nan
    parsed = pd.to_datetime(t, errors="coerce")
    # errors="coerce" yields NaT for unparseable input; NaT cannot be asked
    # for its .time(), so report it as missing instead.
    if pd.isna(parsed):
        return np.nan
    return parsed.time()

# Parse both cleaned time columns into datetime columns suffixed with "_dt";
# values that cannot be parsed become NaT.
for time_col in ("scheduled_time", "arrival_time"):
    df[time_col + "_dt"] = pd.to_datetime(df[time_col], errors="coerce")


  df["scheduled_time_dt"] = pd.to_datetime(df["scheduled_time"], errors="coerce")
  df["arrival_time_dt"] = pd.to_datetime(df["arrival_time"], errors="coerce")


In [256]:
# ---------------------------------------------------------
# 6. Standardize city names
# ---------------------------------------------------------
# Collapse double spaces, then title-case each city name.
city_single_spaced = df["city"].str.replace("  ", " ")
df["city"] = city_single_spaced.str.title()

In [257]:
# ---------------------------------------------------------
# 7. Clean and standardize parts_used
# ---------------------------------------------------------

def clean_parts(p):
    """Split a raw parts string into a list of title-cased part names.

    Quote characters are removed, the text is split on ';' or ',', each piece
    is trimmed, and pieces found in ``null_values`` are discarded.

    Parameters
    ----------
    p : str or missing
        Raw parts_used value.

    Returns
    -------
    list of str
        Cleaned part names; an empty list for missing input.
    """
    if pd.isna(p):
        return []

    unquoted = p.replace('"', "").replace("'", "")

    cleaned = []
    for piece in re.split(r"[;,]", unquoted):
        piece = piece.strip()
        if piece in null_values:
            continue
        cleaned.append(piece.title())
    return cleaned

# Build the list-valued parts column from the raw parts_used text.
df["parts_list"] = df["parts_used"].map(clean_parts)

In [258]:
#---------------------------------------------------
# 8. Last fixes
#---------------------------------------------------

# Drop the raw text columns that have been replaced by parsed/cleaned versions
df = df.drop(columns=["scheduled_time", "arrival_time","parts_used"])

# Map Yes → True, No → False (any other value is left as it is)
cols = ["success", "repeat_visit"]
df[cols] = df[cols].replace({"Yes": True, "No": False})

# Convert duration to float. errors="coerce" turns any leftover non-numeric
# text into NaN instead of aborting the whole cell with a ValueError.
df['duration_minutes'] = pd.to_numeric(df['duration_minutes'], errors="coerce").astype(float)

# Fill null values with 0 in duration_minutes column
df["duration_minutes"] = df["duration_minutes"].fillna(0)

# Rename column
df = df.rename(columns={"parts_list": "parts_used"})

# Fill in the technician name where it is missing: either a real NaN or the
# text 'Nan' (what a stringified NaN becomes after title-casing).
missing_tech = df['tech_name'].isna() | (df['tech_name'] == 'Nan')
df.loc[missing_tech, 'tech_name'] = 'Sophia Martinez'



  df[cols] = df[cols].replace({"Yes": True, "No": False})


In [259]:
#-------------------------------------------------------------------------------------
# The next few cells were written with help from ChatGPT to make the dataset more realistic
# ------------------------------------------------------------------------------------


import numpy as np

# Seed NumPy's global random generator once, before the first random draw, so
# that the simulated columns produced from here on are identical on every
# "Restart & Run All".
np.random.seed(42)

# Define target success rates
success_rates = {
    'install': 0.80,
    'maintenance': 0.90,
    'repair': 0.85
}

# Update success column probabilistically by job_type.
# NOTE(review): the keys must equal the job_type labels in df exactly; rows
# whose job_type is not one of these keys keep their existing success value.
for job_type, rate in success_rates.items():
    mask = df['job_type'] == job_type
    df.loc[mask, 'success'] = np.random.rand(mask.sum()) < rate


In [260]:
# Derive repeat_visit from the outcome: start from "no repeat", then draw
# failures with probability 0.7 and successes with probability 0.10.
df['repeat_visit'] = False

failed_rows = df['success'] == False
df.loc[failed_rows, 'repeat_visit'] = np.random.rand(failed_rows.sum()) < 0.7  # 60–80%

succeeded_rows = df['success'] == True
df.loc[succeeded_rows, 'repeat_visit'] = np.random.rand(succeeded_rows.sum()) < 0.10  # 5–15%

# A row without a repeat visit cannot have a repeat reason
no_repeat_rows = df['repeat_visit'] == False
df.loc[no_repeat_rows, 'repeat_reason'] = np.nan


In [261]:
# Inclusive (low, high) duration bounds in minutes for each job type
duration_ranges = {
    'install': (45, 180),
    'maintenance': (30, 90),
    'repair': (20, 120)
}

# Draw a whole-minute duration for every row of each listed job type.
for job_type, bounds in duration_ranges.items():
    low, high = bounds
    mask = df['job_type'] == job_type
    # randint's upper bound is exclusive, hence high + 1
    df.loc[mask, 'duration_minutes'] = np.random.randint(low, high + 1, mask.sum())


In [262]:
# Randomly reassign every row to one of the five technicians.
# The third technician is 'Sophia Martinez': 'Nan' is only the text a missing
# name turns into, and the cleaning step replaces it with 'Sophia Martinez'.
techs = ['Jessica Taylor', 'John Smith', 'Sophia Martinez', 'Liam Peterson', 'Maria Gomez']
df['tech_name'] = np.random.choice(techs, size=len(df))

# Assign performance tiers
performance = {
    'Jessica Taylor': {'success': 0.95, 'repeat_prob_fail': 0.3},  # Strong tech
    'John Smith': {'success': 0.60, 'repeat_prob_fail': 0.9},  # Weak tech
    'Sophia Martinez': {'success': 0.85, 'repeat_prob_fail': 0.5},  # Average
    'Liam Peterson': {'success': 0.85, 'repeat_prob_fail': 0.5},
    'Maria Gomez': {'success': 0.85, 'repeat_prob_fail': 0.5}
}

# Update success and repeat_visit based on tech.
# NOTE(review): only failed rows get a fresh repeat_visit draw here; rows that
# are now successful keep whatever repeat_visit they had before this cell, and
# repeat_reason is not touched - confirm that this is intended.
for tech, params in performance.items():
    mask = df['tech_name'] == tech
    df.loc[mask, 'success'] = np.random.rand(mask.sum()) < params['success']
    fail_mask = mask & (df['success'] == False)
    df.loc[fail_mask, 'repeat_visit'] = np.random.rand(fail_mask.sum()) < params['repeat_prob_fail']


In [263]:
# Make sure both timestamp columns hold datetimes
for stamp_col in ('scheduled_time_dt', 'arrival_time_dt'):
    df[stamp_col] = pd.to_datetime(df[stamp_col])

# Arrival = scheduled time plus a normally distributed delay in minutes
# (mean +7, std 15), limited to the range -60..+60.
delay_minutes = np.clip(np.random.normal(loc=7, scale=15, size=len(df)), -60, 60)
df['arrival_time_dt'] = df['scheduled_time_dt'] + pd.to_timedelta(delay_minutes, unit='m')


In [264]:
# Pool of customer IDs already present in df['customer_id'] (missing values excluded)
known_customer_ids = df['customer_id'].dropna().unique()

# Overwrite customer_id on roughly 85% of rows with an ID drawn from that pool
reassign_rows = np.random.rand(len(df)) < 0.85
df.loc[reassign_rows, 'customer_id'] = np.random.choice(known_customer_ids, size=reassign_rows.sum())

# Optional: leave some NaNs for first-time customers


In [265]:
# ---------------------------------------------------------
# Save cleaned output
# ---------------------------------------------------------
# Write the cleaned frame to disk without the index column, then confirm.
cleaned_csv_path = "tech_performance_cleaned.csv"
df.to_csv(cleaned_csv_path, index=False)

print("Cleaning complete! Saved as tech_performance_cleaned.csv")

Cleaning complete! Saved as tech_performance_cleaned.csv
