In [None]:
import csv

#------------------------------------------------------------
# Maps an actual education label to a broader education group
#------------------------------------------------------------
def edu_to_category(edu):
    """Collapse a detailed education label into one of the coarse groups.

    Returns "Less-HS", "HS-grad+" or "Bachelors+" when `edu` matches one of
    the labels listed for that group, and "Other" for anything else
    (the match is exact, so unstripped or unknown labels fall into "Other").
    """
    # Ordered (group, labels) pairs; the first group containing `edu` wins.
    groups = (
        # Less than a high school education ("Less-HS" maps to itself so an
        # already-grouped value passes through unchanged).
        ("Less-HS", ("Preschool", "1st-4th", "5th-6th", "7th-8th", "9th",
                     "10th", "11th", "12th", "Less-HS")),
        # High school graduates and above, but below a bachelor's degree.
        ("HS-grad+", ("HS-grad", "Some-college", "Assoc-voc", "Assoc-acdm")),
        # Bachelor's degree and above.
        ("Bachelors+", ("Bachelors", "Masters", "Doctorate", "Prof-school")),
    )

    for category, labels in groups:
        if edu in labels:
            return category

    return "Other"

#------------------------------------------------------------
# Estimate Education
#------------------------------------------------------------
def estimate_education(age, workclass, occupation, hours_per_week, income, capital_gain, capital_loss):
    """Guess an education label from the other attributes with ordered rules.

    The rules are tried from top to bottom and the first one that matches
    decides the result, so earlier rules take precedence over later ones.

    Parameters:
        age, hours_per_week, capital_gain, capital_loss: compared numerically.
        workclass, occupation, income: compared against string labels
            (income is checked against ">50K" and "<=50K").

    Returns:
        "Less-HS", "HS-grad" or "Bachelors", or "no" when no rule matches.
    """
    low_income = "<=50K"
    degree_occupations = ("Exec-managerial", "Prof-specialty")
    low_requirement_occupations = (
        "Handlers-cleaners", "Other-service", "Farming-fishing", "Priv-house-serv",
    )
    mid_level_occupations = ("Craft-repair", "Transport-moving", "Sales")

    # Rule 1: someone under 18 is unlikely to have graduated high school.
    # Recorded on its own: 1.18% (384/32561) correct, 11 incorrect.
    if age < 18:
        return "Less-HS"

    # Rule 2: high income in an Exec-managerial occupation -> degree.
    # Recorded: 8.65% (2818/32561) correct, 1009 incorrect.
    elif income == ">50K" and occupation == "Exec-managerial":
        return "Bachelors"

    # Rule 3: very long hours outside the degree-heavy occupations.
    # Recorded: 3.80% (1238/32561) correct, 512 incorrect.
    elif hours_per_week > 55 and not (occupation in degree_occupations):
        return "HS-grad"

    # Rule 4: low hours, low income, in an occupation with low education
    # requirements.
    # Recorded: 3.33% (1085/32561) correct, 633 incorrect.
    elif (
        hours_per_week < 35
        and income == low_income
        and occupation in low_requirement_occupations
    ):
        return "HS-grad"

    # Rule 5: long hours in the private sector in a mid-level occupation.
    # Recorded: 3.89% (1267/32561) correct, 506 incorrect.
    elif (
        hours_per_week > 45
        and workclass == "Private"
        and occupation in mid_level_occupations
    ):
        return "HS-grad"

    # Rule 6: Prof-specialty workers are more likely to hold a degree.
    elif occupation == "Prof-specialty":
        return "Bachelors"

    # Rule 7: older and low-earning -> likely a high school graduate.
    # Recorded: 8.55% (2785/32561) correct, 1692 incorrect.
    elif age >= 40 and income == low_income:
        return "HS-grad"

    # Rule 8: working full-time at 30 or older -> likely a high school
    # graduate. Broad on its own (recorded: 34.29%, 11164/32561 correct,
    # 7492 incorrect), which is why the narrower rules run first.
    elif hours_per_week >= 40 and age >= 30:
        return "HS-grad"

    # Rule 9: any capital gain or loss -> high school graduate, except
    # Exec-managerial workers, who are guessed to hold a degree.
    elif capital_gain > 0 or capital_loss > 0:
        return "Bachelors" if occupation == "Exec-managerial" else "HS-grad"

    # Rule 10: Priv-house-serv workers are more likely to have less than a
    # high school education.
    elif occupation == "Priv-house-serv":
        return "Less-HS"

    # Fallback: no rule matched.
    return "no"


# without fallback Accuracy: 53.83% (17527/32561 correct)
# with fallback HS-grad Accuracy: 67.26% (21900/32561 correct)
    # (accuracy after removing rows with ? Accuracy: 67.55% (20375/30162 correct))
# with fallback Bachelors Accuracy: 56.22% (18305/32561 correct)



# Running tallies for the evaluation, plus the rows selected for inspection.
correct = 0
total = 0
incorrect_predictions = []
#------------------------------------------------------------
# Test database
#------------------------------------------------------------
with open('adult.data', 'r') as data_file:
    # Not used anywhere in this cell; kept in place in case other cells
    # rely on the name being imported here.
    import operator

    for row in csv.reader(data_file):

        # Rows with fewer than 15 columns cannot be indexed below, so skip them.
        if len(row) < 15:
            continue

        # Skip any row that has a "?" in one of its fields.
        if any("?" in cell for cell in row):
            continue

        # Only the columns the rule-based estimator needs are extracted.
        # Numeric columns are converted with int(), see
        # https://www.geeksforgeeks.org/python/convert-string-to-integer-in-python/
        age = int(row[0].strip())
        workclass = row[1].strip()
        occupation = row[6].strip()
        hours_per_week = int(row[12].strip())
        income = row[14].strip()
        capital_gain = int(row[10].strip())
        capital_loss = int(row[11].strip())

        edu_num = int(row[4].strip())
        actual_education = row[3].strip()

        predicted = estimate_education(
            age, workclass, occupation, hours_per_week, income, capital_gain, capital_loss
        )

        # Only rows whose actual education is in the same group as
        # "Bachelors" are counted at all.
        actual_group = edu_to_category(actual_education)
        if actual_group != edu_to_category("Bachelors"):
            continue

        # Adjust this condition to pick which wrong guesses get collected;
        # as written, only wrong guesses in the "Less-HS" group are recorded
        # and every other counted row is tallied as correct.
        predicted_group = edu_to_category(predicted)
        if predicted_group != actual_group and predicted_group == edu_to_category("Less-HS"):
            incorrect_predictions.append({
                "age": age,
                "workclass": workclass,
                "occupation": occupation,
                "hours_per_week": hours_per_week,
                "income": income,
                "capital_gain": capital_gain,
                "capital_loss": capital_loss,
                "edu_level": edu_num,
                "predicted": predicted,
                "actual": actual_education,
            })
        else:
            correct += 1

        total += 1

#------------------------------------------------------------
# Calculate accuracy and list the collected incorrect results,
# highest education level first, so they can be inspected
#------------------------------------------------------------
# If no row was counted (empty file, or every row filtered out), total is 0
# and the division would raise ZeroDivisionError; report 0% in that case.
accuracy = correct / total * 100 if total else 0.0

incorrect_predictions = sorted(
    incorrect_predictions,
    key=lambda entry: entry["edu_level"],
    reverse=True,
)

# Accuracy to 2 decimal places (:.2f), shown as a percentage together with
# the underlying fraction.
print(f"Accuracy: {accuracy:.2f}% ({correct}/{total} correct)")

# One tab-separated line per collected row: the input attributes first, then
# the predicted and actual education labels.
columns = (
    "age", "workclass", "occupation", "hours_per_week",
    "income", "capital_gain", "capital_loss", "edu_level",
)
for item in incorrect_predictions:
    leading = "\t".join(str(item[name]) for name in columns)
    print(f"{leading}\t Predicted: {item['predicted']}\t Actual: {item['actual']}")


print(f"Total Incorrect: {len(incorrect_predictions)}")

Accuracy: 67.80% (5145/7588 correct)
44	Private	Adm-clerical	55	>50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
32	Self-emp-not-inc	Exec-managerial	40	<=50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
53	State-gov	Adm-clerical	50	>50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
41	Private	Sales	50	>50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
33	Private	Sales	60	>50K	99999	0	16	 Predicted: HS-grad	 Actual: Doctorate
51	Private	Craft-repair	60	>50K	4787	0	16	 Predicted: HS-grad	 Actual: Doctorate
47	Self-emp-not-inc	Craft-repair	50	<=50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
72	Local-gov	Exec-managerial	40	<=50K	0	1258	16	 Predicted: HS-grad	 Actual: Doctorate
67	Self-emp-not-inc	Sales	40	>50K	20051	0	16	 Predicted: HS-grad	 Actual: Doctorate
50	Self-emp-not-inc	Sales	64	<=50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
59	Private	Sales	40	<=50K	0	0	16	 Predicted: HS-grad	 Actual: Doctorate
45	Private	Exec-managerial	60	<=50K	0	0	16	 Predicted: HS-grad	 Actual: Doc