# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw measurements into a DataFrame
dirty_iris = pd.read_csv('dirty_iris.csv')

# i. Count the complete observations and express them as a share of all rows.
# A row is complete when none of its cells is missing.
total_obs_count = len(dirty_iris)
row_is_complete = dirty_iris.notna().all(axis=1)
complete_obs_count = int(row_is_complete.sum())
complete_obs_percentage = 100 * (complete_obs_count / total_obs_count)

print("Number of complete observations:", complete_obs_count)
print("Percentage of complete observations:", complete_obs_percentage)

# ii. Replace all special values with NA
# Text placeholders that stand for a missing measurement become NaN.
dirty_iris.replace({'?': np.nan, 'NA': np.nan}, inplace=True)
# Infinite values are special values as well. pandas does not treat them as
# missing, so without this step they would be handled as ordinary numbers by
# every later comparison and quantile computation.
dirty_iris.replace([np.inf, -np.inf], np.nan, inplace=True)

# iii. Read rules from a separate text file
# One rule per line, with surrounding whitespace removed. Blank lines are
# skipped because they do not describe a rule.
with open('rules.txt', 'r') as file:
    stripped_lines = (line.strip() for line in file)
    rules = [rule for rule in stripped_lines if rule]

# Print the resulting constraint object
print("\nConstraint Object:")
for rule in rules:
    print(rule)

# iv. Determine how often each rule is broken (violated edits)
#
# A rule is read as "<column> should <condition>", for example
# "sepal_length should be at least 0", "petal_length should cannot exceed 30"
# or "sepal_width should be positive".
# NOTE(review): the wording actually used in rules.txt is not visible from this
# script. Rules that cannot be interpreted are collected in `skipped_rules`
# and reported, instead of aborting the run or being dropped without a trace.

# Names accepted by a rule on the "species" column.
VALID_SPECIES = ['setosa', 'versicolor', 'virginica']

# Wordings translated to comparison operators. Each operator is padded with
# spaces when substituted so that it always ends up as a token of its own.
OPERATOR_PHRASES = [
    ('at least', '>='),
    ('cannot exceed', '<='),
    ('greater than', '>'),
    ('more than', '>'),
    ('less than', '<'),
]

# Per operator: the Series comparison that selects the rows VIOLATING the
# rule, i.e. the negation of the rule (a "> t" rule is violated where
# value <= t).
VIOLATION_CHECKS = {'>': 'le', '>=': 'lt', '<': 'ge', '<=': 'gt'}

violated_edits = {}
skipped_rules = []
for rule in rules:
    if not rule.strip():
        continue  # blank line, not a rule

    # partition() copes with lines that lack " should "; unpacking the result
    # of split() raises ValueError on such lines.
    rule_column, separator, rule_condition = rule.partition(' should ')
    column = rule_column.strip().lower()
    if not separator or column not in dirty_iris.columns:
        skipped_rules.append(rule)
        continue

    if column == 'species':
        # Membership check against VALID_SPECIES; the names are not parsed out
        # of the rule text. Missing species values count as violations.
        violated_edits[rule] = (~dirty_iris[column].isin(VALID_SPECIES)).sum()
        continue

    for phrase, symbol in OPERATOR_PHRASES:
        rule_condition = rule_condition.replace(phrase, ' ' + symbol + ' ')
    tokens = rule_condition.split()
    # "be" is filler ("should be at least 0"). It directly follows the
    # separator, so it is dropped as a token instead of being matched as
    # " be " with a leading space.
    if tokens and tokens[0] == 'be':
        tokens = tokens[1:]
    # "should be positive" means "should be > 0".
    if tokens == ['positive']:
        tokens = ['>', '0']

    operator, threshold = None, None
    if len(tokens) == 2:
        operator = tokens[0]
        try:
            threshold = float(tokens[1])
        except ValueError:
            operator = None  # non-numeric value: reported as skipped below
    if operator != '=' and operator not in VIOLATION_CHECKS:
        skipped_rules.append(rule)
        continue

    # Compare numerically even when the column was read as text; entries that
    # are not numbers become NaN. NaN fails every ordered comparison, so
    # missing values never count as violations.
    values = pd.to_numeric(dirty_iris[column], errors='coerce')
    if operator == '=':
        # NOTE(review): an explicit "=" rule is checked as "value must be
        # positive" and its stated value is ignored. This keeps the existing
        # behaviour; confirm whether a real equality test is wanted.
        violates = values.le(0)
    else:
        violates = getattr(values, VIOLATION_CHECKS[operator])(threshold)
    violated_edits[rule] = violates.sum()

# Summarize the result
# TODO: the task also asks for a plot of the violations; none is drawn yet.
print("\nViolated Edits:")
for rule, count in violated_edits.items():
    print(rule, ":", count)
if skipped_rules:
    print("\nRules that could not be interpreted:")
    for rule in skipped_rules:
        print(rule)

# v. Find outliers in sepal length using boxplot and boxplot statistics
sepal_length = dirty_iris['sepal_length']

# Draw the boxplot on an explicitly created 8 x 6 inch figure.
fig, ax = plt.subplots(figsize=(8, 6))
dirty_iris.boxplot(column='sepal_length', ax=ax)
ax.set_title('Boxplot of Sepal Length')
ax.set_ylabel('Sepal Length (cm)')
plt.show()

# Quartiles and interquartile range of the sepal length.
Q1 = sepal_length.quantile(0.25)
Q3 = sepal_length.quantile(0.75)
IQR = Q3 - Q1

# A row is an outlier when its sepal length lies more than 1.5 * IQR below the
# first quartile or above the third quartile.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_outlier = (sepal_length < lower_fence) | (sepal_length > upper_fence)
outliers = dirty_iris[is_outlier]

print("Outliers in Sepal Length:")
print(outliers)
