### Importing dependencies

In [3]:
import os
from enum import Enum
from typing import List

# Importing instructor: patches the OpenAI completion chat API to add a "response_model" parameter
import instructor

import pandas as pd
from openai import OpenAI
from pydantic import BaseModel, Field
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell, not only the last one,
# so that several objects (e.g. multiple dataframes) can be shown from a single cell
InteractiveShell.ast_node_interactivity = "all"

### Loading file

In [4]:
# Read the raw customer records; the free-text entries live in the "customers" column
source_path = "D:/Bureau/progstuff/ai_data_cleaning/customers.csv"
csv_file = pd.read_csv(source_path)
customers = csv_file["customers"].tolist()

# Show both the raw dataframe and the extracted list of entries
csv_file
customers

Unnamed: 0,customers
0,"SOSA Ernesto current_age: ""24"" type_of_account..."
1,Maria Law non-premium Seattle 42 years old
2,premium Han K. Goodwill 68 premium Miami
3,Santa Monica Hilary Powell non-premium 52 yo
4,Theodora Gilmore premium_account:yes 36Y.O PARIS


['SOSA Ernesto current_age: "24" type_of_account: non_premium current_city: SAN JOSE',
 'Maria Law non-premium Seattle 42 years old',
 'premium Han K. Goodwill 68 premium Miami',
 'Santa Monica Hilary Powell non-premium 52 yo',
 'Theodora Gilmore premium_account:yes 36Y.O PARIS']

### Building model

The model is passed to the completion method through its `response_model` parameter

In [5]:
# Account-type flag used by the `premium` field of Customer.
# NOTE(review): pydantic appears to copy this docstring into the generated JSON schema,
# so it likely reaches the model as prompt text — confirm before rewording it.
class Premium(Enum):
    """Defines the account type of the customer, either premium or non-premium"""
    PREMIUM = 1      # written as 1 when `.value` is exported to a dataframe
    NON_PREMIUM = 0  # written as 0 when `.value` is exported to a dataframe

# Schema of a single extracted customer record.
# Every field is required (`...` as the Field default), so validation fails when one is missing.
# NOTE(review): the docstring and the Field descriptions presumably end up in the schema
# handed to the model via `response_model` — treat them as prompt text, confirm before editing.
class Customer(BaseModel):
    """Represents a customer, including their first name, last name, age, city, and account type"""
    first_name: str = Field(..., description="The first name of the customer")
    last_name: str = Field(..., description="The last name of the customer")
    age: int = Field(..., description="The age of the customer")
    city: str = Field(... , description="The city where the customer lives")
    # Typed with the Premium enum, so only its members are accepted
    premium: Premium = Field(..., description="Account type, either premium or non-premium")

# Top-level response model: wraps all extracted Customer records so that a single
# completion call can return every customer at once.
class CustomerList(BaseModel):
    """A list of customers"""
    customers: List[Customer] = Field(..., description="A list of customers")

### Building function

In [6]:
# Extra formatting rules appended to the prompt built in cleaning().
# The list is interpolated through an f-string, so the model receives its Python repr
# (brackets and quotes included).
instructions = [
    "All values must be standardized by having a capital letter for their first letter and non-capital letters for the rest",
    "Exclude middle name initials"
]

def cleaning(file) -> CustomerList:
    """Extract structured customer records from raw text entries with an LLM.

    Args:
        file: The raw customer entries (the notebook passes a list of strings).
            The value is interpolated into the prompt through an f-string, so
            the model receives its string representation.

    Returns:
        The CustomerList produced for the request. A sized, empty input
        yields an empty CustomerList without calling the API.
    """
    # Nothing to extract: skip the API call instead of prompting with an empty input.
    # Inputs without a length (e.g. generators) are passed through unchanged.
    if hasattr(file, "__len__") and len(file) == 0:
        return CustomerList(customers=[])

    # instructor wraps the OpenAI client so that `response_model` is accepted
    client = instructor.from_openai(OpenAI())
    prompt = f"Extract information from {file} while following these instructions: {instructions}"

    return client.chat.completions.create(
        model="gpt-4-turbo",
        response_model=CustomerList,
        messages=[
            {
                "role": "user",
                "content": prompt,
            },
        ],
    )

### Running function

In [7]:
# Run the extraction once on the loaded entries
result = cleaning(customers)

# Flatten each Customer into a plain record; the account type is stored by its enum value
output_records = []
for customer in result.customers:
    output_records.append({
        "first_name": customer.first_name,
        "last_name": customer.last_name,
        "age": customer.age,
        "city": customer.city,
        "premium": customer.premium.value,
    })
output_df = pd.DataFrame(output_records)

output_df

Unnamed: 0,first_name,last_name,age,city,premium
0,Ernesto,Sosa,24,San Jose,0
1,Maria,Law,42,Seattle,0
2,Han,Goodwill,68,Miami,1
3,Hilary,Powell,52,Santa Monica,0
4,Theodora,Gilmore,36,Paris,1


### Saving dataframe to csv

In [8]:
# Persist the cleaned table with a header row and without the dataframe index
output_path = "D:/Bureau/progstuff/ai_data_cleaning/customers_cleaned.csv"
output_df.to_csv(output_path, index=False, header=True)

# Testing success rate

This section checks that the function consistently returns the correct output

### Building target dataframe

In [None]:
# Expected result of the cleaning step; each test run's output is compared against it
expected_columns = ["first_name", "last_name", "age", "city", "premium"]
expected_rows = [
    ("Ernesto", "Sosa", 24, "San Jose", 0),
    ("Maria", "Law", 42, "Seattle", 0),
    ("Han", "Goodwill", 68, "Miami", 1),
    ("Hilary", "Powell", 52, "Santa Monica", 0),
    ("Theodora", "Gilmore", 36, "Paris", 1),
]

# Transpose the rows into one list per column, keeping the column order above
target_df = pd.DataFrame({
    column: list(values)
    for column, values in zip(expected_columns, zip(*expected_rows))
})

target_df

### Running tests

In [302]:
# Test results are stored in a list as "Pass" or "Fail"
# "Pass" : the function returns a dataframe identical to the target dataframe
# "Fail" : the function returns a dataframe different from the target dataframe
test_results = []
test_values = {}  # Stores the raw output of each run, keyed by run index

n = 10  # Number of tests to run

for i in range(n):
    # Each iteration performs a fresh extraction so that run-to-run variation is captured
    result = cleaning(customers)
    test_values[i] = [result]
    df = pd.DataFrame([{
        "first_name": customer.first_name,
        "last_name": customer.last_name,
        "age": customer.age,
        "city": customer.city,
        "premium": customer.premium.value
        }
        for customer in result.customers])

    # DataFrame.equals already returns a bool, so no comparison against True is needed
    test_results.append("Pass" if df.equals(target_df) else "Fail")

# Show the verdict of every run, followed by the dataframe produced by the last run
test_results
df

['Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass',
 'Pass']

Unnamed: 0,first_name,last_name,age,city,premium
0,Ernesto,Sosa,24,San Jose,0
1,Maria,Law,42,Seattle,0
2,Han,Goodwill,68,Miami,1
3,Hilary,Powell,52,Santa Monica,0
4,Theodora,Gilmore,36,Paris,1


### Test results

In [303]:
# Share of runs that matched the target dataframe, as a percentage.
# An empty result list is reported as 0% instead of raising ZeroDivisionError.
success_rate = test_results.count("Pass") / len(test_results) * 100 if test_results else 0.0
print(f"Success rate: {success_rate:.2f}% \nNumber of tests: {n}")

Success rate: 100.00% 
Number of tests: 10
