<a href="https://colab.research.google.com/github/AdityaMali918/Python/blob/main/Pydantic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pydantic
!pip install "pydantic[email]"

Collecting email-validator>=2.0.0 (from pydantic[email])
  Downloading email_validator-2.2.0-py3-none-any.whl.metadata (25 kB)
Collecting dnspython>=2.0.0 (from email-validator>=2.0.0->pydantic[email])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading email_validator-2.2.0-py3-none-any.whl (33 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, email-validator
Successfully installed dnspython-2.7.0 email-validator-2.2.0


In [None]:
from pydantic import BaseModel,EmailStr,AnyUrl,Field
from typing import List,Dict,Optional,Annotated #8. Using Field with Annotated: besides data validation you can also attach metadata (title, description, examples)

class Patient(BaseModel):
    """Patient record used to demonstrate Pydantic field types, constraints and metadata.

    Fields without a default are required. Values are coerced where Pydantic
    allows it (e.g. the string '21' for ``age``), except for fields declared
    with ``strict=True``.
    """

    # 8. Field() inside Annotated carries both validation (max_length) and
    #    schema metadata (title, description, examples).
    name: Annotated[
        str,
        Field(
            max_length=20,
            title="Name of person",
            description="Name of the patient",
            examples=['Amit', 'Om'],
        ),
    ]
    # 5. E-mail validation; needs the extra: pip install "pydantic[email]"
    email: EmailStr
    # 6. URL validation
    linkedin_url: AnyUrl
    # 4./7. A field may have a default (21) together with range constraints.
    age: Annotated[int, Field(gt=18, lt=60)] = 21
    # 9. strict=True turns off type coercion such as str -> float for this field.
    weight: Annotated[float, Field(gt=0, strict=True)]
    # 3. An Optional field should be given an explicit default.
    married: Optional[bool] = None
    # 1. Two-level validation: the value must be a list and every item a str.
    # 7. max_length=5 caps the number of items in the list.
    allergies: Optional[List[str]] = Field(
        default=None, max_length=5, description="Allergy name"
    )
    # Keys and values must both be strings.
    contact_details: Dict[str, str]
# Type coercion: the string '21' is accepted for the int field `age`
# (str -> int); the reverse direction (int -> str) is not coerced.
patient_info = {
    "name": "Aditya",
    "email": "aditya@gmail.com",
    "age": '21',
    "weight": 20,
    "linkedin_url": "http://linked_di.com",
    "allergies": ['pollen', 'dust'],
    "contact_details": {"phone": "745553214"},
}

# 2. Every field is required by default; only fields with a default may be omitted.
patient1 = Patient(**patient_info)

def insert_patient(patient:Patient):
    """Print the patient's name, email, age, marital status and weight, one per line."""
    for attribute in ("name", "email", "age", "married", "weight"):
        print(getattr(patient, attribute))


insert_patient(patient1)

Aditya
aditya@gmail.com
21
None
20.0


Collecting email-validator>=2.0.0 (from pydantic[email])
  Downloading email_validator-2.2.0-py3-none-any.whl.metadata (25 kB)
Collecting dnspython>=2.0.0 (from email-validator>=2.0.0->pydantic[email])
  Downloading dnspython-2.7.0-py3-none-any.whl.metadata (5.8 kB)
Downloading email_validator-2.2.0-py3-none-any.whl (33 kB)
Downloading dnspython-2.7.0-py3-none-any.whl (313 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m313.6/313.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: dnspython, email-validator
Successfully installed dnspython-2.7.0 email-validator-2.2.0


## Field validator & Model Validator

In [None]:
from pydantic import BaseModel,EmailStr,AnyUrl,Field ,field_validator,model_validator
from typing import List,Dict

class Patient(BaseModel):
  """Patient model demonstrating field-level and model-level validators.

  Field validators check or transform a single field (``email``, ``name``,
  ``age``). The model validator runs once on the fully built instance, so it
  can compare several fields with each other.
  """
  name : str
  email : EmailStr
  age : int
  weight : float
  married : bool
  allergies : List[str]
  contact_details : Dict[str,str]

  @field_validator('email')
  @classmethod
  def email_validator(cls,value):
    """Accept only e-mail addresses whose domain is on the allow-list.

    Returns:
      The unchanged e-mail value.

    Raises:
      ValueError: if the text after the last '@' is not an allowed domain.
    """
    # NOTE(review): 'icic.com' may be a typo for 'icici.com' — confirm.
    valid_domain = ['hdfc.com','icic.com']
    domain_name = value.split('@')[-1]
    if domain_name not in valid_domain:
      raise ValueError("Not Valid email")
    return value

  @field_validator('name')
  @classmethod
  def transform_name(cls,value):
    """Transform (rather than validate) the name: store it upper-cased."""
    return value.upper()

  @field_validator('age', mode='after')
  @classmethod
  def validate_age(cls, value):
    """Require 0 < age < 100.

    mode='after' means `value` has already been validated/coerced to int.

    Raises:
      ValueError: if the age is outside the open interval (0, 100).
    """
    if 0 < value < 100:
      return value
    raise ValueError('Age should be in between 0 and 100')

  @model_validator(mode='after')
  def validate_emergency_contact(self):
    """Cross-field rule: patients older than 60 need an 'emergency' contact.

    An 'after' model validator is an instance method: it receives the
    validated model as `self` and must return it.

    Raises:
      ValueError: if age > 60 and contact_details has no 'emergency' key.
    """
    if self.age>60 and 'emergency' not in self.contact_details:
      raise ValueError("Patient older than 60 must have emergency number")

    return self

# age 62 is above 60, so the model must carry an 'emergency' contact entry.
patient_info = {
    "name": "Aditya",
    "email": "aditya@hdfc.com",
    "age": 62,
    "weight": 20.0,
    "married": True,
    "allergies": ['pollen', 'dust'],
    "contact_details": {"phone": "745553214", "emergency": "7849612"},
}
patient1 = Patient(**patient_info)
print(patient1)

name='ADITYA' email='aditya@hdfc.com' age=62 weight=20.0 married=True allergies=['pollen', 'dust'] contact_details={'phone': '745553214', 'emergency': '7849612'}


## Computed Field

In [None]:
from pydantic import BaseModel,EmailStr,AnyUrl,Field ,field_validator,model_validator,computed_field
from typing import List,Dict

class Patient(BaseModel):
  """Patient model with a BMI value derived from weight and height.

  ``calculate_bmi`` is a computed field: it is not supplied by the caller but
  calculated from other fields, and is read like an attribute.
  """
  name : str
  email : EmailStr
  age : int
  weight : float #kg
  # metres; must be positive because BMI divides by height squared
  # (height=0 would otherwise raise ZeroDivisionError when BMI is read).
  height : float = Field(gt=0)
  married : bool
  allergies : List[str]
  contact_details : Dict[str,str]

  @computed_field
  @property
  def calculate_bmi(self) -> float:
    """Body-mass index: weight / height**2, rounded to two decimals."""
    return round(self.weight/self.height**2,2)

def update_patient_data(patient: Patient):
    """Print selected patient fields, the computed BMI and a final 'updated' marker."""
    print(patient.name, patient.age, patient.allergies, patient.married, sep="\n")
    # The computed field is read as an attribute, under the property's own name.
    bmi_value = patient.calculate_bmi
    print('BMI', bmi_value)
    print('updated')

# 'age' is passed as the string '22'; the model declares it as int.
patient_info = {
    'name': 'nitish',
    'email': 'abc@icici.com',
    'age': '22',
    'weight': 75.2,
    'height': 1.72,
    'married': True,
    'allergies': ['pollen', 'dust'],
    'contact_details': {'phone': '2353462', 'emergency': '235236'},
}

patient1 = Patient(**patient_info)
update_patient_data(patient1)

nitish
22
['pollen', 'dust']
True
BMI 25.42
updated


## Nested

In [None]:
from pydantic import BaseModel

class Address(BaseModel):
    """Address model; used as the type of ``Patient.address`` (nested model)."""

    city: str
    state: str
    pin: str  # stored as text, not as a number

class Patient(BaseModel):
    """Patient model whose ``address`` field is itself a Pydantic model."""

    name: str
    gender: str
    age: int
    address: Address  # nested model, validated with Address's own rules

# Build the nested model first, then hand that instance to the parent model.
address_dict = dict(city='gurgaon', state='haryana', pin='122001')
address1 = Address(**address_dict)

patient_dict = dict(name='nitish', gender='male', age=35, address=address1)
patient1 = Patient(**patient_dict)

# Bare last expression so the notebook displays the model's repr.
patient1

Patient(name='nitish', gender='male', age=35, address=Address(city='gurgaon', state='haryana', pin='122001'))

## Serialization

In [None]:
from pydantic import BaseModel

class Address(BaseModel):
    """Address model; nested inside ``Patient`` and serialized along with it."""

    city: str
    state: str
    pin: str  # stored as text, not as a number

class Patient(BaseModel):
    """Patient model used for the serialization (``model_dump``) examples."""

    name: str
    gender: str = 'Male'  # default used when the input omits 'gender'
    age: int
    address: Address  # nested model

address_dict = {'city': 'gurgaon', 'state': 'haryana', 'pin': '122001'}

address1 = Address(**address_dict)

# 'gender' is not supplied here, so the class default ('Male') applies.
patient_dict = {'name': 'nitish', 'age': 35, 'address': address1}

patient1 = Patient(**patient_dict)

# model_dump() converts the model (including the nested Address) to a plain dict.
# include/exclude are given as sets of field names, as documented for model_dump.
temp = patient1.model_dump(include={'name', 'address'})   # include: keep only these fields
temp1 = patient1.model_dump(exclude={'name', 'address'})  # exclude: drop these fields, keep the rest
print(temp)
print(temp1)

{'name': 'nitish', 'address': {'city': 'gurgaon', 'state': 'haryana', 'pin': '122001'}}
{'gender': 'Male', 'age': 35}


# Model building


In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report,accuracy_score
import numpy as np

In [None]:
# Load the insurance dataset.
# NOTE(review): absolute path — presumably the Colab upload directory; adjust when running elsewhere.
df = pd.read_csv('/content/insurance.csv')

In [None]:
# Preview the first rows to check the columns and their values.
df.head()

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
0,67,119.8,1.56,2.92,False,Jaipur,retired,High
1,36,101.1,1.83,34.28,False,Chennai,freelancer,Low
2,39,56.8,1.64,36.64,False,Indore,freelancer,Low
3,22,109.4,1.55,3.34,True,Mumbai,student,Medium
4,69,62.2,1.6,3.94,True,Indore,retired,High


In [None]:
# Look at 5 randomly chosen rows; no random_state is given, so the rows differ between runs.
df.sample(5)

Unnamed: 0,age,weight,height,income_lpa,smoker,city,occupation,insurance_premium_category
44,59,77.0,1.6,50.0,True,Lucknow,private_job,Medium
80,56,95.8,1.67,50.0,False,Jalandhar,unemployed,High
49,23,106.6,1.58,2.29,False,Kota,student,Medium
22,57,106.4,1.83,30.0,False,Chandigarh,government_job,Low
46,42,83.0,1.57,25.57,True,Kolkata,unemployed,High


In [None]:
# List the distinct values of the 'occupation' column.
df['occupation'].unique()

array(['retired', 'freelancer', 'student', 'government_job',
       'business_owner', 'unemployed', 'private_job'], dtype=object)

In [None]:
# Work on a copy so that feature columns added to df_feat do not modify the loaded frame `df`.
df_feat = df.copy()

In [None]:
# Feature 1: BMI = weight / height^2, computed column-wise for every row.
height_squared = df_feat['height'] ** 2
df_feat['bmi'] = df_feat['weight'] / height_squared

In [None]:
# Feature 2: Age Group
def age_group(age):
    """Map an age to a label: 'young' (<25), 'adult' (<45), 'middle_aged' (<60), else 'senior'."""
    # Upper bounds are exclusive and checked in ascending order.
    bands = ((25, "young"), (45, "adult"), (60, "middle_aged"))
    for upper_bound, label in bands:
        if age < upper_bound:
            return label
    return "senior"

In [None]:
# Apply age_group to every value of 'age' and store the label in a new 'age_group' column.
df_feat["age_group"] = df_feat['age'].apply(age_group)

In [None]:
# Feature 3: Lifestyle Risk
def lifestyle_risk(row):
    """Classify a row as 'high', 'medium' or 'low' risk from its 'smoker' and 'bmi' values.

    'high'   - smoker and bmi above 30
    'medium' - smoker, or bmi above 27
    'low'    - everything else
    """
    smokes = row["smoker"]
    body_mass_index = row["bmi"]
    if smokes and body_mass_index > 30:
        return "high"
    if smokes or body_mass_index > 27:
        return "medium"
    return "low"

In [None]:
# axis=1 passes each row to lifestyle_risk, which reads that row's 'smoker' and 'bmi' values.
df_feat["lifestyle_risk"] = df_feat.apply(lifestyle_risk,axis=1)

In [None]:
# City lists used by city_tier(): a city in the first list maps to tier 1, a city in the
# second list to tier 2, and any city in neither list to tier 3.
tier_1_cities = ["Mumbai", "Delhi", "Bangalore", "Chennai", "Kolkata", "Hyderabad", "Pune"]
tier_2_cities = [
    "Jaipur", "Chandigarh", "Indore", "Lucknow", "Patna", "Ranchi", "Visakhapatnam", "Coimbatore",
    "Bhopal", "Nagpur", "Vadodara", "Surat", "Rajkot", "Jodhpur", "Raipur", "Amritsar", "Varanasi",
    "Agra", "Dehradun", "Mysore", "Jabalpur", "Guwahati", "Thiruvananthapuram", "Ludhiana", "Nashik",
    "Allahabad", "Udaipur", "Aurangabad", "Hubli", "Belgaum", "Salem", "Vijayawada", "Tiruchirappalli",
    "Bhavnagar", "Gwalior", "Dhanbad", "Bareilly", "Aligarh", "Gaya", "Kozhikode", "Warangal",
    "Kolhapur", "Bilaspur", "Jalandhar", "Noida", "Guntur", "Asansol", "Siliguri"
]

In [None]:
# Feature 4: City Tier
def city_tier(city):
    """Return 1 if the city is in tier_1_cities, 2 if it is in tier_2_cities, otherwise 3."""
    if city in tier_1_cities:
        return 1
    return 2 if city in tier_2_cities else 3

In [None]:
# Map every city name to its tier number (1, 2 or 3) via city_tier.
df_feat["city_tier"] = df_feat["city"].apply(city_tier)

In [None]:
# Display the frame without the raw columns that the engineered features replace.
# The result is not assigned, so df_feat itself still contains these columns.
df_feat.drop(columns=['age', 'weight', 'height', 'smoker', 'city'])

Unnamed: 0,income_lpa,occupation,insurance_premium_category,bmi,age_group,lifestyle_risk,city_tier
0,2.92000,retired,High,49.227482,senior,medium,2
1,34.28000,freelancer,Low,30.189017,adult,medium,1
2,36.64000,freelancer,Low,21.118382,adult,low,2
3,3.34000,student,Medium,45.535900,young,high,1
4,3.94000,retired,High,24.296875,senior,medium,2
...,...,...,...,...,...,...,...
95,19.64000,business_owner,Low,21.420747,adult,low,2
96,34.01000,private_job,Low,47.984483,adult,medium,1
97,44.86000,freelancer,Low,18.765432,middle_aged,low,1
98,28.30000,business_owner,Low,30.521676,adult,medium,1


In [None]:
# Select features and target: X holds the four engineered features plus income and
# occupation; y is the premium category to predict.
X = df_feat[["bmi", "age_group", "lifestyle_risk", "city_tier", "income_lpa", "occupation"]]
y = df_feat["insurance_premium_category"]

In [None]:
# Display the feature matrix.
X

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
0,49.227482,senior,medium,2,2.92000,retired
1,30.189017,adult,medium,1,34.28000,freelancer
2,21.118382,adult,low,2,36.64000,freelancer
3,45.535900,young,high,1,3.34000,student
4,24.296875,senior,medium,2,3.94000,retired
...,...,...,...,...,...,...
95,21.420747,adult,low,2,19.64000,business_owner
96,47.984483,adult,medium,1,34.01000,private_job
97,18.765432,middle_aged,low,1,44.86000,freelancer
98,30.521676,adult,medium,1,28.30000,business_owner


In [None]:
# Display the target column.
y

Unnamed: 0,insurance_premium_category
0,High
1,Low
2,Low
3,Medium
4,High
...,...
95,Low
96,Low
97,Low
98,Low


In [None]:

# Define categorical and numeric features.
# city_tier holds the integers 1/2/3 but is listed as categorical, so it is one-hot encoded.
categorical_features = ["age_group", "lifestyle_risk", "occupation", "city_tier"]
numeric_features = ["bmi", "income_lpa"]

In [None]:
# Create column transformer for OHE:
# one-hot encode the categorical columns and pass the numeric columns through unchanged.
# handle_unknown="ignore" makes a category that was not seen during fit encode as all
# zeros at predict time instead of raising an error (matters for small training splits
# and for predictions on new data once the model is saved and reloaded).
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

In [None]:
# Create a pipeline with preprocessing and random forest classifier.
# Chaining both steps means fit/predict apply the same preprocessing automatically;
# random_state=42 fixes the forest's randomness between runs.
pipeline = Pipeline(steps=[
    ("preprocessor",preprocessor),
    ("classifier",RandomForestClassifier(random_state=42))
])

In [None]:
# Split data and train model: hold out 20% of the rows for testing (fixed seed for a
# repeatable split), then fit preprocessing + classifier on the training part only.
# NOTE(review): no `stratify` argument is passed, so class proportions may differ between the splits.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)
pipeline.fit(X_train,y_train)

In [None]:
# Predict and evaluate: accuracy is the share of held-out test rows predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.9

In [None]:
# Inspect 5 random rows of the test features; no random_state is given, so the rows differ between runs.
X_test.sample(5)

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
80,34.350461,middle_aged,medium,2,50.0,unemployed
81,31.866055,adult,high,2,22.19,freelancer
52,47.34472,young,medium,2,2.96,student
39,35.643424,middle_aged,high,1,11.99,unemployed
56,42.414152,young,high,1,2.86,student


In [None]:
import pickle

# Serialize the fitted pipeline (preprocessing + classifier) to disk so it can be reloaded later.
pickle_model_path = "model.pkl"
with open(pickle_model_path, "wb") as model_file:
    pickle.dump(pipeline, model_file)

In [None]:
# Display the held-out test features.
X_test

Unnamed: 0,bmi,age_group,lifestyle_risk,city_tier,income_lpa,occupation
80,34.350461,middle_aged,medium,2,50.0,unemployed
84,28.801497,senior,medium,2,0.62,retired
33,21.791064,senior,low,1,1.46,retired
81,31.866055,adult,high,2,22.19,freelancer
93,23.199416,young,low,2,1.28,student
17,31.176471,senior,medium,1,2.23,retired
36,21.713266,senior,low,1,0.53,retired
82,17.874812,adult,low,1,12.96,unemployed
69,21.942857,middle_aged,low,2,6.034487,government_job
65,37.662982,middle_aged,high,2,38.07,unemployed
