# Milestone 3

Nama  : Akbar Fitriawan
Batch : FTDS-15-HCK

Program ini dibuat untuk melakukan validasi data menggunakan Great Expectations terhadap data yang sudah dibersihkan (data clean) melalui pipeline Airflow.

---

# Import libraries

In [18]:
import pandas as pd
import great_expectations as ge
from great_expectations.dataset.pandas_dataset import PandasDataset

# Load Data

In [19]:
# Read the cleaned CSV from the dags/ folder and preview the first five rows
clean_csv_path = 'dags/P2M3_akbar_fitriawan_data_clean.csv'
df = pd.read_csv(clean_csv_path)
df.head()

Unnamed: 0,address,rent,beds,baths,type,area_in_sqft,rent_per_sqft,rent_category,furnishing,posted_date,age_of_listing_in_days,location,city,latitude,longitude,posted_month,posted_year
0,"The Gate Tower 2, The Gate Tower, Shams Gate D...",124000,3,4,Apartment,1785,69.467787,Medium,Unfurnished,2024-03-07,45,Al Reem Island,Abu Dhabi,24.493598,54.407841,3,2024
1,"Water's Edge, Yas Island, Abu Dhabi",140000,3,4,Apartment,1422,98.452883,Medium,Unfurnished,2024-03-08,44,Yas Island,Abu Dhabi,24.494022,54.607372,3,2024
2,"Al Raha Lofts, Al Raha Beach, Abu Dhabi",99000,2,3,Apartment,1314,75.342466,Medium,Furnished,2024-03-21,31,Al Raha Beach,Abu Dhabi,24.485931,54.600939,3,2024
3,"Marina Heights, Marina Square, Al Reem Island,...",220000,3,4,Penthouse,3843,57.246942,High,Unfurnished,2024-02-24,57,Al Reem Island,Abu Dhabi,24.493598,54.407841,2,2024
4,"West Yas, Yas Island, Abu Dhabi",350000,5,7,Villa,6860,51.020408,High,Unfurnished,2024-02-16,65,Yas Island,Abu Dhabi,24.494022,54.607372,2,2024


In [77]:
def generate_unique_id(row):
    """Build an identifier string from a row's label, address, and city.

    The result has the form
    ``<row label>_0<first two characters of address><first two characters
    of city, upper-cased>``, for example ``0_0ThAB``.

    Parameters
    ----------
    row : object with a ``name`` attribute and ``"address"`` / ``"city"`` keys
        When called through ``DataFrame.apply(..., axis=1)`` this is a
        ``pandas.Series`` whose ``name`` is the row's index label.

    Returns
    -------
    str
        The generated identifier.
    """
    row_label = row.name
    address_prefix = row["address"][:2]
    city_prefix = row["city"][:2].upper()
    return f'{row_label}_0{address_prefix}{city_prefix}'

# Evaluate generate_unique_id once per row and store the result in a new
# 'unique_id' column
df['unique_id'] = df.apply(func=generate_unique_id, axis='columns')

# Show the first few generated IDs as a quick sanity check
print(df['unique_id'].head())

0    0_0ThAB
1    1_0WaAB
2    2_0AlAB
3    3_0MaAB
4    4_0WeAB
Name: unique_id, dtype: object


In [78]:
# Print a summary of the DataFrame: column names, non-null counts, and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73727 entries, 0 to 73726
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   address                 73727 non-null  object 
 1   rent                    73727 non-null  int64  
 2   beds                    73727 non-null  int64  
 3   baths                   73727 non-null  int64  
 4   type                    73727 non-null  object 
 5   area_in_sqft            73727 non-null  int64  
 6   rent_per_sqft           73727 non-null  float64
 7   rent_category           73727 non-null  object 
 8   furnishing              73727 non-null  object 
 9   posted_date             73727 non-null  object 
 10  age_of_listing_in_days  73727 non-null  int64  
 11  location                73727 non-null  object 
 12  city                    73727 non-null  object 
 13  latitude                73727 non-null  float64
 14  longitude               73727 non-null

In [80]:
# Wrap the DataFrame in a Great Expectations PandasDataset so the expect_*
# validation methods can be called on it
ge_df = PandasDataset(df)

# Great expectations

Skema ekspektasi yang divalidasi adalah sebagai berikut:

- unique_id harus unik
- rent tidak boleh bernilai negatif
- beds bernilai antara 0 sampai 12
- baths bernilai antara 1 sampai 12
- furnishing hanya memiliki dua kategori (Unfurnished, Furnished)
- rent_category hanya memiliki tiga kategori (Low, Medium, High)
- latitude dan longitude harus bertipe data float
- age_of_listing_in_days harus bertipe data int64


In [82]:
# 1. Every value in 'unique_id' must be distinct (no duplicates)
expectation_unique_id = ge_df.expect_column_values_to_be_unique(
    column='unique_id',
)
print(
    "Expectation 1 (unique id): ",
    expectation_unique_id["success"],
)

Expectation 1 (unique id):  True


In [83]:
# 2. 'rent' must be non-negative: the column minimum has to be at least 0
#    (max_value=None leaves the upper bound open)
expectation_rent_non_negative = ge_df.expect_column_min_to_be_between(
    column='rent',
    min_value=0,
    max_value=None,
)
print(
    "Expectation 2 (rent non-negative): ",
    expectation_rent_non_negative["success"],
)

Expectation 2 (rent non-negative):  True


In [84]:
# 3. Every 'beds' value must lie between 0 and 12
expectation_beds_range = ge_df.expect_column_values_to_be_between(
    column='beds',
    min_value=0,
    max_value=12,
)
print(
    "Expectation 3 (beds range): ",
    expectation_beds_range["success"],
)

Expectation 3 (beds range):  True


In [85]:
# 4. Every 'baths' value must lie between 1 and 12
expectation_baths_range = ge_df.expect_column_values_to_be_between(
    column='baths',
    min_value=1,
    max_value=12,
)
print(
    "Expectation 4 (baths range): ",
    expectation_baths_range["success"],
)

Expectation 4 (baths range):  True


In [86]:
# 5. The distinct values of 'furnishing' must all come from the two allowed
#    categories
expectation_furnishing_categories = ge_df.expect_column_distinct_values_to_be_in_set(
    column='furnishing',
    value_set=['Unfurnished', 'Furnished'],
)
print(
    "Expectation 5 (furnishing categories): ",
    expectation_furnishing_categories["success"],
)

Expectation 5 (furnishing categories):  True


In [87]:
# 6. The distinct values of 'rent_category' must all come from the three
#    allowed categories
expectation_rent_category_categories = ge_df.expect_column_distinct_values_to_be_in_set(
    column='rent_category',
    value_set=['Low', 'Medium', 'High'],
)
print(
    "Expectation 6 (rent_category categories): ",
    expectation_rent_category_categories["success"],
)

Expectation 6 (rent_category categories):  True


In [88]:
# 7-8. Both coordinate columns must hold values of type float
expectation_latitude_type = ge_df.expect_column_values_to_be_of_type(
    column='latitude',
    type_='float',
)
expectation_longitude_type = ge_df.expect_column_values_to_be_of_type(
    column='longitude',
    type_='float',
)
print(
    "Expectation 7 (latitude type): ",
    expectation_latitude_type["success"],
)
print(
    "Expectation 8 (longitude type): ",
    expectation_longitude_type["success"],
)

Expectation 7 (latitude type):  True
Expectation 8 (longitude type):  True


In [91]:
# 9. 'age_of_listing_in_days' must hold values of type int64
expectation_age_of_listing_type = ge_df.expect_column_values_to_be_of_type(
    column='age_of_listing_in_days',
    type_='int64',
)
print(
    "Expectation 9 (age_of_listing_in_days type): ",
    expectation_age_of_listing_type["success"],
)

Expectation 9 (age_of_listing_in_days type):  True
