In [0]:
%run /Users/sunnynuri12@gmail.com/Big_project_01/Data_Loader

In [0]:
##calling Data Loader
dl=DataLoader("/FileStore/Big_Proj_01/Bronze_Layer/Insurance_Company.csv")
return_df=dl.create_df()


In [0]:
from pyspark.sql.functions import col, date_format, to_date

## getting a df from loader and cleaning the data.
class DataCleaning():

    ## init method
    def __init__(self,df:'Dataframe'):
        self.df=df

    def clean_data(self):

        ## Standardizing dates
        standard_dates=return_df.withColumn("Opened",to_date("Opened","MM/dd/yyyy")).withColumn("Closed",to_date("Closed","MM/dd/yyyy"))

        ## marking not available in all string columns where null is present
        cleaned_df=standard_dates.fillna({"Coverage":"Not Available",
                                "SubCoverage":"Not Available",
                                "Reason":"Not Available",
                                "SubReason":"Not Available",
                                "Disposition":"Not Available",
                                "Conclusion":"Not Available",
                                "Status":"Not Available",
                                "Recovery":0})
        return cleaned_df
        
        
    

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, max, monotonically_increasing_id

class DimTables():

    def __init__(self,df:"Dataframe"):
        self.df=df
        # Create an instance of DataCleaning inside DimTables
        self.cleaned_df = DataCleaning(self.df).clean_data()

    def company_dim(self):
        # Define the schema for the final Company Table
        schema = StructType([
            StructField("CompanyID", IntegerType(), nullable=False),
            StructField("Company", StringType(), nullable=False)
        ])

        # Extract unique companies from the new data (only the Company column)
        new_company_df = self.cleaned_df.select("Company").distinct()

        # Try to load the existing Company Table
        try:
            existing_company_df = spark.table("/FileStore/Big_Proj_01/Silver_Layer/company")
        except Exception as e:
            # If the table doesn't exist, create an empty DataFrame with the correct schema
            existing_company_df = spark.createDataFrame([], schema)

        # Identify companies that are new (not present in the existing Company Table)
        new_companies_only = new_company_df.join(existing_company_df, "Company", "leftanti")

        # Find the current maximum CompanyID in the existing table
        if not existing_company_df.rdd.isEmpty():
            max_id = existing_company_df.agg(max("CompanyID")).collect()[0][0]
        else:
            max_id = 0  # If the table is empty, start from 0

        # Assign new CompanyID to new companies (starting from the next available ID)
        # Cast monotonically_increasing_id to IntegerType and add max_id to it
        new_companies_with_id = new_companies_only.withColumn("CompanyID", (monotonically_increasing_id() + max_id + 1).cast(IntegerType()))

        # Combine existing and new companies, ensuring no duplicate CompanyID
        # Enforce the correct schema for the union
        combined_company_df = existing_company_df.unionByName(new_companies_with_id.select("CompanyID", "Company"))

        # combined_company_df.display()
        # Optional: Write the combined DataFrame to a Delta table or save it back to the desired format
        combined_company_df.write.format("delta").mode("append").save("/FileStore/Big_Proj_01/Silver_Layer/company")


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType
from pyspark.sql.functions import col, max, monotonically_increasing_id

class DimTables():

    def __init__(self,df:"Dataframe"):
        self.df=df
        # Create an instance of DataCleaning inside DimTables
        self.cleaned_df = DataCleaning(self.df).clean_data()

    def create_dim_tables(self,schema,column_name,table_name):

        # Extract unique companies from the new data (only the Company column)
        distinct_df = self.cleaned_df.select(column_name).distinct()

        # distinct_df.show()

        # Try to load the existing Company Table
        try:
            existing_data_df = spark.table(f"/FileStore/Big_Proj_01/Silver_Layer/delta_tables/{table_name}")
        except Exception as e:
            # If the table doesn't exist, create an empty DataFrame with the correct schema
            existing_data_df = spark.createDataFrame([], schema)


        # Identify companies that are new (not present in the existing Company Table)
        new_data_only = distinct_df.join(existing_data_df, column_name, "leftanti")

        

        # Find the current maximum CompanyID in the existing table
        if not existing_data_df.rdd.isEmpty():
            max_id = existing_data_df.agg(max(f"{table_name}ID")).collect()[0][0]
        else:
            max_id = 0  # If the table is empty, start from 0

        # # Assign new CompanyID to new companies (starting from the next available ID)
        # # Cast monotonically_increasing_id to IntegerType and add max_id to it
        new_data_with_id = new_data_only.withColumn(f"{table_name}ID", (monotonically_increasing_id() + max_id + 1).cast(IntegerType()))


        # # Combine existing and new companies, ensuring no duplicate CompanyID
        # # Enforce the correct schema for the union
        combined_data_df = existing_data_df.unionByName(new_data_with_id.select(f"{column_name}ID", column_name))

        # #combined_company_df.display()
        # # creating delta tables
        combined_data_df.write.format("delta").mode("append").save(f"/FileStore/Big_Proj_01/Silver_Layer/delta_tables/{table_name}")
        print(f"/FileStore/Big_Proj_01/Silver_Layer/delta_tables/{table_name}")

        return combined_data_df

    def create_all_dims(self):
        # List of dimension configurations (column_name, table_name, schema)
        dim_configurations = [
            ("Company", "company", StructType([StructField("CompanyID", IntegerType(), nullable=False), StructField("Company", StringType(), nullable=False)])),

            ("Reason", "reason", StructType([StructField("ReasonID", IntegerType(), nullable=False), StructField("Reason", StringType(), nullable=False)])),

            ("SubReason", "subreason", StructType([StructField("SubReasonID", IntegerType(), nullable=False), StructField("SubReason", StringType(), nullable=False)])),

            ("Coverage", "coverage", StructType([StructField("CoverageID", IntegerType(), nullable=False), StructField("Coverage", StringType(), nullable=False)])),

            ("SubCoverage", "subcoverage", StructType([StructField("SubCoverageID", IntegerType(), nullable=False), StructField("SubCoverage", StringType(), nullable=False)])),

            ("Disposition", "disposition", StructType([StructField("DispositionID", IntegerType(), nullable=False), StructField("Disposition", StringType(), nullable=False)])),

            ("Conclusion", "conclusion", StructType([StructField("ConclusionID", IntegerType(), nullable=False), StructField("Conclusion", StringType(), nullable=False)])),

            ("Status", "status", StructType([StructField("StatusID", IntegerType(), nullable=False), StructField("Status", StringType(), nullable=False)]))
            
        ]
        
        # Loop through each dimension configuration and create dimension tables
        for column_name, table_name, schema in dim_configurations:
            self.create_dim_tables(column_name=column_name, table_name=table_name, schema=schema)
            print(column_name, table_name, schema)




/FileStore/Big_Proj_01/Silver_Layer/delta_tables/company
Company company StructType([StructField('CompanyID', IntegerType(), False), StructField('Company', StringType(), False)])
/FileStore/Big_Proj_01/Silver_Layer/delta_tables/reason
Reason reason StructType([StructField('ReasonID', IntegerType(), False), StructField('Reason', StringType(), False)])
/FileStore/Big_Proj_01/Silver_Layer/delta_tables/subreason
SubReason subreason StructType([StructField('SubReasonID', IntegerType(), False), StructField('SubReason', StringType(), False)])
/FileStore/Big_Proj_01/Silver_Layer/delta_tables/coverage
Coverage coverage StructType([StructField('CoverageID', IntegerType(), False), StructField('Coverage', StringType(), False)])
/FileStore/Big_Proj_01/Silver_Layer/delta_tables/subcoverage
SubCoverage subcoverage StructType([StructField('SubCoverageID', IntegerType(), False), StructField('SubCoverage', StringType(), False)])
/FileStore/Big_Proj_01/Silver_Layer/delta_tables/disposition
Disposition di

In [0]:
##testing for create_dim_tables function  remove later
obj1=DimTables(return_df)
df=obj1.create_dim_tables(column_name="Company", table_name="company", schema=StructType([StructField('CompanyID', IntegerType(), False), StructField('Company', StringType(), False)]))

df.limit(10).display()


CompanyID,Company
1,Crestbrook Insurance Company
2,Agent Alliance Insurance Company
3,Pacific Specialty Insurance Company
4,Pharmacists Mutual Insurance Company
5,Transamerica Advisors Life Insurance Company
6,Carolina Casualty Insurance Company
7,James River Insurance Company
8,Principal National Life Insurance Company
9,Crum & Forster Specialty Insurance Company
10,New South Insurance Company


In [0]:
# Registering a Delta table as a SQL temporary view
delta_df = spark.read.format("delta").load("/FileStore/Big_Proj_01/Silver_Layer/delta_tables/company/")
delta_df.createOrReplaceTempView("company_table")

# Now, you can query it using SQL
result = spark.sql("SELECT * FROM company_table")
result.display()


CompanyID,Company
1,Crestbrook Insurance Company
2,Agent Alliance Insurance Company
3,Pacific Specialty Insurance Company
4,Pharmacists Mutual Insurance Company
5,Transamerica Advisors Life Insurance Company
6,Carolina Casualty Insurance Company
7,James River Insurance Company
8,Principal National Life Insurance Company
9,Crum & Forster Specialty Insurance Company
10,New South Insurance Company
