In [None]:
import os

from pyspark.sql import SparkSession

# Create the Spark session on the standalone master.
# The master URL can be overridden with SPARK_MASTER_URL (e.g. "local" for a
# local run); the default is the cluster this notebook was written for.
spark = (
    SparkSession.builder
    .master(os.environ.get("SPARK_MASTER_URL", "spark://172.20.53.96:7077"))
    .appName("WDC-complete")
    .config("spark.executor.memory", "28g")
    .config("spark.driver.memory", "28g")
    .getOrCreate()
)

# NOTE(review): the spark.worker.cleanup.* keys look like standalone *worker
# daemon* settings; setting them on a running session presumably has no
# effect. They are kept unchanged here -- confirm and move them to the worker
# configuration if cleanup is really needed.
spark.conf.set("spark.worker.cleanup.enabled", True)
spark.conf.set("spark.worker.cleanup.interval", 1800)
spark.conf.set("spark.worker.cleanup.appDataTtl", 3600)
spark.conf.set("spark.sql.shuffle.partitions", 1000)

# Connection settings for the S3 storage server that holds the DataCommons files.
# Credentials must never be committed in the notebook: they are read from the
# environment. The keys that were previously hard-coded here should be
# considered leaked and rotated.
endpoint_url = os.environ.get("S3_ENDPOINT_URL", 'https://s3.os-bird.glicid.fr/')
try:
    aws_access_key_id = os.environ["AWS_ACCESS_KEY_ID"]
    aws_secret_access_key = os.environ["AWS_SECRET_ACCESS_KEY"]
except KeyError as missing_var:
    raise RuntimeError(
        f"Environment variable {missing_var} is not set; export "
        "AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY before starting the notebook."
    ) from missing_var

hadoopConf = spark._jsc.hadoopConfiguration()
hadoopConf.set('fs.s3a.access.key', aws_access_key_id)
hadoopConf.set('fs.s3a.secret.key', aws_secret_access_key)
hadoopConf.set('fs.s3a.endpoint', endpoint_url)
hadoopConf.set('fs.s3a.path.style.access', 'true')
hadoopConf.set('fs.s3a.impl', 'org.apache.hadoop.fs.s3a.S3AFileSystem')

# NOTE(review): a Spark property set on the Hadoop configuration is presumably
# ignored; kept for backward compatibility -- confirm before removing.
hadoopConf.set('spark.worker.cleanup.enabled', 'true')
hadoopConf.set('fs.s3a.committer.name', 'magic')

In [None]:
from pyspark.sql import functions as f

from pyspark.sql.functions import split
from pyspark.sql.functions import col

def _aggregate_psets(raw_df):
    """Sum ``count`` per ``pset`` and sort by descending total.

    The result is cached because the temp views built on it are queried once
    per schema.org type further down; without caching every one of those
    queries would re-read the CSV files from S3.
    """
    return (
        raw_df.groupby("pset")
        .agg(f.sum("count").alias("count"))
        .sort(f.desc("count"))
        .cache()
    )


# Characteristic sets of the "before" graph.
readavant = spark.read.option("header", True).csv("s3a://test-out/wdcfix/**")

csavant = _aggregate_psets(readavant)
csavant.createOrReplaceTempView("CSET_avant")
csavant.show(truncate=150)

# Characteristic sets of the "after" graph.
readapres = spark.read.option("header", True).csv("s3a://test-out/cset-wdc-2023-fix2/**")

# Fix wrong formatting: strip "http(s)://" and "www." (case-insensitive) from
# the predicate sets. Raw string so that "\." is a regex escape and not an
# invalid Python escape sequence.
url_prefix_pattern = r"([Hh][Tt][Tt][Pp][Ss]?://)?([Ww]{3}\.)?"

# Normalise BEFORE aggregating, so that psets which only differed by the
# stripped prefix are merged into one row instead of staying as duplicates.
csapres = _aggregate_psets(
    readapres.withColumn(
        "pset", f.regexp_replace(f.col("pset"), url_prefix_pattern, "")
    )
)

csapres.createOrReplaceTempView("CSET_apres")
csapres.show(truncate=150)

In [None]:
# Computes the count, the average and the coverage of one data type.
def calculate_countavcov(data, pred, top_n=10):
    """Compute count, average and coverage of the predicate sets matching ``pred``.

    Args:
        data: Name of a table / temp view exposing a space-separated ``pset``
            column and a numeric ``count`` column.
        pred: Substring to look for in ``pset`` (matched literally).
        top_n: Number of most frequent matching sets used for the coverage
            (default 10, the value previously hard-coded).

    Returns:
        A ``Row(type, count, average, coverage)`` of floats (``type`` is
        ``pred``, unmodified). All metrics are 0.0 when nothing matches.

        * count    -- sum of ``count`` over the matching sets
        * average  -- sum(size(pset) * count) / sum(count)
        * coverage -- over the ``top_n`` sets with the highest count:
          sum(size(pset) * count) / (sum(count) * number of distinct predicates)
    """
    # Function-scope imports: the function no longer depends on names imported
    # in other notebook cells (``Row`` was imported in a later cell).
    from pyspark.sql import Row
    from pyspark.sql import functions as F

    # DataFrame API instead of string-built SQL: ``pred`` and ``data`` are never
    # spliced into a query, so no quoting/escaping is needed and ``pred`` is
    # returned exactly as received.
    matching = (
        spark.table(data)
        .where(F.col("pset").contains(pred))
        .select(
            F.split(F.col("pset"), " ").alias("pset"),
            F.col("count").cast("double").alias("count"),
        )
    )

    # Both totals in a single Spark job.
    totals = matching.agg(
        F.sum("count").alias("count_sum"),
        F.sum(F.size("pset") * F.col("count")).alias("count_used"),
    ).first()
    count_sum, count_used = totals["count_sum"], totals["count_used"]

    # Nothing matched (NULL sums) or the total is zero: avoid dividing by zero.
    if not count_sum or count_used is None:
        return Row(type=pred, count=float(0), average=float(0), coverage=float(0))

    average = count_used / count_sum

    # Explicit ordering: "top sets" must not depend on the row order of the
    # underlying view. The handful of rows is collected once and reduced on the
    # driver, so the three coverage terms are computed on the same rows.
    top_rows = matching.orderBy(F.desc("count")).limit(top_n).collect()

    top_count_sum = 0.0
    top_count_used = 0.0
    top_predicates = set()
    for row in top_rows:
        weight = row["count"] or 0.0  # NULL counts contribute nothing to the sums
        top_count_sum += weight
        top_count_used += len(row["pset"]) * weight
        top_predicates.update(row["pset"])

    denominator = top_count_sum * len(top_predicates)
    coverage = top_count_used / denominator if denominator else 0.0

    print(f" {pred}:  ; count = {count_sum} ; average = {average} ; coverage = {coverage}")
    return Row(type=pred, count=float(count_sum), average=float(average), coverage=float(coverage))

In [None]:
from pyspark.sql import Row
import json

rowsavant = []
rowsapres = []

# Read the schema.org type names from the JSON file.
# The file is the single source of truth for the type list (the previously
# inlined, commented-out fallback list used entries of the form
# "isa:<schema.org/Thing>").
with open('updatedTypesWithoutIntangibles.json', 'r', encoding='utf-8') as schema_type_json:
    type_name_list = json.load(schema_type_json)

# Fail early with a clear message: Spark cannot infer a schema from an empty
# list of rows further down.
if not type_name_list:
    raise ValueError("updatedTypesWithoutIntangibles.json does not contain any type name")

# Show a short preview instead of dumping the whole list in the cell output.
print(f"{len(type_name_list)} types loaded, first entries: {list(type_name_list)[:5]}")

# Compute the metrics of every type on the "before" and "after" graphs.
for pred in type_name_list:
    print(pred)  # progress indicator
    rowsavant.append(calculate_countavcov("CSET_avant", pred))
    rowsapres.append(calculate_countavcov("CSET_apres", pred))

dfavant = spark.createDataFrame(rowsavant)
dfavant.createOrReplaceTempView("avcovavant")
dfavant.show()

dfapres = spark.createDataFrame(rowsapres)
dfapres.createOrReplaceTempView("avcovapres")
dfapres.show()



In [None]:
# Rank the types of both graphs, first by average, then by coverage.
for ranking_metric in ("average", "coverage"):
    for metrics_view in ("avcovavant", "avcovapres"):
        ranking = spark.sql(f"select * from {metrics_view} order by {ranking_metric} DESC")
        ranking.show(truncate=0)

In [None]:
def _evolution_columns(metric):
    """Return the SELECT fragment (before, after, % evolution) for one metric column.

    The percentage is NULL when the "before" value is 0: NULLIF makes this
    explicit, so the query gives the same result whether or not Spark is
    configured to raise on division by zero. The numeric branch is cast to
    STRING explicitly because the other CASE branches are text messages.
    """
    return f"""
    avcovavant.{metric} AS {metric}_before,
    avcovapres.{metric} AS {metric}_after,
    CASE
        WHEN avcovavant.type IS NULL THEN "This type is only in the after graph"  -- Type is only in avcovapres
        WHEN avcovapres.type IS NULL THEN "This type is only in the before graph"  -- Type is only in avcovavant
        ELSE CAST(
            (avcovapres.{metric} - avcovavant.{metric}) / NULLIF(avcovavant.{metric}, 0) * 100.0
            AS STRING)
    END AS percentage_{metric}_evolution"""


# One (before, after, percentage) triple per metric, in the original column order.
_evolution_select = ",".join(
    _evolution_columns(metric) for metric in ("count", "average", "coverage")
)

# Compare the metrics of every type between the "before" and "after" graphs.
covavevolution = spark.sql(f"""
SELECT
    COALESCE(avcovavant.type, avcovapres.type) AS type,{_evolution_select}
FROM
    avcovavant
FULL OUTER JOIN
    avcovapres
ON
    avcovavant.type = avcovapres.type
""")
covavevolution.show(truncate=110)
covavevolution.createOrReplaceTempView("covavevolution")

In [None]:
# Save the covavevolution results as CSV, then read the files back to check
# what was actually written.
evolution_csv_path = "s3a://test-out/types/covavevolutionresult-fix"

evolution_writer = covavevolution.write.option("header", True).mode("overwrite")
evolution_writer.csv(evolution_csv_path)

read = spark.read.option("header", True).csv(evolution_csv_path)
read.show(truncate=110)


In [None]:
# Rank the types by a combined score: each percentage evolution is divided by
# the maximum evolution of its metric, and the two normalised values are added.
#
# The maxima are computed once in a one-row aggregate that is cross-joined to
# every row, instead of an un-partitioned window (MAX(...) OVER ()), which
# makes Spark move all rows to a single partition. NULLIF makes the score NULL
# when a maximum is 0 instead of dividing by zero.
#
# NOTE(review): if a maximum is negative the normalisation inverts the
# ranking for that metric -- confirm this is acceptable for the data at hand.
covavevolutionOrder = spark.sql("""
    SELECT
        evolution.*,
        (CAST(evolution.percentage_average_evolution AS DOUBLE) / NULLIF(maxima.max_average_evolution, 0)) +
        (CAST(evolution.percentage_coverage_evolution AS DOUBLE) / NULLIF(maxima.max_coverage_evolution, 0)) AS combined_distance
    FROM
        covavevolution AS evolution
    CROSS JOIN (
        SELECT
            MAX(CAST(percentage_average_evolution AS DOUBLE)) AS max_average_evolution,
            MAX(CAST(percentage_coverage_evolution AS DOUBLE)) AS max_coverage_evolution
        FROM
            covavevolution
    ) AS maxima
    ORDER BY combined_distance DESC
""")
covavevolutionOrder.show(truncate=110)
covavevolutionOrder.createOrReplaceTempView("covavevolutionOrder")

In [None]:
# Display only the type and its two percentage evolutions, untruncated.
evolution_summary = spark.sql("""
    SELECT type, percentage_average_evolution, percentage_coverage_evolution
    FROM covavevolutionOrder
""")
evolution_summary.show(truncate=0)