In [1]:
import pandas as pd
import re 

from scipy.stats import median_test
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
        
alpha = 0.05

import warnings
warnings.filterwarnings('ignore')

In [2]:
def print_result(p):
    if p > alpha:
        print("p-val:",p,'Same distributions (fail to reject H0)')
    else:
        print("p-val:",p,'Different distributions (reject H0)')
        
def check_hypo(df1, col1 , df2, col2):
    print("\n From Median Test:")
    stat, p, med, tbl = median_test(df1[col1], df2[col2])
    print_result(p)
    print("\n From Mann-Whitney U Test:")
    stat, p = mannwhitneyu(df1[col1], df2[col2])
    print_result(p)
    print("\n From Kruskal-Wallis H Test")
    stat, p = kruskal(df1[col1], df2[col2])
    print_result(p)

In [3]:
doc_pat_data = pd.read_csv("Final_dataset.csv")

# Doctors with high popularity receive better grades based on:
## Doctors who have more recommendations from colleagues

In [4]:
check_hypo(doc_pat_data,"grade_x",doc_pat_data,"colleagueRecommendations")


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


## Doctors with more recommendations from patients

In [5]:
doc_pat_data["recommendations"].isna().sum()

recomm_vs_grade = doc_pat_data[doc_pat_data["recommendations"].notna()]
recomm_vs_grade["recommend_percent"] = recomm_vs_grade["recommendations"].apply(lambda x: int(x[:-1]))

check_hypo(recomm_vs_grade, "grade_x",recomm_vs_grade,"recommend_percent" )


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 0.0 Different distributions (reject H0)


## Doctors with more views on their profile

In [6]:
check_hypo(doc_pat_data, "grade_x",doc_pat_data,"views")


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


## Doctors who have more reviews

In [7]:
check_hypo(doc_pat_data,"grade_x", doc_pat_data,"reviewsCount")


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


## Doctors with a greater number of likes on their profile

In [8]:
# doc_pat_data["likes"].isna().sum() #considering only profiles available with likes! filtering kssing values  

profiles_with_likes = doc_pat_data[doc_pat_data["likes"].notna()] 
check_hypo(profiles_with_likes,"grade_x", profiles_with_likes,"likes")


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 0.0 Different distributions (reject H0)


# Female doctors have better grades than male doctors.

In [9]:
male_gen_phy = doc_pat_data[doc_pat_data["gender"].isin(['MALE'])]
female_gen_phy = doc_pat_data[doc_pat_data["gender"].isin(['FEMALE'])]

In [10]:
check_hypo(male_gen_phy,"grade_x", female_gen_phy,"grade_x")


 From Median Test:
p-val: 8.746403126515193e-70 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 4.375394878891139e-37 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


# Doctors with higher qualifications get better grades.

In [11]:
doc_pat_data["name_title"].unique()

array([nan, 'Dr. med.', 'Dipl.-Med.', 'Prof. Dr. med.', 'Dr.',
       'MR Dr. med.', 'OMR Dr. med.', 'Dr.h.c.', 'Dr. Dr. med.',
       'Dr. rer.nat.', 'Dr. Dr.', 'Dipl.-Psych.', 'Prof. Dr. Dr. med.',
       'Dr. med. M.Sc.', 'Dr. med. habil.', 'Dr. phil.nat.',
       'Priv.-Doz. Dr. med.', 'MD Dr. med.', 'M.Sc.', 'Prof.',
       'Dr. Dr. phil.', 'Dr. phil.', 'Priv.-Doz. Dr. Dr.'], dtype=object)

In [13]:
# doc_pat_data["name_title"].isna().sum()

dr_or_not = doc_pat_data[doc_pat_data["name_title"].notna()]
# qualified_doc = dr_or_not[dr_or_not["name_title"].str.match('Dr.')]
dr_or_not["high_qual"] = dr_or_not["name_title"].apply(lambda x: 1 if 'Dr.' in x else 0)

In [14]:
check_hypo(dr_or_not,"grade_x", dr_or_not,"high_qual")


 From Median Test:
p-val: 0.0 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 0.0 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


# Doctors who provide good services receive better grades based on:
## Doctors who give an online appointment


In [17]:
data = pd.read_csv("Dataset_1_selected_columns.csv")

graded_docs = data[data["grade"].notna() & data["otvCustomerType"].notna()]

check_hypo(graded_docs,"grade", graded_docs,"otvCustomerType")


 From Median Test:


TypeError: '<' not supported between instances of 'str' and 'float'

158183

## Doctors who give consultations in multiple locations

# The older patient population give better ratings than their younger group.

In [19]:
old_patients = doc_pat_data[doc_pat_data["age"]  == "über 50"]
young_patients = doc_pat_data[doc_pat_data["age"]  == "unter 30"]

In [20]:
check_hypo(old_patients,"grade_y", young_patients,"grade_y")



 From Median Test:
p-val: 2.059593723750407e-140 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 2.793609948773142e-178 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 1.0 Same distributions (fail to reject H0)


# Patients covered by private health insurance give more favorable evaluations than patients covered by statutory health insurance

In [30]:
# doc_pat_data["insurance"].unique()

private_ins_patients = doc_pat_data[doc_pat_data["insurance"].isin(['PRIVATE'])]
statutory_ins_patients = doc_pat_data[doc_pat_data["insurance"].isin(['STATUTORY'])]

array([nan, 'STATUTORY', 'PRIVATE'], dtype=object)

In [33]:
check_hypo(private_ins_patients,"grade_y", statutory_ins_patients,"grade_y")


 From Median Test:
p-val: 1.509362288102909e-37 Different distributions (reject H0)

 From Mann-Whitney U Test:
p-val: 3.801659888438282e-47 Different distributions (reject H0)

 From Kruskal-Wallis H Test
p-val: 0.0 Different distributions (reject H0)
