In [21]:
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics import jaccard_score

In [22]:
def pearson_compare(df1, df2, feature='Age'):
    """Return the Pearson correlation coefficient of one column across two frames.

    The column is pulled out of each frame as a plain array, so the values are
    paired by row position; no alignment on any key column is performed.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain the column named by ``feature``.
    feature : str, default 'Age'
        Name of the column to correlate.

    Returns
    -------
    float
        The correlation statistic reported by ``scipy.stats.pearsonr``.
    """
    left, right = (frame[feature].values for frame in (df1, df2))

    # pearsonr yields (statistic, p-value); only the statistic is returned.
    coefficient, _p_value = pearsonr(left, right)
    return coefficient
    
  
def jaccard_compare(df1, df2, feature='Age'):
    """Return the weighted Jaccard similarity of one column across two frames.

    The column is taken from each frame as a plain array and the two arrays
    are handed to ``sklearn.metrics.jaccard_score`` with ``average='weighted'``;
    values are therefore paired by row position, with ``df1`` passed first.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames that both contain the column named by ``feature``.
    feature : str, default 'Age'
        Name of the column to compare.

    Returns
    -------
    float
        The score returned by ``jaccard_score``.
    """
    columns = [frame[feature].values for frame in (df1, df2)]
    return jaccard_score(*columns, average='weighted')


In [31]:
def find_identical_rows(df1, df2, feature="Age"):
    """Return the ids whose ``feature`` value is equal in both frames.

    The frames are inner-joined on their ``"id"`` column, so only ids present
    in both are considered and rows are matched by id rather than by position.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing an ``"id"`` column and the column named by ``feature``.
    feature : str, default "Age"
        Name of the column to compare.

    Returns
    -------
    numpy.ndarray
        Ids (in ``df1`` order) for which both frames hold the same value.

    Raises
    ------
    KeyError
        If either frame lacks the ``"id"`` or ``feature`` column.
    """
    # Merge only the two columns that matter; this keeps the join small and
    # guarantees the feature column receives the suffixes below.
    keep = ["id", feature]
    merged = df1[keep].merge(df2[keep], on="id", suffixes=("_df1", "_df2"))

    same_value = merged[f"{feature}_df1"] == merged[f"{feature}_df2"]
    return merged.loc[same_value, "id"].values

def find_different_rows(df1, df2, feature="Age"):
    """Return the ids whose ``feature`` value differs between the two frames.

    The frames are inner-joined on their ``"id"`` column, so only ids present
    in both are considered and rows are matched by id rather than by position.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing an ``"id"`` column and the column named by ``feature``.
    feature : str, default "Age"
        Name of the column to compare.

    Returns
    -------
    numpy.ndarray
        Ids (in ``df1`` order) for which the two frames hold different values.

    Raises
    ------
    KeyError
        If either frame lacks the ``"id"`` or ``feature`` column.
    """
    # Merge only the two columns that matter; this keeps the join small and
    # guarantees the feature column receives the suffixes below.
    keep = ["id", feature]
    merged = df1[keep].merge(df2[keep], on="id", suffixes=("_df1", "_df2"))

    value_differs = merged[f"{feature}_df1"] != merged[f"{feature}_df2"]
    return merged.loc[value_differs, "id"].values

In [40]:
def compare_age_values(df1, df2, different_ids):
    """Return the ``Age`` values of both frames side by side for the given ids.

    Rows are looked up by the ``"id"`` column when a frame has one; a frame
    without an ``"id"`` column is assumed to be indexed by id already and is
    looked up through its index. Looking ids up directly in the index of a
    frame that still carries a default ``RangeIndex`` would either raise
    ``KeyError`` or silently select unrelated rows.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing an ``"Age"`` column and identified either by an
        ``"id"`` column or by their index.
    different_ids : list-like
        Ids to extract, in the order they should appear in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Age_df1"`` and ``"Age_df2"``, one row per requested id,
        with a fresh positional index.

    Raises
    ------
    KeyError
        If a requested id is missing from either frame.
    """
    def _ages_for_ids(frame):
        # Key the frame by id so .loc selects on ids, not on row positions.
        keyed = frame.set_index("id") if "id" in frame.columns else frame
        return keyed.loc[different_ids, "Age"].reset_index(drop=True)

    compared_data = pd.concat([_ages_for_ids(df1), _ages_for_ids(df2)], axis=1)
    compared_data.columns = ["Age_df1", "Age_df2"]
    return compared_data

In [42]:
def compare_age_values2(df1, df2, different_ids):
    """Return the ``Age`` values of both frames side by side, paired by id.

    Only ids that occur in the ``"id"`` column of both frames are used; any
    other requested id is ignored. The two frames are joined on ``"id"`` so
    each output row holds the two ages of the same id, even when the frames
    list their ids in different orders.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames containing ``"id"`` and ``"Age"`` columns.
    different_ids : iterable
        Ids to extract.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``"Age_df1"`` and ``"Age_df2"`` in ``df1`` row order with a
        fresh positional index, or ``None`` (after printing a message) when
        none of the requested ids exist in both frames.
    """
    common_ids = set(df1["id"]).intersection(df2["id"])
    valid_ids = [idx for idx in different_ids if idx in common_ids]

    if not valid_ids:
        print("No valid ids found.")
        return None

    df1_selected = df1.loc[df1["id"].isin(valid_ids), ["id", "Age"]]
    df2_selected = df2.loc[df2["id"].isin(valid_ids), ["id", "Age"]]

    # Join on id instead of concatenating by position: positional pairing
    # mixes up ages whenever the two frames are not sorted identically.
    paired = df1_selected.merge(df2_selected, on="id", suffixes=("_df1", "_df2"))
    return paired[["Age_df1", "Age_df2"]]

In [76]:
# First submission file, read from a path relative to the working directory.
df1 = pd.read_csv(r'preds\submitted\EnsembleNotebook2_1.3387.csv')
# df2 = pd.read_csv(r'C:\Users\Emincan\Desktop\Playground\submission_mattop_nocat_xgb_lgb_hgb_pca_dropped.csv')

# Second submission file: keep only the id and Ensemble_PP columns and expose
# the latter as "Age" so both frames share the column name being compared.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run elsewhere without editing it.
df3 = (
    pd.read_csv(
        r'C:\Users\Emincan\Desktop\Playground\MultiSubs\3.try\EnsembleSubs_V5.csv',
        usecols=['id', 'Ensemble_PP'],
    )
    .rename(columns={"Ensemble_PP": "Age"})
)
# df3 = pd.read_csv(r'EnsembleSubs_V5.csv')

In [81]:
# Score how closely the "Age" columns of the two loaded submissions agree,
# first with Pearson correlation, then with weighted Jaccard similarity.
correlation, jaccard = pearson_compare(df1, df3), jaccard_compare(df1, df3)

print(f"Pearson Korelasyon Katsayısı: {correlation}")
print(f"Jaccard Benzerlik Katsayısı: {jaccard}")

Pearson Korelasyon Katsayısı: 0.9993016994345755
Jaccard Benzerlik Katsayısı: 0.9850741462007491


In [82]:
# Partition the ids shared by both submissions into those whose "Age" matches
# and those whose "Age" differs.
identical_ids = find_identical_rows(df1, df3)
different_ids = find_different_rows(df1, df3)

# Each heading goes on its own line, followed by the corresponding id array.
print("Identical IDs:", identical_ids, sep="\n")

print("Different IDs:", different_ids, sep="\n")

Identical IDs:
[ 74051  74052  74053 ... 123416 123417 123418]
Different IDs:
[ 74116  74138  74202  74217  74413  74418  74627  74967  75086  75435
  75604  75722  75921  76070  76203  76295  76553  76557  76839  76952
  77437  77512  77693  78220  78246  78560  78904  78927  79006  79024
  79168  79473  79519  79530  79538  79546  79565  79680  79790  79801
  80156  80362  80401  80526  80718  80807  80821  81061  81241  81424
  81551  81613  81618  81923  81934  82035  82064  82071  82139  82209
  82238  82416  82790  82871  83028  83153  83327  83408  83412  83651
  83748  83791  83861  84092  84213  84457  84467  84498  84520  84577
  84771  84989  85146  85177  85298  85326  85463  85572  85764  86087
  86109  86261  86375  86420  86592  86830  86965  87052  87101  87279
  87382  87396  87454  88004  88036  88172  88245  88520  88522  88941
  89024  89064  89127  89264  89266  89662  89691  89725  89726  90070
  90160  90408  90446  90657  90924  90991  91002  91181  91193  91330

In [79]:
# Report how many shared ids agree and how many disagree, one count per line.
print(
    f'Tamamen aynı index sayısı : {len(identical_ids)}',
    f'Farklı index sayısı : {len(different_ids)}',
    sep="\n",
)

Tamamen aynı index sayısı : 48996
Farklı index sayısı : 372


In [80]:
# Recompute the differing ids here so this cell does not rely on an earlier
# cell having been run, then pull the two "Age" values for each of them.
different_ids = find_different_rows(df1, df3)
compared_data = compare_age_values2(df1, df3, different_ids)

print("Compared Age Values:")
# Show at most the first 50 rows as the cell's output.
compared_data[:50]

Compared Age Values:


Unnamed: 0,Age_df1,Age_df2
0,10,11
1,17,16
2,11,12
3,17,16
4,10,9
5,9,8
6,9,10
7,11,12
8,9,8
9,9,8


In [74]:
# Display the second submission frame (columns: id, Age) as the cell output;
# the notebook rendering elides the middle rows of a frame this long.
df3

Unnamed: 0,id,Age
0,74051,7
1,74052,8
2,74053,10
3,74054,9
4,74055,7
...,...,...
49363,123414,9
49364,123415,8
49365,123416,13
49366,123417,9


In [75]:
# Write the (id, Age) frame to a new CSV in the working directory, leaving
# the DataFrame index out of the file.
df3.to_csv(
    'EnsembleNotebook2_3.csv',
    index=False,
)

In [None]:
# Done.