### Importing the libraries:

In [1]:
import pandas as pd
import numpy as np
import os as os
import yaml
import io
from dotenv import load_dotenv
from pathlib import Path

### Reading the required parameters from the yaml file to set parameters for tuning: 

In [2]:
# Parse the tuning configuration file; `yaml_info` is the parsed document that
# the following cells index by key.
with open("tuning_columns.yaml", mode='r') as config_file:
    yaml_info = yaml.safe_load(config_file)

In [3]:
# Unpack the entries of the YAML config used for tuning:
#   'columns'                  -> fields_to_tune
#   'confidence_values_folder' -> confidence_files_dir (and its directory listing)
#   'sample_results'           -> CSV loaded into `results`
fields_to_tune = yaml_info['columns']
confidence_files_dir = yaml_info['confidence_values_folder']
list_files = os.listdir(confidence_files_dir)
results = pd.read_csv(yaml_info['sample_results'])

### Reading environment variables to set parameters for running confidence algo: 

In [11]:
dotenv_path = Path('.env')
load_dotenv(dotenv_path=dotenv_path)


def _required_env_number(name):
    """Return environment variable `name` parsed as a float.

    Raises KeyError naming the variable when it is not set, instead of the
    opaque ``float(None)`` TypeError.
    """
    raw_value = os.getenv(name)
    if raw_value is None:
        raise KeyError("Environment variable '" + name + "' is not set (expected in .env)")
    return float(raw_value)


#%%  Declaring all the variables

## locations where the files with confidence scores need to be saved
save_location = os.getenv('save_location')

## csv file with the SQL code repo
sql_repo_csv_location = os.getenv('sql_repo_csv_location')

## setting the maximum confidence a source can have:
maximum_confidence_value = _required_env_number('maximum_confidence_value')

## setting the maximum number of iterations in the algo:
## an iteration count must be an integer to be usable with range(); going
## through float first also accepts values written as "100.0"
maximum_number_of_iterations = int(_required_env_number('maximum_number_of_iterations'))

## setting the minimum difference in the trustworthiness scores to be reached to end the algo
minimum_absolute_difference = _required_env_number('minimum_absolute_difference')

# setting the variables for connecting to the DB:
## hostname
host = os.getenv('host')

## username
user = os.getenv('user')

## password
password = os.getenv('password')

## DB name
database = os.getenv('database')

## tuned gamma
## NOTE(review): kept as the raw string returned by os.getenv (None when unset);
## convert with float() before passing it as `gamma`.
tuned_gamma = os.getenv('tuned_gamma')
#%%

## connecting to the DB
#mydb = mysql.connector.connect(host = host, user = user , password= password , database= database )

### Reading data directly:

In [2]:
# Load one semicolon-separated extract into `data`. The commented-out lines are
# alternative extracts that were swapped in by hand.
# NOTE(review): two reads are active in this cell; the first ('dataLandArea 40.csv')
# is immediately overwritten by the later 'dataFarmer Occupation0.csv' read, so only
# the latter is in `data` after the cell runs.
# NOTE(review): the paths are absolute and machine-specific.
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/datakhatano_36.csv' )
data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataLandArea 40.csv' ,header= None,sep=';', names =['krushk_id','aadhar_no','tehsil_no','village','khata_no','plot_no','field','self','bhulekh','mpas','ppas'])
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/datalandholdingarea_39.csv' ,header= None,sep=';', names =['krushk_id','aadhar_no','tehsil_no','village','khata_no','plot_no','field','self','bhulekh'])
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/datakhatano_35.csv' ,header= None,sep=';', names =['krushk_id','aadhar_no','tehsil_no','village','khata_no','plot_no','field','self','bhulekh','mpas','ppas'])
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/datakharifcrops.csv' )
#data =  pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataGram Panchayat,Ward21.csv')
#data =  pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataVillage22.csv')
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataDistrict31.csv')
# This read replaces the frame loaded above.
data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataFarmer Occupation0.csv',header= None,sep=';' ,names = ['krushk_id','aadhar_no','field', 'self', 'mpas', 'ppas', 'bhulekh'])
# data =  pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataActivities3.csv',header= None,sep=';', names =['krushk_id','aadhar_no','field','self','mpas','ppas'])
#data = pd.read_csv('D:/Samagra/KO/gautam_data/test_conf/dataplotno_36.csv',header= None,sep=';' ,names = ['krushk_id','aadhar_no','tehsil_no','village','khata_no','plot_no','field','self','bhulekh','mpas','ppas'])

### Pre-defined functions: 

In [94]:
def carry_out_iterations(data, list_of_cols, t_w, id_colname, gamma,
                         max_t_w_value=0.975, max_iterations=100,
                         min_abs_difference=0.001, verbose=True):
    """
    Iteratively estimate a trustworthiness score per source and a confidence
    score per (row, source) value.

    Every column in `list_of_cols` is one source. For a given row and source,
    each other source that reports the same value adds its weight
    tau = -log(1 - t_w), each one that reports a different value subtracts it,
    missing values contribute nothing, and the source itself always adds its
    own weight. The confidence is sigmoid(gamma * weighted_sum). After every
    pass the trustworthiness of a source becomes the mean confidence of its
    non-missing values, capped at `max_t_w_value`.

    Arguments:
        data : the table serving as input for the confidence algo
        list_of_cols : list of column names that serve as unique sources for the data points
        t_w : initial confidence values (source trustworthiness), one per entry of
            `list_of_cols` and in the same order. Usually 0.5 for each source.
            Read positionally, so a numpy array, list or pandas Series all work;
            the caller's object is never modified.
        id_colname : column name with unique id value for each row
        gamma : multiplier applied to the weighted vote sum inside the sigmoid
        max_t_w_value : upper cap for the updated trustworthiness (default 0.975)
        max_iterations : maximum number of passes (default 100)
        min_abs_difference : stop once the summed absolute change in
            trustworthiness is below this value; only checked from the 7th
            pass onwards (default 0.001)
        verbose : print the pass number and the trustworthiness used in it

    Returned values:
        t_w_df : one row per pass holding the trustworthiness that was USED in
            that pass (row 0 is the initial `t_w`; the update computed in the
            final pass is not recorded)
        train_data_confidence : confidence per source for the rows that have
            more than one non-missing source value, plus the `id_colname` column

    Raises:
        ValueError : if `t_w` has fewer entries than `list_of_cols`
    """
    source_cols = list(list_of_cols)
    n_cols = len(source_cols)

    # Only rows where at least two sources report a value can be cross-checked.
    train_data = data[source_cols]
    train_data = train_data.loc[train_data.notna().sum(axis=1) > 1, :]
    is_missing = train_data.isna().to_numpy()

    # Private float copy, indexed by position (never by Series label).
    trust = np.array(t_w, dtype=float)
    if trust.shape[0] < n_cols:
        raise ValueError("t_w must provide one value per column in list_of_cols")
    trust = trust[:n_cols]
    tau_w = -np.log(1 - trust)

    # The agreement signs (+1 same value, -1 different value, 0 missing) depend
    # only on the data, not on the current weights, so they are built once.
    agreement_by_col = []
    for col_pos, col_name in enumerate(source_cols):
        current_source = train_data[col_name]
        agreement = np.empty(is_missing.shape, dtype=np.int8)
        for other_pos, other_name in enumerate(source_cols):
            if other_pos == col_pos:
                agreement[:, other_pos] = 1
            else:
                agreement[:, other_pos] = np.where(train_data[other_name] == current_source, 1, -1)
        agreement[is_missing] = 0
        agreement_by_col.append(agreement)

    train_data_confidence = pd.DataFrame(0.0, index=train_data.index, columns=source_cols)
    trust_history = []

    for iteration in range(int(max_iterations)):

        for col_pos, col_name in enumerate(source_cols):
            # nansum: a source whose weight is NaN (it has no values in the
            # retained rows) is skipped instead of poisoning the whole row.
            weighted_votes = np.nansum(agreement_by_col[col_pos] * tau_w, axis=1)
            confidence = 1 / (1 + np.exp(-1 * gamma * weighted_votes))
            confidence[is_missing[:, col_pos]] = np.nan
            train_data_confidence[col_name] = confidence

        ## maintaining record of the trustworthiness scores of the sources
        trust_prev = trust
        trust_history.append(trust_prev.copy())
        column_means = np.array(train_data_confidence[source_cols].mean(), dtype=float)
        # np.minimum keeps NaN as NaN, exactly like masking with `>=`.
        trust = np.minimum(column_means, max_t_w_value)
        tau_w = -np.log(1 - trust)

        ## printing iteration number and the trustworthiness score
        if verbose:
            print(iteration, trust_prev)
        if iteration > 5 and np.nansum(np.abs(trust - trust_prev)) < min_abs_difference:
            break

    t_w_df = pd.DataFrame(trust_history, columns=source_cols, dtype=float)

    # Aligned on the index, so only the ids of the retained rows are attached.
    train_data_confidence[id_colname] = data[id_colname]

    return (t_w_df, train_data_confidence)


## Function to get the final confidence values from the source trustworthiness values 
def get_final_confidence(data, list_of_cols, column_to_check_confidence, t_w, id_colname, gamma=1):
    """
    Score one column against all sources using fixed trustworthiness values.

    For every row, each source in `list_of_cols` that reports the same value as
    `column_to_check_confidence` adds its weight tau = -log(1 - t_w), each one
    that reports a different value subtracts it, and missing source values
    contribute nothing. The confidence is sigmoid(gamma * weighted_sum).

    Arguments:
        data : the table serving as input for the confidence algo. A
            'final_confidence' column is added to this same object (in place).
        list_of_cols : list of column names that serve as unique sources for the data points
        column_to_check_confidence : column for which final confidence needs to be calculated
        t_w : trustworthiness score for each source (final values from the
            iterations), one per entry of `list_of_cols` and in the same order.
            Read positionally; the caller's object is never modified.
        id_colname : column with unique id values for each row (not used by the
            computation; kept for interface compatibility)
        gamma : multiplier applied to the weighted vote sum inside the sigmoid.
            Defaults to 1, which reproduces the un-scaled formula.

    Returned values:
        data : the input table with the added 'final_confidence' column
            (NaN where `column_to_check_confidence` is missing)

    Raises:
        ValueError : if `t_w` has fewer entries than `list_of_cols`
    """
    source_cols = list(list_of_cols)
    n_cols = len(source_cols)
    train_data = data[source_cols]

    # Private float copy so the NaN back-fill below never leaks to the caller.
    trust = np.array(t_w, dtype=float)
    if trust.shape[0] < n_cols:
        raise ValueError("t_w must provide one value per column in list_of_cols")
    trust = trust[:n_cols]

    # The first two sources borrow each other's trustworthiness when one of
    # them has none (NaN).
    if n_cols >= 2:
        if np.isnan(trust[0]):
            trust[0] = trust[1]
        if np.isnan(trust[1]):
            trust[1] = trust[0]

    ## calculating (1-t(w)). Carrying out calculation required for the equation
    tau_w = -np.log(1 - trust)

    current_source = data[column_to_check_confidence]

    # +1 where a source agrees with the checked column, -1 where it differs,
    # 0 where the source has no value.
    agreement = np.zeros(train_data.shape, dtype=float)
    for col_pos, col_name in enumerate(source_cols):
        agreement[:, col_pos] = np.where(train_data[col_name] == current_source, 1.0, -1.0)
    agreement[train_data.isna().to_numpy()] = 0.0

    # nansum: sources whose weight is still NaN are skipped.
    weighted_votes = np.nansum(agreement * tau_w, axis=1)

    final_conf_scores = np.where(pd.isnull(current_source), np.nan,
                                 1 / (1 + np.exp(-1 * gamma * weighted_votes)))

    data['final_confidence'] = final_conf_scores

    return (data)

In [None]:
sql_

In [5]:
fields_to_tune

['plot_no', 'khata_no']

In [9]:
pd.read_csv(sql_repo_csv_location)

Unnamed: 0,SrNo,Parent Label,Field Name,SQL Code,Comments,Multiple_confidence_columns,Columns_list,gamma
0,1,Demographic Details,Farmer Occupation,SELECT \n`t_farmer_occupation`.`int_krushk_id`...,Multiple occupation value for same adhar in KO,1,"field,self,mpas,ppas,bhulekh",1
1,2,Demographic Details,Type of Agricultural Labourer,"SELECT DISTINCT `int_krushk_id`,\nCASE WHEN `l...",,0,"field,self",1
2,3,Demographic Details,Type of Crop Cultivator,SELECT DISTINCT t_farmer_cultivator_type.int_...,,1,"field,self,mpas,ppas",1
3,4,Demographic Details,Activities,"SELECT t_farmer_activities.`int_krushk_id`,\nt...",,1,"field,self,mpas,ppas",1
4,5,Demographic Details,Application Number,"SELECT `int_krushk_id`,`vch_aadharno`,`vch_app...",,0,vch_application_no,1
5,6,Demographic Details,Aadhaar No,"SELECT DISTINCT `int_krushk_id`,`vch_aadharno`...",,0,vch_aadharno,1
6,7,Demographic Details,Farmer Name (As per Aadhaar),"SELECT DISTINCT `int_krushk_id`,`vch_farmer_na...",,0,vch_farmer_name,1
7,8,Demographic Details,Gender,"SELECT `int_krushk_id`,\nCASE WHEN `updated_at...",,1,"field,self",1
8,9,Demographic Details,Day of Birth,"SELECT DISTINCT `int_krushk_id`,\nCASE WHEN `u...",,0,"field,Aadhar",1
9,10,Demographic Details,Month of Birth,"SELECT `int_krushk_id`,\nCASE WHEN `updated_at...",,0,"field,Aadhar",1


In [18]:
max_t_w_value =  0.975
train_data =  data[list_of_cols].copy()
train_data = train_data.loc[np.sum(~train_data.isna(),axis = 1) > 3,:]

In [26]:
data = data[['id','krushk_id','field','self','mpas','ppas','bhulekh' ]]

In [27]:
data = data.loc[np.sum(~data.isna(),axis = 1) > 5,:]

In [24]:
data.drop(columns =['aadhar_no'], axis =1 )

Unnamed: 0,krushk_id,field,self,mpas,ppas,bhulekh,id
0,5316600,,1,,,,0
1,5316607,,2,,,,1
2,5316621,,2,,,,2
3,5316631,,1,,,1.0,3
4,5316631,,2,,,1.0,4
...,...,...,...,...,...,...,...
166382,203275,,1,,,,166382
166383,203275,,2,,,,166383
166384,7118355,,1,,,,166384
166385,7118355,,2,,,,166385


In [19]:
train_data

Unnamed: 0,field,self,mpas,ppas,bhulekh
27779,,1,1.0,1.0,1.0
27780,,2,1.0,1.0,1.0
67096,,1,1.0,1.0,1.0
84112,,1,1.0,1.0,1.0
84113,,1,1.0,1.0,1.0
86871,,1,1.0,1.0,1.0
86872,,1,1.0,1.0,1.0
86873,,1,1.0,1.0,1.0
86874,,1,1.0,1.0,1.0
86875,,1,1.0,1.0,1.0


In [4]:
codes_df = pd.read_csv("D:/Samagra/KO/test_conf/sql_code_repo.csv")
table_no = 0
codes_df_run = codes_df.loc[codes_df.Multiple_confidence_columns == 1,: ]
table_name_str  = codes_df.loc[table_no,'Field Name']
table_name_str

'Farmer Occupation'

In [5]:
columns = codes_df.loc[table_no,'Columns_list']
list_of_cols = np.array(columns.split (","))
#data =data.rename(columns ={'Unnamed: 0':'id'})
data['id']= data.index

In [115]:
#list_of_cols= list_of_cols[0:3]
#list_of_cols=['KO', 'ama_krushi', 'mpas', 'ppas']
list_of_cols

array(['field', 'self', 'mpas', 'ppas', 'bhulekh'], dtype='<U7')

In [116]:
no_cols =  len(list_of_cols)
t_w = np.repeat(0.5,no_cols)
id_colname = 'id'

In [124]:
gamma = 0.1

In [125]:
t_w_df,train_data_confidence = carry_out_iterations( data,list_of_cols,t_w,id_colname, gamma)

0 [0.64866292 0.64866292 0.68221822 0.975      0.76453908]
1 [0.52536559        nan 0.52894998 0.61473098 0.53893649]
2 [0.51821675        nan 0.51737305 0.53739416 0.51985314]
3 [0.51785746        nan 0.51662578 0.53217417 0.51867307]
4 [0.51783966        nan 0.51657737 0.53185444 0.51860137]
5 [0.51783879        nan 0.51657422 0.53183499 0.51859702]
6 [0.51783875        nan 0.51657401 0.53183381 0.51859675]


In [126]:
gamma_list =  np.append([0.01, 0.05, 0.08], np.array(list(range(1,11))) )/10

In [127]:
data_original = data.copy()

In [None]:
# Sweep over the candidate gamma values.
# The combined column does not depend on gamma, so it is built once.
data['Krushak_Odisha'] = data.field.combine_first(data.self)
column_to_check_confidence = 'Krushak_Odisha'
#column_to_check_confidence = 'KO'

# Every gamma must start from the same initial trustworthiness; snapshot it
# before the loop because `t_w` is reassigned below.
initial_t_w = np.array(t_w, dtype=float)

# Trustworthiness history per gamma, so the sweep can be compared afterwards.
t_w_df_by_gamma = {}
for gamma in gamma_list:
    t_w_df, train_data_confidence = carry_out_iterations(data, list_of_cols, initial_t_w.copy(), id_colname, gamma)
    t_w_df_by_gamma[gamma] = t_w_df
    # Final trustworthiness row for this gamma; after the loop `t_w`, `t_w_df`
    # and `train_data_confidence` hold the results of the last gamma.
    t_w = t_w_df.iloc[len(t_w_df)-1,:]

In [128]:
t_w_df.field[len(t_w_df.field)-1]

0.5178387478815145

In [129]:
train_data_confidence

Unnamed: 0,field,self,mpas,ppas,bhulekh,id
69,0.536448,,,,0.536448,69
71,0.536448,,,,0.536448,71
72,0.536448,,,,0.536448,72
73,0.499961,,,,0.500039,73
75,0.499961,,,,0.500039,75
...,...,...,...,...,...,...
2590351,0.536344,,0.536344,,,2590351
2590367,0.536344,,0.536344,,,2590367
2590414,0.536344,,0.536344,,,2590414
2590874,0.536344,,0.536344,,,2590874


In [137]:
data['Krushak_Odisha'] = data.field.combine_first(data.self)

In [138]:
data.Krushak_Odisha.isna().sum()

0

In [139]:
#data['Krushak_Odisha'] = data.field.combine_first(data.self)
column_to_check_confidence = 'Krushak_Odisha'
#column_to_check_confidence = 'KO'
t_w = t_w_df.iloc[len(t_w_df)-1,:]

In [140]:
conf_data =  get_final_confidence(data,list_of_cols, column_to_check_confidence,t_w ,id_colname)

In [133]:
print('shape: ',conf_data.shape)
print('% of data points not na', np.sum(~conf_data['final_confidence'].isna())/conf_data.shape[0])

shape:  (2591612, 14)
% of data points not na 1.0


In [141]:
conf_data.final_confidence.mean()

0.6635669534837265

In [30]:
conf_data

Unnamed: 0,krushk_id,aadhar_no,tehsil_no,village,khata_no,plot_no,field,self,bhulekh,mpas,ppas,id,Krushak_Odisha,final_confidence
0,1,596426739074,2.0,243.0,159,6908,159,,,,,0,159,0.668252
1,1,596426739074,2.0,363.0,195/6,204,195/6,,,,,1,195/6,0.668252
2,1625232,571673805637,,,875,960,875,,,,,2,875,0.668252
3,1625232,571673805637,,,875,1162,875,,,,,3,875,0.668252
4,1625232,571673805637,,,875,3598,875,,,,,4,875,0.668252
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5766749,7111029,931735721570,7.0,53.0,39,426,39,,,,,5766749,39,0.668252
5766750,1061854,491372971771,6.0,155.0,243,371,243,,,,,5766750,243,0.668252
5766751,7112388,764948116054,5.0,331.0,734,3994,734,,,,,5766751,734,0.668252
5766752,2119357,996287034481,2.0,318.0,152,1537,152,,,,152,5766752,152,0.802310


In [135]:
# Harmonise column names: the farmer id becomes 'int_krushk_id' and a 'KO'
# column, when present, becomes 'Krushak_Odisha'.
column_renames = {'krushk_id': 'int_krushk_id', 'KO': 'Krushak_Odisha'}
conf_data = conf_data.rename(columns=column_renames)

In [136]:
conf_data['index']= conf_data.index
final_conf_value_table = conf_data[['index','int_krushk_id','Krushak_Odisha','final_confidence']]

### Sample results:

In [2]:
# Build the (farmer id, mobile number) lookup from both sample extracts and
# attach the farmer id to the ground-truthing results.
sample_farmers2 = pd.read_csv("D:/Samagra/KO/Joining confidence/sample/All_sample_data_joined1.csv")
sample_farmers1 = pd.read_csv("D:/Samagra/KO/Joining confidence/sample/sample_farmers_ground_truthing_all_joined_v1_sample2.csv")
sample_farmers2 = sample_farmers2[['int_krushk_id','Primary Mobile No']]
sample_farmers1 = sample_farmers1[['int_krushk_id','Primary Mobile No']]
sampled_farmers = pd.concat([sample_farmers1, sample_farmers2], axis=0, ignore_index=True)
# Keep the first 10 characters of the number rendered as text.
sampled_farmers['Primary Mobile No'] = sampled_farmers['Primary Mobile No'].astype('str').str[0:10]
# De-duplicate only AFTER normalising and concatenating: pairs that occur in
# both extracts, or that differ only in how the number was rendered, would
# otherwise survive and multiply rows in the left merge below.
sampled_farmers = sampled_farmers.drop_duplicates().reset_index(drop=True)
results = pd.read_csv('D:/Samagra/KO/Joining confidence/sample/KO_conf_results2.csv').rename(columns={'Mobile Number': 'Primary Mobile No'})
# NOTE(review): the merge joins on every column the two frames share; the key in
# `sampled_farmers` is text, so confirm 'Primary Mobile No' in `results` is text too.
results = pd.merge(results, sampled_farmers, how='left')



  sample_farmers2 = pd.read_csv("D:/Samagra/KO/Joining confidence/sample/All_sample_data_joined1.csv")


In [4]:
results.to_csv('D:/Samagra/KO/Joining confidence/sample/KO_conf_results_final.csv', index = False )

In [12]:
#conf_directory = 'D:/Samagra/KO/final_confidence/'
matching_table =  pd.read_csv('D:/Samagra/KO/Confidence algo/confidence matching1.csv')

In [13]:
matching_table

Unnamed: 0,field_names,Conf_tables,sample_column,sample_Is_complete,Multi_type,Is_present
0,Activities3,conf_scoreActivities3.csv,Farmer Activity,Is Correct?.1,prod,1.0
1,"Block,NAC,ULB20","conf_scoreBlock,NAC,ULB20.csv",Block,Is Correct?.7,single,1.0
2,District19,conf_scoreDistrict19.csv,District,Is Correct?.6,single,1.0
3,District31,conf_scoreDistrict31.csv,Land District,Is Correct?.10,single,1.0
4,Farmer Area under Cultivation (Acres)40,conf_scoreFarmer Area under Cultivation (Acres...,Area under Cultivation (Acres),Is Correct?.17,match,0.0
5,Farmer Occupation0,conf_scoreFarmer Occupation0.csv,,,,0.0
6,Farmer Type17,conf_scoreFarmer Type17.csv,,,,0.0
7,"Gram Panchayat,Ward21","conf_scoreGram Panchayat,Ward21.csv",GP/Ward,Is Correct?.8,single,1.0
8,kharif_crops3,conf_scorekharif_crops3.csv,Kharif Crops,Correct Kharif Crops,multi,1.0
9,Khata No35,conf_scoreKhata No35.csv,Khata #,Is Correct?.14,single,1.0


In [52]:
# Keep only the fields that are present, are of the 'single' type and have an
# is-correct column named in the sample file; renumber the rows from 0.
is_present = matching_table.Is_present == 1
is_single_type = matching_table.Multi_type == 'single'
has_is_correct_col = ~pd.isna(matching_table.sample_Is_complete)
matching_table_single = matching_table.loc[is_present & is_single_type & has_is_correct_col, :].reset_index(drop=True)

In [106]:
matching_table_single
i = 5

In [107]:
# Build the calibration table for the i-th 'single' field: for each distinct
# confidence value, how many sampled records were marked correct.
table_name = matching_table_single['Conf_tables'][i]
sample_file_colname = matching_table_single['sample_column'][i]
sample_file_iscorrect_col = matching_table_single['sample_Is_complete'][i]
conf_table = conf_data
if sample_file_colname != 'Caste Category':
    # one mean confidence per (farmer, value); the value column takes the
    # sample file's column name so the merge below can join on it
    conf_table = conf_table.groupby(['int_krushk_id','Krushak_Odisha']).agg({'final_confidence':'mean'}).reset_index().rename(columns={'Krushak_Odisha': sample_file_colname})
else:
    conf_table = conf_table.groupby(['int_krushk_id']).agg({'final_confidence':'mean'}).reset_index()

# .copy(): take an independent frame before assigning into it, otherwise the
# assignment below targets a slice of `results` (SettingWithCopyWarning).
results_col = results[['int_krushk_id', sample_file_colname, sample_file_iscorrect_col]].copy()
results_col[sample_file_colname] = results_col[sample_file_colname].str.lower()
# NOTE(review): only the sample side is lower-cased; confirm the confidence
# table's values are already lower-case, or mixed-case values will not join.
df = pd.merge(conf_table, results_col, how='inner').rename(columns={sample_file_iscorrect_col: 'IsCorrect'})
df = df.dropna()
# NOTE(review): grouping on the raw float confidence keeps values that differ
# only in the last digits as separate rows.
df = df.groupby('final_confidence').agg(no_match=('IsCorrect','sum'), no_total=('IsCorrect','count')).reset_index()
df['perc_correct'] = df.no_match/df.no_total
df['frequency'] = df['no_total']/df['no_total'].sum()
df['field'] = sample_file_colname

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  results_col[sample_file_colname]= results_col[sample_file_colname].str.lower()


In [60]:
df.loc[df.no_total > 40 ,:]

Unnamed: 0,final_confidence,no_match,no_total,perc_correct,frequency,field
0,0.668252,479.0,2466,0.194242,0.489383,Khata #
1,0.668252,83.0,394,0.21066,0.07819,Khata #
12,0.690591,6.0,43,0.139535,0.008533,Khata #
34,0.735268,11.0,66,0.166667,0.013098,Khata #
35,0.735281,14.0,52,0.269231,0.01032,Khata #
68,0.802284,73.0,752,0.097074,0.149236,Khata #
72,0.80231,41.0,277,0.148014,0.054971,Khata #


In [108]:
all_df_cal = df

In [109]:
# Summarise calibration per field.
# NOTE(review): `all_df_cal` is bound to `df` by a plain assignment in the previous
# cell, so the four column assignments below also add these columns to `df`.
# 'MAPE' is the absolute gap between confidence and observed fraction correct,
# weighted by each confidence value's share of records (not a percentage error).
all_df_cal['MAPE'] = np.abs(all_df_cal['final_confidence'] - all_df_cal['perc_correct'] )*all_df_cal['frequency']
# Signed gap; computed here but not part of the aggregation below.
all_df_cal['Mean_diff'] = all_df_cal['final_confidence'] - all_df_cal['perc_correct']
# Frequency-weighted terms; summing them per field yields the weighted averages.
all_df_cal['Average_correct']  = all_df_cal['perc_correct'] * all_df_cal['frequency']
all_df_cal['Average_confidence']  = all_df_cal['final_confidence'] * all_df_cal['frequency']
# Rebinds `all_df_cal` to one row per field with the summed weighted terms.
all_df_cal = all_df_cal.groupby('field').agg({'MAPE':'sum', 'Average_confidence':'sum','Average_correct':'sum'}).reset_index()

In [110]:
all_df_cal

Unnamed: 0,field,MAPE,Average_confidence,Average_correct
0,Plot #,0.688204,0.716605,0.028401


In [47]:
gamma_str = '_gamma_'

In [729]:
#final_conf_value_table.to_csv('D:/Samagra/KO/final_confidence/conf_score' + 'kharif_crops'+str(table_no)+'.csv', encoding = "utf-8",index = False)
final_conf_value_table.to_csv('D:/Samagra/KO/final_confidence/conf_score' + gamma_str + table_name_str+str(table_no)+'.csv', encoding = "utf-8",index = False)

In [44]:
table_no

35

In [45]:
table_name_str

'Khata No'

In [9]:
def get_final_confidence(data, list_of_cols, column_to_check_confidence, t_w, id_colname, gamma=1):
    """
    Score one column against all sources using fixed trustworthiness values.

    For every row, each source in `list_of_cols` that reports the same value as
    `column_to_check_confidence` adds its weight tau = -log(1 - t_w), each one
    that reports a different value subtracts it, and missing source values
    contribute nothing. The confidence is sigmoid(gamma * weighted_sum).

    Arguments:
        data : the table serving as input for the confidence algo. A
            'final_confidence' column is added to this same object (in place).
        list_of_cols : list of column names that serve as unique sources for the data points
        column_to_check_confidence : column for which final confidence needs to be calculated
        t_w : trustworthiness score for each source (final values from the
            iterations), one per entry of `list_of_cols` and in the same order.
            Read positionally; the caller's object is never modified.
        id_colname : column with unique id values for each row (not used by the
            computation; kept for interface compatibility)
        gamma : multiplier applied to the weighted vote sum inside the sigmoid.
            Defaults to 1, which reproduces the un-scaled formula.

    Returned values:
        data : the input table with the added 'final_confidence' column
            (NaN where `column_to_check_confidence` is missing)

    Raises:
        ValueError : if `t_w` has fewer entries than `list_of_cols`
    """
    source_cols = list(list_of_cols)
    n_cols = len(source_cols)
    train_data = data[source_cols]

    # Private float copy so the NaN back-fill below never leaks to the caller.
    trust = np.array(t_w, dtype=float)
    if trust.shape[0] < n_cols:
        raise ValueError("t_w must provide one value per column in list_of_cols")
    trust = trust[:n_cols]

    # The first two sources borrow each other's trustworthiness when one of
    # them has none (NaN).
    if n_cols >= 2:
        if np.isnan(trust[0]):
            trust[0] = trust[1]
        if np.isnan(trust[1]):
            trust[1] = trust[0]

    ## calculating (1-t(w)). Carrying out calculation required for the equation
    tau_w = -np.log(1 - trust)

    current_source = data[column_to_check_confidence]

    # +1 where a source agrees with the checked column, -1 where it differs,
    # 0 where the source has no value.
    agreement = np.zeros(train_data.shape, dtype=float)
    for col_pos, col_name in enumerate(source_cols):
        agreement[:, col_pos] = np.where(train_data[col_name] == current_source, 1.0, -1.0)
    agreement[train_data.isna().to_numpy()] = 0.0

    # nansum: sources whose weight is still NaN are skipped.
    weighted_votes = np.nansum(agreement * tau_w, axis=1)

    final_conf_scores = np.where(pd.isnull(current_source), np.nan,
                                 1 / (1 + np.exp(-1 * gamma * weighted_votes)))

    data['final_confidence'] = final_conf_scores

    return (data)

In [12]:
gamma = 1

In [13]:
def carry_out_iterations(data, list_of_cols, t_w, id_colname, gamma,
                         max_t_w_value=0.975, max_iterations=100,
                         min_abs_difference=0.001, verbose=True):
    """
    Iteratively estimate a trustworthiness score per source and a confidence
    score per (row, source) value.

    Every column in `list_of_cols` is one source. For a given row and source,
    each other source that reports the same value adds its weight
    tau = -log(1 - t_w), each one that reports a different value subtracts it,
    missing values contribute nothing, and the source itself always adds its
    own weight. The confidence is sigmoid(gamma * weighted_sum). After every
    pass the trustworthiness of a source becomes the mean confidence of its
    non-missing values, capped at `max_t_w_value`.

    Arguments:
        data : the table serving as input for the confidence algo
        list_of_cols : list of column names that serve as unique sources for the data points
        t_w : initial confidence values (source trustworthiness), one per entry of
            `list_of_cols` and in the same order. Usually 0.5 for each source.
            Read positionally, so a numpy array, list or pandas Series all work;
            the caller's object is never modified.
        id_colname : column name with unique id value for each row
        gamma : multiplier applied to the weighted vote sum inside the sigmoid
        max_t_w_value : upper cap for the updated trustworthiness (default 0.975)
        max_iterations : maximum number of passes (default 100)
        min_abs_difference : stop once the summed absolute change in
            trustworthiness is below this value; only checked from the 7th
            pass onwards (default 0.001)
        verbose : print the pass number and the trustworthiness used in it

    Returned values:
        t_w_df : one row per pass holding the trustworthiness that was USED in
            that pass (row 0 is the initial `t_w`; the update computed in the
            final pass is not recorded)
        train_data_confidence : confidence per source for the rows that have
            more than one non-missing source value, plus the `id_colname` column

    Raises:
        ValueError : if `t_w` has fewer entries than `list_of_cols`
    """
    source_cols = list(list_of_cols)
    n_cols = len(source_cols)

    # Only rows where at least two sources report a value can be cross-checked.
    train_data = data[source_cols]
    train_data = train_data.loc[train_data.notna().sum(axis=1) > 1, :]
    is_missing = train_data.isna().to_numpy()

    # Private float copy, indexed by position (never by Series label).
    trust = np.array(t_w, dtype=float)
    if trust.shape[0] < n_cols:
        raise ValueError("t_w must provide one value per column in list_of_cols")
    trust = trust[:n_cols]
    tau_w = -np.log(1 - trust)

    # The agreement signs (+1 same value, -1 different value, 0 missing) depend
    # only on the data, not on the current weights, so they are built once.
    agreement_by_col = []
    for col_pos, col_name in enumerate(source_cols):
        current_source = train_data[col_name]
        agreement = np.empty(is_missing.shape, dtype=np.int8)
        for other_pos, other_name in enumerate(source_cols):
            if other_pos == col_pos:
                agreement[:, other_pos] = 1
            else:
                agreement[:, other_pos] = np.where(train_data[other_name] == current_source, 1, -1)
        agreement[is_missing] = 0
        agreement_by_col.append(agreement)

    train_data_confidence = pd.DataFrame(0.0, index=train_data.index, columns=source_cols)
    trust_history = []

    for iteration in range(int(max_iterations)):

        for col_pos, col_name in enumerate(source_cols):
            # nansum: a source whose weight is NaN (it has no values in the
            # retained rows) is skipped instead of poisoning the whole row.
            weighted_votes = np.nansum(agreement_by_col[col_pos] * tau_w, axis=1)
            confidence = 1 / (1 + np.exp(-1 * gamma * weighted_votes))
            confidence[is_missing[:, col_pos]] = np.nan
            train_data_confidence[col_name] = confidence

        ## maintaining record of the trustworthiness scores of the sources
        trust_prev = trust
        trust_history.append(trust_prev.copy())
        column_means = np.array(train_data_confidence[source_cols].mean(), dtype=float)
        # np.minimum keeps NaN as NaN, exactly like masking with `>=`.
        trust = np.minimum(column_means, max_t_w_value)
        tau_w = -np.log(1 - trust)

        ## printing iteration number and the trustworthiness score
        if verbose:
            print(iteration, trust_prev)
        if iteration > 5 and np.nansum(np.abs(trust - trust_prev)) < min_abs_difference:
            break

    t_w_df = pd.DataFrame(trust_history, columns=source_cols, dtype=float)

    # Aligned on the index, so only the ids of the retained rows are attached.
    train_data_confidence[id_colname] = data[id_colname]

    return (t_w_df, train_data_confidence)

In [36]:
train_data =  data[list_of_cols].copy()
    
#train_data = train_data.loc[np.sum(~train_data.isna(),axis = 1) > 1,:]
## creating empty data frame with same structure as traindata to copy confidence scores 


In [37]:
train_data1 = train_data.loc[np.sum(~train_data.isna(),axis = 1) > 1,:]

In [39]:
train_data1.shape

(1691767, 5)

In [548]:
tau_w

field         NaN
self     0.235837
mpas     3.688879
ppas     3.688879
Name: 6, dtype: float64

In [544]:
max_t_w_value =  0.975
train_data =  data[list_of_cols].copy()

train_data = train_data.loc[np.sum(~train_data.isna(),axis = 1) > 1,:]
## creating empty data frame with same structure as traindata to copy confidence scores 

train_data_confidence =  train_data.copy()
train_data_confidence.loc[:,:]= 0

## calculating (1-t(w)). Carrying out calculation required for the equation
t_w_inv =  1- t_w
tau_w =  -np.log(t_w_inv)

## creating dataframe that maintains list of confidence values through each iteration
confidence_iterations = pd.DataFrame(columns =train_data.columns.tolist() + ['iteration'])
t_w_df = pd.DataFrame(columns = train_data.columns)

In [546]:
for col_name in list_of_cols:
    column_matching_df=  train_data_confidence.copy()
    column_matching_df.loc[:,:]= 0
    current_source =  train_data[col_name]

    other_sources_cols = [x for x in list_of_cols if x != current_source.name]

    column_matching_df[col_name] = 1
    for col_name_others in other_sources_cols:
        column_matching_df[col_name_others] = np.where(train_data[col_name_others]==current_source,1,-1)
    column_matching_df[pd.isnull(train_data)]=0

    for col_ii in range(0,column_matching_df.shape[1]):
        column_matching_df.iloc[:,col_ii] = column_matching_df.iloc[:,col_ii] * tau_w[col_ii]

    train_data_confidence[col_name]= np.where(pd.isnull(current_source),np.nan,1/(1 + np.exp(-column_matching_df.sum(axis=1))))
