<a href="https://colab.research.google.com/github/ExCaLBBR/ExCaLBBR_Projects/blob/main/SocioenvironmentalGeometry/code/ConceptGeo_AnalysisNotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Concept Geometry analysis pipeline 
Created by: <b>Roberto Vargas </b><br>
Adapted from Octave by: <b>Nahom Mossazghi </b><br>
<br>
<b>Pipeline includes:</b><br>
*   Data restructuring: Sorting accoring to word list
*   Regression predicting pair differences


<br>
<br>

In [1]:
#@title Install dependancies
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
from scipy import stats 
from itertools import combinations
import math
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
import csv

In [2]:
#@title Define utility functions
def weightedHeatmap(PairData, words, PlotHM):
        
    '''
    Restructure RT into matrix data structure
    
    '''
    
    Wmat = np.zeros((len(words),len(words))) 
    t = 0
    z = 0
    
    for i in range(len(words)):
        for j in range(len(words)):
            
            if i == j:
                Wmat[i,j] = np.NaN
            
            elif j > i:
                Wmat[i,j] = PairData[t]
                Wmat[j,i] = PairData[t]            
                t += 1  
                
                
    if PlotHM == 1:
        plt.imshow(Wmat, cmap='RdBu')
        plt.colorbar()
        plt.xticks(range(len(words)), words, rotation='vertical')
        plt.yticks(range(len(words)), words)
        plt.show()
        
    return Wmat
        
def ccbi_randperm(ntimes, nperm):
    
    '''
      p = ccbi_randperm(nitems,nperm)
      Parameters: number of items, number of random permutations
      Output: a matrix with nperm rows;
      Each row is an index of permuted item positions.
    
      returns a matrix (n,nitems)
      each row is a random permutation of nitems (labelled 1:nitems)
      produces n such permutations
      the random seed is changed at every call
      
    '''
    
    p = np.zeros((nperm, ntimes))        
    for i in range(nperm):
        p[i,:] = np.random.permutation(ntimes)
        
    return p
                 

def splitHalf_Reliability(dat, perm):
    """
    Compute the reliability within a measure
    This analysis splits the data into 2 halfs and then averages the similarity structure
    This analysis is repeated
    
    """
    pSplit = ccbi_randperm(dat.shape[1], perm)

    rho = []
    for p in range(perm):
        # Split data
        if dat.shape[1] % 2 == 0:  # is even
            frstHalf = dat.iloc[:, pSplit[p, :(pSplit.shape[1]//2)]]
            scndHalf = dat.iloc[:, pSplit[p, (pSplit.shape[1]//2):]]

        elif dat.shape[1] % 2 == 1:  # is odd
            frstHalf = dat.iloc[:, pSplit[p, :int(np.floor(pSplit.shape[1]/2))]]
            scndHalf = dat.iloc[:, pSplit[p, int(np.ceil(pSplit.shape[1]/2)):]]
        # Avg dist of each half
        avgFrstHalf = frstHalf.mean(axis=1)
        avgScndHalf = scndHalf.mean(axis=1)

        # Correlate halves
        rhoI = np.corrcoef(avgFrstHalf, avgScndHalf)[0,1]

        rho.append(rhoI)
    
    rho = np.mean(rho)

    return rho
        


def regPairDiff(dumX, cov, Y, perm):
    
    '''
    
    Predict pair differences among a binary category in Y
 	dumX is a dummy code variable being used to predict Y
 	cov is a matrix of covariates included in the model
 	Y is a continuous vector 
 	perm is the number of permutations used to compare against the observed beta
    Dimensions of duX, cov, and Y should all align
    
    '''    
    
    # Generate the permutations
    pComb = ccbi_randperm(len(dumX),perm)
    
    # Generate constant
    # con = np.ones(len(dumX),1)
    
    # Estimate observed beta
    dumX =  dumX.reshape((dumX.shape[0],1))
    xModel = np.concatenate((dumX, cov), axis=1)
    betaObs = LinearRegression().fit(Y, xModel)
    betaObs = betaObs.coef_[1]
    
    betaPerm = []
    
    for p in range(perm):
        pCombi = pComb.astype(int)
        xPermModel = np.concatenate((dumX[pComb[p,:]], cov), axis=1)
        bPerm = LinearRegression().fit(Y,xPermModel)
        betaPerm.append(bPerm.coef_[1])
        
    if betaObs > 0:
        
        nBbeyond = len(np.where((betaPerm>betaObs)))
        pval = nBbeyond/perm
 	    
        
    elif betaObs < 0:
 	    nBbeyond = len(np.where(betaPerm<betaObs));
 	    pval = nBbeyond/perm
        
    else:
        raise ValueError('observed beta is exactly equal to 0')
        
    return betaObs, pval



In [3]:
#Data path
path = 'https://github.com/ExCaLBBR/ExCaLBBR_Projects/raw/main/SocioenvironmentalGeometry/data/'

# Load Demographic PRaM data
datDemo = pd.read_csv (path + 'demographic.csv', header=1)
datPRaM = pd.read_csv (path + 'df_taskPRaM.csv', header=0)
pairLab = pd.read_csv (path + 'PRaM_pairLabels.csv', header=None)

#remove misc row
datDemo = datDemo.drop(labels=0, axis=0)

#Sort pairs accoring to prefered combination
words = ['police', 'firefighter', 'neighbors(yours)', 'conservatives(political)', 'liberals(political)', 'healthcare', 'voting', 'immigration', 'religion', 'science', 'anger', 'fear', 'joy', 'love', 'sadness', 'trust']
combinations_list = [list(c) for c in combinations(words, 2)]


In [4]:
# re-organize the dataframe according to the word combination
wPairLabel = []
x = []                  #index orders are saved as list
for i in range(len(combinations_list)):
    
    # Extract pair rating
    idx0 = pairLab[pairLab.iloc[:,1].str.contains(combinations_list[i][0].replace('(', r'\(').replace(')', r'\)'))].index
    idx1 = pairLab[pairLab.iloc[:,2].str.contains(combinations_list[i][1].replace('(', r'\(').replace(')', r'\)'))].index
    pIndx = idx0.intersection(idx1)
    
    # Extract pair label
    wPairLabeli = combinations_list[i][0] + '-' + combinations_list[i][1]
    wPairLabel.append(wPairLabeli)
    x.append(pIndx[0])

# Pair label rating    
datPRaM_conSort = datPRaM.iloc[:,x]   
datPRaM_conSort = datPRaM_conSort.rename(columns=dict(zip(datPRaM_conSort.columns, wPairLabel)))    
wPairLabel = pd.DataFrame(wPairLabel)    

In [5]:
# Convert to distance matrix
Wmat = np.zeros((len(words), len(words)))    

t = 0
z = 0
for i in range(len(words)):
    for j in range(len(words)):   
        if i == j:
            Wmat[i,j] = np.NaN
        elif j > i:
            Wmat[i,j] = datPRaM_conSort.iloc[t,0]
            Wmat[j,i] = datPRaM_conSort.iloc[t,0]
        
            t += 1

In [41]:
datDemo.drop(labels=0, axis=0)

Unnamed: 0,Question Key,ConnectID,EngFluent,Income,WorkID,age,famIncome,gender,maritalStatus,maritalStatus-text,...,occupation,parentDegree,parentMarriage,parentMarriage-text,partDegree,raceEth_v2,response-3,response-4,zipborn,zipnow
1,7814298.0,,Yes,$12501-$22500,,23.0,$56000-$78000,female,Single,,...,Tester,AA,Never married,,AA,White,43040,3,43040,43000.0
2,7814299.0,,Yes,$42501-$80000,,25.0,$56000-$78000,Female,Single,,...,sales associate,High school diploma,Married,,High School diploma,White,21501,3,21501,21501.0
3,7814300.0,,Yes,$12501-$22500,,40.0,$56000-$78000,female,Married,,...,Administrative assistant,GED/Some college,Divorced,,high school diploma,White,37214,4,37086,37086.0
4,7814301.0,,Yes,Less than $12500,,38.0,$56000-$78000,female,Single,,...,admin,diploma,Never married,,bachelors,White,11706,4,11706,11706.0
5,7814302.0,,Yes,Less than $12500,,31.0,More than $78000,female,Single,,...,data entry,masters,Married,,bachelors,White,20854,6,20854,97202.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,8526167.0,7078BB7021844B8F80161C1E64B7DFFB,Yes,$42501-$80000,,32.0,Don't know,Male,Married,,...,Store manager,Bachelor degree,Married,,Masters' degree,Black or African American,11230,5,11230,11230.0
583,8526297.0,77303F1E22AB41A8959AB6A6BA9B0D9B,Yes,$42501-$80000,,32.0,Don't know,Male,Married,,...,Sales Consultant,Bachelor degree,Married,,Masters degree,Black or African American,92831,5,92831,92831.0
584,8526356.0,8D07D910DC7749449209F48195EFBE01,Yes,$42501-$80000,,28.0,Don't know,Man,Married,,...,Inventory Manager,Bachelor degree,Married,,Masters degree,Black or African American,34953,6,34953,34953.0
585,8526623.0,2D8B4BE984C34895A1F5142BB295CD23,Yes,$42501-$80000,,31.0,Don't know,male,Married,,...,Sales Representative,bachelor degree,Married,,masters degree,Black or African American,33411,6,33411,33411.0


In [31]:
#Indx racial groups 
bIndx = datDemo['raceEth_v2'].str.contains('Black or African American', na=True)
wIndx = datDemo['raceEth_v2'].str.contains('White', na=True)

bPRaM = datPRaM_conSort[bIndx]
wPRaM = datPRaM_conSort[wIndx]

  bPRaM = datPRaM_conSort[bIndx]
  wPRaM = datPRaM_conSort[wIndx]


In [28]:
bIndx


0       True
1      False
2      False
3      False
4      False
       ...  
582     True
583     True
584     True
585     True
586     True
Name: raceEth_v2, Length: 587, dtype: bool

In [21]:
#Indx racial groups 
raceLab = datDemo['raceEth_v2'].str.contains('Black or African American').index
bIndx =  raceLab[raceLab.iloc[:,0].str.contains('Black or African American')].index
wIndx =  raceLab[raceLab.iloc[:,0].str.contains('White')].index 
    
bPRaM = datPRaM_conSort.iloc[bIndx,:]
wPRaM = datPRaM_conSort.iloc[wIndx,:]

TypeError: ignored

In [15]:
Wmat[0]

array([nan,  2.,  7., -4.,  7.,  6., -4.,  5.,  4.,  5.,  4.,  3.,  7.,
        6.,  4.,  6.])