# Dorm Study: SNS2 Clean-up
## Karina Lopez
## 08/14/2020
## This script will clean the second-wave social network survey data

In [None]:
import pandas as pd
import numpy as np
import glob
import os
import csv

# Pandas display options: print every column and row, never truncate cell
# contents, and show floats with 20 digits of precision.
pd.set_option('display.max_columns', None)  # no column limit
pd.set_option('display.max_rows', None)  # no row limit
# None is the supported "no truncation" value; the legacy -1 sentinel is
# deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.precision',20)

# Root folder of the project; every os.chdir() below is relative to it.
BASE_DIR = "/Users/karina/Box Sync/Cleaningup_SNS2/"

## Helper functions

In [None]:
# Drops the rows of a dataframe whose id is flagged in a filter dataset
# (string ids plus a boolean True column)
def filter_this(filter_df, to_filter_df, rem_col_name='remove'):
    """Return ``to_filter_df`` without the rows flagged in ``filter_df``.

    Both frames are outer-merged on ``id``. Every missing value in the merged
    frame (in *all* columns, not only the flag column) is then replaced by 0,
    rows whose ``rem_col_name`` value equals False/0 are kept, and the flag
    column is removed from the result.
    """
    merged = to_filter_df.merge(filter_df, how='outer', on='id').fillna(value=0)

    # ids absent from filter_df now carry 0 in the flag column, i.e. "keep"
    keep_mask = merged[rem_col_name] == False

    return merged.loc[keep_mask].drop(columns=[rem_col_name])

In [None]:
# function to list the egos that have at least one empty cell

def where_empty(df, p_df = False, p_list = False):
    """Return the ``ego`` value of every row of ``df`` containing a null.

    p_df   -- when true, print the rows that contain a null
    p_list -- when true, print the resulting list of egos
    """
    incomplete_rows = df[df.isnull().any(axis=1)]
    missing_egos = incomplete_rows["ego"].values.tolist()

    if p_df:
        print(incomplete_rows)

    if p_list:
        print(missing_egos)

    return missing_egos

In [None]:
def edgelist_creator(network_wing_df):
    """Reshape a wide name-generator table into a long ego/alter edge list.

    The first six columns of ``network_wing_df`` are treated as respondent
    information (only ``ego`` is used); every remaining column holds the
    names entered for one friend slot.

    Returns a dataframe with the columns ``ego``, ``alter`` and
    ``connection``. ``connection`` is the first 22 characters of the friend
    column the name came from. Rows are ordered friend column by friend
    column, keeping the original row order inside each column. Rows in which
    any cell is missing or equal to the string "0" are dropped, and ``alter``
    is lowercased and stripped of surrounding whitespace.
    """
    output_columns = ['ego', 'alter', 'connection']

    # everything after the six respondent columns is a friend column
    friend_columns = list(network_wing_df.columns)[6:]

    # nothing to reshape: return an empty edge list instead of failing later
    if not friend_columns:
        return pd.DataFrame(columns=output_columns)

    # one small ego/alter/connection frame per friend column
    per_column_edges = [
        pd.DataFrame({
            'ego': network_wing_df['ego'],
            'alter': network_wing_df[friend_column],
            'connection': str(friend_column)[0:22],
        })
        for friend_column in friend_columns
    ]

    # pd.concat replaces the removed DataFrame.append; object dtype keeps the
    # cell values untouched so the "0" comparison below behaves as before
    edges = pd.concat(per_column_edges, ignore_index=True).astype(object)

    # drop every row with a missing cell or a "0" placeholder
    edges = edges[edges != str(0)].dropna()

    # reset the index
    edges = edges.reset_index(drop = True)

    # lowercase the alter names and strip surrounding whitespace
    edges['alter'] = edges['alter'].str.lower()
    edges['alter'] = edges['alter'].str.strip()

    return edges


In [None]:
# returns a dataframe containing ego, alter, connection, dorm_wing_x, id, alter_id
def _alter_id_from_name(name):
    """Return "<first name> <first letter of the rest>" for one alter name.

    Gives NaN when ``name`` is not a string or has nothing after the first
    space, so such alters end up without an alter_id.
    """
    if not isinstance(name, str):
        return np.nan
    first, _, rest = name.partition(" ")
    if not rest:
        return np.nan
    return first + " " + rest[0]


def alter_anonymizer_df(df):
    """Build an anonymised ``alter_id`` column for an edge list.

    Keeps only the columns ``ego``, ``alter``, ``connection``,
    ``dorm_wing_x`` and ``id`` of ``df`` (on a copy, the input is not
    modified) and adds ``alter_id``: the text before the first space of
    ``alter`` plus the first character of what follows it. Alters made of a
    single word get a missing ``alter_id``; a frame in which no alter
    contains a space no longer raises a KeyError.
    """
    df = df[['ego', 'alter', 'connection', 'dorm_wing_x', 'id']].copy()

    #create new alter id (first name last name initial)
    df['alter_id'] = df['alter'].map(_alter_id_from_name)

    return df

In [None]:
# builds a dataframe with ego, connection, alter and alter_id from a raw name column

def alter_df_func(df, name_col = 'name', alter_col = 'alter', alter_id_col = 'alter_id', connection_col = 'connection', drop_extra = True):
    """Split a raw name column into alter name, alter id and connection type.

    The name is lowercased (this is written back into ``df[name_col]``) and
    split on its first space. The last four characters of the remainder
    decide the connection: '(2s)'/'(2n)' -> 'dorm', '-2n)'/'-2s)' ->
    'outside'. The last name is the remainder without its final 5 (dorm) or
    9 (outside) characters. With ``drop_extra`` the helper columns and the
    copy of ``name_col`` are removed from the returned frame.
    """
    suffix_to_connection = {'-2n)': 'outside', '-2s)': 'outside', '(2s)': 'dorm', '(2n)':'dorm'}

    # copied before the lowercasing below, so it keeps the original casing
    result = df[['ego', name_col]].copy()

    # lowercase the names in the caller's frame
    df[name_col] = df[name_col].str.lower()

    # first token vs. everything after the first space
    pieces = df[name_col].str.split(' ', n = 1, expand = True)
    first_part = pieces[0]
    remainder = pieces[1]

    result['alter_first'] = first_part

    # connection type from the 4-character suffix
    result[connection_col] = remainder.str[-4:].map(suffix_to_connection)

    # last name = remainder minus the trailing wing marker
    is_dorm = result[connection_col] == 'dorm'
    is_outside = result[connection_col] == 'outside'
    result.loc[is_dorm, 'alter_last'] = remainder.str[:-5]
    result.loc[is_outside, 'alter_last'] = remainder.str[:-9]

    # full alter name and anonymised id (first name + last initial)
    result[alter_col] = result["alter_first"] + " " + result["alter_last"]
    result[alter_id_col] = result["alter_first"] + " " + result["alter_last"].str[0]

    if not drop_extra:
        return result

    return result.drop(columns=['alter_first', 'alter_last', name_col])

In [None]:
def rename_strings(df, current = False):
    """Replace long survey question texts by short labels.

    Rows with a missing ``question`` are dropped and the relabelling is done
    on a copy, so ``df`` itself is left untouched. Each rule is a regular
    expression searched for in ``question``; matching rows get the rule's
    label. The rules are applied in order. With ``current`` true, the extra
    rules for the communication-channel questions of the current-friends
    dataset are applied as well.
    """
    df = df.dropna(subset = ['question']).copy()

    # nothing left to relabel (also avoids .str on an empty non-string column)
    if df.empty:
        return df

    # raw strings: the patterns are regexes and some contain an escaped "("
    rules = [
        (r'Pre-COVID, how often', 'preCOV_interact'),
        (r'During-COVID, how of', 'durCOV_interact'),
        (r'Pre-COVID, how close', 'preCOV_closeness'),
        (r'During-COVID, how cl', 'durCOV_closeness'),
        (r'How far away did you', 'preCOV_prox'),
        (r'How far away do you ', 'durCOV_prox'),
        (r'How long have you kn', 'friendship_len'),
        (r'During-COVID \(since ', 'durCOV_interact'),
    ]

    # Only if it's the current friend dataframe
    if current:
        rules += [
            (r'... 1-on-1 video cha', 'ind_vid'),
            (r'... 1-on-1 text mess', 'ind_txt'),
            (r'... 1-on-1 in-person', 'ind_person'),
            (r'... group conference', 'group_vid'),
            (r'... group text messa', 'group_txt'),
            (r'... in-person \(face-', 'group_person'),
            (r'... other ways in a ', 'group_other'),
            (r'... other ways \(spec', 'ind_other'),
        ]

    for pattern, label in rules:
        # na=False: non-string questions never match instead of breaking the mask
        matches = df['question'].str.contains(pattern, na=False)
        df.loc[matches, 'question'] = label

    return df

In [None]:
def create_alter_boolean_df(stacked_ego_net_df, 
                            NG1_df, NG2_df, NG3_df, NG4_df, NG5_df, 
                            dorm_wing):    
    """Flag, for one dorm wing, which name generators each ego/alter pair is in.

    NOTE: a later cell of this notebook defines a function with the same name
    (with an extra ``alter_col`` parameter); when the notebook is run top to
    bottom that later definition replaces this one.

    stacked_ego_net_df -- frame with at least ``ego``, ``alter``, ``dorm_wing``
    NG1_df .. NG5_df   -- name-generator edge lists with ``ego``, ``alter``
                          and ``dorm_wing``
    dorm_wing          -- only rows with this ``dorm_wing`` value are used

    Returns a copy of the rows of ``stacked_ego_net_df`` for ``dorm_wing``
    (original index kept, the input is not modified) with ten added columns
    holding 1.0 or NaN:
      NG<k>_comp  -- 1.0 when the row's ego appears in name generator k
      alter_NG<k> -- 1.0 when the row's (ego, alter) pair appears in
                     name generator k
    All ten columns are always present, even when nothing matches.
    """
    #filter by dorm wing; use the frame that was passed in (not a notebook
    #global) and copy it so the flag columns are not written to a slice
    wing_net_df = stacked_ego_net_df[stacked_ego_net_df.dorm_wing == dorm_wing].copy()
    wing_generators = [ng_df[ng_df.dorm_wing == dorm_wing]
                       for ng_df in (NG1_df, NG2_df, NG3_df, NG4_df, NG5_df)]

    # did the ego enter anything in each name generator?
    for number, ng_df in enumerate(wing_generators, start=1):
        answered = wing_net_df['ego'].isin(ng_df['ego'].dropna())
        wing_net_df['NG{}_comp'.format(number)] = np.where(answered, 1.0, np.nan)

    # did the ego name this particular alter in each name generator?
    for number, ng_df in enumerate(wing_generators, start=1):
        named_pairs = set(zip(ng_df['ego'], ng_df['alter']))
        nominated = [bool(pd.notna(alter) and (ego, alter) in named_pairs)
                     for ego, alter in zip(wing_net_df['ego'], wing_net_df['alter'])]
        wing_net_df['alter_NG{}'.format(number)] = np.where(nominated, 1.0, np.nan)

    return wing_net_df

In [None]:
def create_alter_boolean_df(stacked_ego_net_df, 
                            NG1_df, NG2_df, NG3_df, NG4_df, NG5_df, 
                            dorm_wing, alter_col = 'alter_id'):    
    """Flag, for one dorm wing, which name generators each ego/alter pair is in.

    stacked_ego_net_df -- frame with at least ``ego``, ``alter_col`` and
                          ``dorm_wing``
    NG1_df .. NG5_df   -- name-generator edge lists with ``ego``,
                          ``alter_col`` and ``dorm_wing``
    dorm_wing          -- only rows with this ``dorm_wing`` value are used
    alter_col          -- column identifying the alter in every frame

    Returns a copy of the rows of ``stacked_ego_net_df`` for ``dorm_wing``
    (original index kept, the input is not modified) with ten added columns
    holding 1.0 or NaN:
      NG<k>_comp  -- 1.0 when the row's ego appears in name generator k
      alter_NG<k> -- 1.0 when the row's (ego, alter) pair appears in
                     name generator k
    All ten columns are always present, even when nothing matches.
    """
    #filter by dorm wing; use the frame that was passed in (not a notebook
    #global) and copy it so the flag columns are not written to a slice
    wing_net_df = stacked_ego_net_df[stacked_ego_net_df.dorm_wing == dorm_wing].copy()
    wing_generators = [ng_df[ng_df.dorm_wing == dorm_wing]
                       for ng_df in (NG1_df, NG2_df, NG3_df, NG4_df, NG5_df)]

    # did the ego enter anything in each name generator?
    for number, ng_df in enumerate(wing_generators, start=1):
        answered = wing_net_df['ego'].isin(ng_df['ego'].dropna())
        wing_net_df['NG{}_comp'.format(number)] = np.where(answered, 1.0, np.nan)

    # did the ego name this particular alter in each name generator?
    for number, ng_df in enumerate(wing_generators, start=1):
        named_pairs = set(zip(ng_df['ego'], ng_df[alter_col]))
        nominated = [bool(pd.notna(alter) and (ego, alter) in named_pairs)
                     for ego, alter in zip(wing_net_df['ego'], wing_net_df[alter_col])]
        wing_net_df['alter_NG{}'.format(number)] = np.where(nominated, 1.0, np.nan)

    return wing_net_df

## Step 1: General Clean-up

In [None]:
# LOAD YOUR CSV FILES
# Reads every Step 1 input from BASE_DIR/working_directory/. Note that
# os.chdir changes the working directory for all later relative paths.

# Change directory to where your CSVs are located
os.chdir(BASE_DIR + "working_directory/")

#raw data csv
all_SNS2_df = pd.read_csv("2020may_data_wide.csv")
personal_info_df = pd.read_csv("2020may_info.csv")
friends_current_df = pd.read_csv("friends_current_long.csv")
friends_past_df = pd.read_csv("friends_past_long.csv")
thrd_party_df = pd.read_csv("friendship_long.csv")
likert_df = pd.read_csv("likert_long.csv")

#filter csv (passed to filter_this below: needs an 'id' column and a 'remove' flag column)
filter_df = pd.read_csv("filter_id.csv")

#ID dataframe (merged on 'id' with every dataset below)
id_df = pd.read_csv("id_names.csv")

#column renamer (maps likert question texts to final column names)
column_names_df = pd.read_csv("column_names.csv")

In [None]:
# ENTER FILE NAME OUTPUTS
# Names of the Step 1 CSVs; they are written to BASE_DIR/step1/ at the end of
# Step 1 and some are read back by name in Step 4.
sns2_filename = 'S1_all_SNS2_dataset.csv'
friends_current_filename = 'S1_all_friends_current.csv'
friends_past_filename = 'S1_all_friends_past.csv'
thrd_party_filename = 'S1_all_ego_network.csv'
likert_filename = 'S1_all_likert.csv'


In [None]:
# Remove the flagged ids from every dataset with filter_this and print each
# frame's shape before and after. filter_this also replaces every missing
# value in the merged frame by 0.

#filter personal dataframe
print("PERSONAL")
print(personal_info_df.shape)
personal_info_df = filter_this(filter_df, personal_info_df)
print(personal_info_df.shape)

print("ALL DATA")
#filter entire dataset
print(all_SNS2_df.shape)
all_SNS2_df = filter_this(filter_df, all_SNS2_df)
print(all_SNS2_df.shape)

print("CURRENT FRIENDS")
#filter current friends
print(friends_current_df.shape)
friends_current_df = filter_this(filter_df, friends_current_df)
print(friends_current_df.shape)

#filter past friends
print("PAST FRIENDS")
print(friends_past_df.shape)
friends_past_df = filter_this(filter_df, friends_past_df)
print(friends_past_df.shape)

#filter third party
print("THIRD PARTY")
print(thrd_party_df.shape)
thrd_party_df = filter_this(filter_df, thrd_party_df)
print(thrd_party_df.shape)

#filter likert dataset
print("LIKERT DATASET")
print(likert_df.shape)
likert_df = filter_this(filter_df, likert_df)
print(likert_df.shape)

#filter id dataset
print("ID DATASET")
print(id_df.shape)
id_df = filter_this(filter_df, id_df)
print(id_df.shape)

In [None]:
# combine personal dataframe w/ SNS2
# (outer merge on 'id': ids present in only one of the two frames are kept)
SNS2_df = pd.merge(personal_info_df, 
         all_SNS2_df, 
         how='outer', on='id')

In [None]:
# combine each response w/ their anonymous ids
# Every dataset is outer-merged with id_df on 'id', so ids that appear in
# id_df but not in a dataset become rows with missing values there. The
# shape is printed before and after each merge.
print("ALL DATA")
print(SNS2_df.shape)
SNS2_df = pd.merge(id_df, 
         SNS2_df, 
         how='outer', on='id')
print(SNS2_df.shape)

print("FRIENDS CURRENT")
print(friends_current_df.shape)
friends_current_df = pd.merge(id_df, 
         friends_current_df, 
         how='outer', on='id')
print(friends_current_df.shape)
      
print("FRIENDS PAST")
print(friends_past_df.shape)
friends_past_df = pd.merge(id_df, 
         friends_past_df, 
         how='outer', on='id')
print(friends_past_df.shape)
      
print("THIRD PARTY")
print(thrd_party_df.shape)
thrd_party_df = pd.merge(id_df, 
         thrd_party_df, 
         how='outer', on='id')
print(thrd_party_df.shape)
      
print("LIKERT DATASET")
print(likert_df.shape)
likert_df = pd.merge(id_df, 
         likert_df, 
         how='outer', on='id')
print(likert_df.shape)
      

In [None]:
# For each long dataset, print and store the egos that have at least one
# empty cell after the merge with id_df (where_empty with p_list=True).
print("friends current")
missing_friends_current = where_empty(friends_current_df, False, True)
print('\n')

print("friends past")
missing_friends_past = where_empty(friends_past_df, False, True)
print('\n')

print("likert")
missing_likert = where_empty(likert_df, False, True)
print('\n')

print("third party")
missing_third_party = where_empty(thrd_party_df, False, True)
print('\n')

In [None]:
# Replace each likert question text by its final column name.
# (the "likert_q_equivalent" column of column_names_df is an alternative
# source for the question texts; "question_4_that_column" is the one in use)
question_list = column_names_df["question_4_that_column"].values.tolist()
column_list = column_names_df["final_column_names"].values.tolist()

# apply the replacements one at a time, in the order of column_names_df
for question_text, final_name in zip(question_list, column_list):
    is_this_question = likert_df['question'] == question_text
    likert_df['question'] = np.where(is_this_question, final_name, likert_df['question'])
    


In [None]:
# Create your CSVs
# Written to BASE_DIR/step1/ without the dataframe index.
os.chdir(BASE_DIR + "step1/")

SNS2_df.to_csv(sns2_filename, index = False)
friends_current_df.to_csv(friends_current_filename, index = False)
friends_past_df.to_csv(friends_past_filename, index = False)
thrd_party_df.to_csv(thrd_party_filename, index = False)
likert_df.to_csv(likert_filename, index = False)

# Step 2: Create edge lists

#### Within Excel: from the S1_all_SNS2_dataset.csv, I copied the first 6 columns and each NG section to its own csv

In [None]:
# LOAD YOUR FILES
# One CSV per name generator, read from BASE_DIR/step1/. edgelist_creator
# expects six respondent columns followed by the friend columns.
os.chdir(BASE_DIR + "step1/")

social_current_df = pd.read_csv("current_social.csv")
virtual_current_df = pd.read_csv("current_virtual.csv")
social_media_df = pd.read_csv("social_media.csv")
aspire_df = pd.read_csv("aspire.csv")
social_precovid_df = pd.read_csv("precovid_social.csv")

In [None]:
# ENTER FILE NAME OUTPUTS
# Names of the edge-list CSVs written to BASE_DIR/step2/ and read back in Step 3.
social_current_filename = 'S2_edgelist_social_current.csv'
virtual_current_filename = 'S2_edgelist_virtual_current.csv'
social_media_filename = 'S2_edgelist_social_media.csv'
aspire_filename = 'S2_edgelist_aspire.csv'
social_precovid_filename = 'S2_edgelist_social_precovid.csv'

In [None]:
# Respondent information taken from the wide aspire table. This overwrites
# the id_df loaded in Step 1 and must run before aspire_df is turned into an
# edge list below (which no longer has these columns).
id_df = aspire_df[['fmri_wave1', 'ego', 'dorm_wing_x', 'firstname_x', 'lastname_x', 'id']].copy()

In [None]:
# Update the type of connection type for each edge list
# edgelist_creator stores the first 22 characters of the source column name
# in 'connection'; those prefixes are mapped here to 'dorm' / 'outside'.
social_current_df = edgelist_creator(social_current_df)
social_current_df['connection'] = np.where(social_current_df['connection'] == '2.current_q.0.names_in', 'dorm', social_current_df['connection'])
social_current_df['connection'] = np.where(social_current_df['connection'] == '2.current_q.0.names_ou', 'outside', social_current_df['connection'])
print(social_current_df.head())

virtual_current_df = edgelist_creator(virtual_current_df)
virtual_current_df['connection'] = np.where(virtual_current_df['connection'] == '2.current_q.1.names_in', 'dorm', virtual_current_df['connection'])
virtual_current_df['connection'] = np.where(virtual_current_df['connection'] == '2.current_q.1.names_ou', 'outside', virtual_current_df['connection'])
print(virtual_current_df.head())

social_media_df = edgelist_creator(social_media_df)
social_media_df['connection'] = np.where(social_media_df['connection'] == '2.initial.0.names_in_d', 'dorm', social_media_df['connection'])
social_media_df['connection'] = np.where(social_media_df['connection'] == '2.initial.0.names_outs', 'outside', social_media_df['connection'])
print(social_media_df.head())

aspire_df = edgelist_creator(aspire_df)
aspire_df['connection'] = np.where(aspire_df['connection'] == '2.initial.1.names_in_d', 'dorm', aspire_df['connection'])
aspire_df['connection'] = np.where(aspire_df['connection'] == '2.initial.1.names_outs', 'outside', aspire_df['connection'])
print(aspire_df.head())

social_precovid_df = edgelist_creator(social_precovid_df)
social_precovid_df['connection'] = np.where(social_precovid_df['connection'] == '2.past_q.0.names_in_do', 'dorm', social_precovid_df['connection'])
social_precovid_df['connection'] = np.where(social_precovid_df['connection'] == '2.past_q.0.names_outsi', 'outside', social_precovid_df['connection'])
print(social_precovid_df.head())


In [None]:
# Merge with id dataframe and identify people that did not complete that dataframe
# The merge is an outer merge on 'ego', so an ego without any edge in a name
# generator shows up as a row with empty alter/connection cells; where_empty
# prints and stores those egos. First and last names are lowercased.
social_current_df = pd.merge(social_current_df, 
            id_df, 
            how='outer', on='ego')
social_current_df['firstname_x'] = social_current_df['firstname_x'].str.lower()
social_current_df['lastname_x'] = social_current_df['lastname_x'].str.lower()
print("SOCIAL CURRENT")
missing_social_current = where_empty(social_current_df, False, True)
print('\n')


social_precovid_df = pd.merge(social_precovid_df, 
            id_df, 
            how='outer', on='ego')
social_precovid_df['firstname_x'] = social_precovid_df['firstname_x'].str.lower()
social_precovid_df['lastname_x'] = social_precovid_df['lastname_x'].str.lower()
print("SOCIAL PRECOVID")
missing_social_precovid = where_empty(social_precovid_df, False, True)
print('\n')


aspire_df = pd.merge(aspire_df, 
            id_df, 
            how='outer', on='ego')
aspire_df['firstname_x'] = aspire_df['firstname_x'].str.lower()
aspire_df['lastname_x'] = aspire_df['lastname_x'].str.lower()
print("ASPIRE")
missing_aspire = where_empty(aspire_df, False, True)
print('\n')

social_media_df = pd.merge(social_media_df, 
            id_df, 
            how='outer', on='ego')
social_media_df['firstname_x'] = social_media_df['firstname_x'].str.lower()
social_media_df['lastname_x'] = social_media_df['lastname_x'].str.lower()
print("SOCIAL MEDIA")
missing_social_media = where_empty(social_media_df, False, True)
print('\n')

virtual_current_df = pd.merge(virtual_current_df, 
            id_df, 
            how='outer', on='ego')
virtual_current_df['firstname_x'] = virtual_current_df['firstname_x'].str.lower()
virtual_current_df['lastname_x'] = virtual_current_df['lastname_x'].str.lower()
print("VIRTUAL")
# check the virtual edge list itself (not the social media one); the
# variable keeps its original spelling in case other cells refer to it
missing_vurtual = where_empty(virtual_current_df, False, True)
print('\n')

In [None]:
# Remove empty rows
# i.e. rows without an alter, such as the placeholder rows the outer merge
# with id_df created for egos that have no edge in a name generator
social_current_df = social_current_df.dropna(subset=['alter'])
social_precovid_df = social_precovid_df.dropna(subset=['alter'])
aspire_df = aspire_df.dropna(subset=['alter'])

social_media_df = social_media_df.dropna(subset=['alter'])
virtual_current_df = virtual_current_df.dropna(subset=['alter'])

In [None]:
# Create your CSVs
# One edge-list CSV per name generator, written to BASE_DIR/step2/ without the index.
os.chdir(BASE_DIR + "step2/")

social_current_df.to_csv(social_current_filename, index = False)
social_precovid_df.to_csv(social_precovid_filename, index = False)
aspire_df.to_csv(aspire_filename, index = False)
social_media_df.to_csv(social_media_filename, index = False)
virtual_current_df.to_csv(virtual_current_filename, index = False)

## Step 3: Create nickname and misspelling dictionaries
### Create lists of all unique names ever entered in each name generator

In [None]:
# LOAD YOUR FILES
# Edge lists written in Step 2 (read back from BASE_DIR/step2/)
os.chdir(BASE_DIR + "step2/")

social_current_df = pd.read_csv(social_current_filename)
virtual_current_df = pd.read_csv(virtual_current_filename)
social_media_df = pd.read_csv(social_media_filename)
aspire_df = pd.read_csv(aspire_filename)
social_precovid_df = pd.read_csv(social_precovid_filename)

# file for all dataset containing nicknames (Step 1 output, in BASE_DIR/step1/)
os.chdir(BASE_DIR + "step1/")
all_SNS2_df = pd.read_csv('S1_all_SNS2_dataset.csv')

In [None]:
# ENTER FILENAMES
# Outputs of Step 3, written to BASE_DIR/step3/
all_alters_filename = 'S3_alters_list.csv'
nickname_filename = 'S3_nicknames_list.csv'

In [None]:
# Create dataframe containing all unique alter spellings
# alter_anonymizer_df keeps ego, alter, connection, dorm_wing_x and id and
# adds alter_id (first name + last initial) to each edge list.
social_current_df = alter_anonymizer_df(social_current_df)
social_precovid_df = alter_anonymizer_df(social_precovid_df)
aspire_df = alter_anonymizer_df(aspire_df)
social_media_df = alter_anonymizer_df(social_media_df)
virtual_current_df = alter_anonymizer_df(virtual_current_df)

In [None]:
#join all frames together
frames = [social_current_df, social_precovid_df, aspire_df, social_media_df, virtual_current_df]
all_alters_df = pd.concat(frames)

#remove all duplicates
# drop_duplicates returns a new dataframe, so the result has to be assigned
# for the de-duplicated list to reach the CSV written later
all_alters_df = all_alters_df.drop_duplicates()

# display the de-duplicated alters as the cell output
all_alters_df


In [None]:
# Create nickname dictionary
# '0.0.other_name' is the column used as the nickname source.
nickname_df = all_SNS2_df[['ego', 'lastname_x', '0.0.other_name', 'fmri_wave1', 'dorm_wing_x', 'id']].copy()

#remove whitespaces (only for object/string columns)
nickname_df = nickname_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

#drop people who don't have a nickname
# NOTE(review): this drops a row when ANY of the selected columns is missing
# or equal to the string "0", not only the nickname column - confirm intended
nickname_df = nickname_df[nickname_df != str(0)].dropna()
nickname_df = nickname_df.reset_index(drop=True)

#lowercase nicknames and last names
nickname_df['lastname'] = nickname_df['lastname_x'].str.lower()
nickname_df['nickname_1'] = nickname_df['0.0.other_name'].str.lower()

#drop old columns
nickname_df = nickname_df.drop(columns=['0.0.other_name', 'lastname_x'])

#split by ", " and create new column for second nicknames
# n = 1: at most two parts; anything after the first ", " stays in nickname_2
# NOTE(review): new_data[1] only exists when at least one nickname contains ", "
new_data = nickname_df["nickname_1"].str.split(", ", n = 1, expand = True)
nickname_df["nickname_1"] = new_data[0]
nickname_df["nickname_2"] = new_data[1]


In [None]:
# Create CSV files for each dataset
# Written to BASE_DIR/step3/ without the index.
os.chdir(BASE_DIR + "step3/")

nickname_df.to_csv(nickname_filename, index = False)
all_alters_df.to_csv(all_alters_filename, index = False)

## Step 4: Format long datasets
##### S1_all_ego_network: 
create new columns of alter_1, alter_2, alter_1_connection, alter_2_connection (requires string splitting)

##### S1_all_likert: 
pivot the dataframe so that each question becomes its own column, then rename by position the columns whose names contain malformed strings

##### S1_all_friends_current: 
create new columns of alter, alter_connection, question > transpose so that columns made for each question based on the start of the string 
##### S1_all_friends_past: 
create new columns of alter, alter_connection, question > transpose so that columns made for each question based on the start of the string 

In [None]:
# LOAD YOUR FILES
# Long-format Step 1 outputs, read back from BASE_DIR/step1/. The ego
# network is loaded under the name third_party_df from here on.
os.chdir(BASE_DIR + "step1/")

friends_current_df = pd.read_csv(friends_current_filename)
friends_past_df = pd.read_csv(friends_past_filename)
third_party_df = pd.read_csv(thrd_party_filename)
likert_df = pd.read_csv(likert_filename)

In [None]:
# ENTER FILE NAME OUTPUTS
# Switches the working directory to BASE_DIR/step4/ and names the Step 4 CSVs.
os.chdir(BASE_DIR + "step4/")

s4_sns2_filename = 'S4_all_SNS2_dataset.csv'
s4_friends_current_filename = 'S4_all_friends_current.csv'
s4_friends_past_filename = 'S4_all_friends_past.csv'
s4_thrd_party_filename = 'S4_all_ego_network.csv'
s4_likert_filename = 'S4_all_likert.csv'
s4_specification_filename = 'S4_specification_all_friends_current.csv'

#### Likert dataset pivot

In [None]:
# Pivot your likert dataframe
# One row per respondent (ego, dorm_wing, fmri_wave1, id) and one column per
# question, holding that respondent's response.
likert_df = likert_df.pivot(index = ['ego', 'dorm_wing', 'fmri_wave1', 'id'], columns = 'question', values='response')
likert_df = likert_df.reset_index(drop = False)


# Update column names that have weird strings
# The rename is positional: columns 0-3 are the index columns restored by
# reset_index, columns 4-15 are question columns.
# NOTE(review): this relies on the order of the question columns after the
# pivot - verify the mapping if the set of questions changes.
column_names = likert_df.columns

likert_df = likert_df.rename({column_names[4]: 'open_business', column_names[5]: 'skip_class', 
    column_names[6]: 'sleep_imp', column_names[7]: 'dining_hall', column_names[8]: 'friends_disagree', 
    column_names[9]: 'hungover_normal', column_names[10]: 'caffeine_normal', column_names[11]: 'alc_blackout',
    column_names[12]: 'sleep_normal', column_names[13]: 'all_night_normal', column_names[14]: 'vape_use_all',
    column_names[15]: 'vape_use_nicotine'}, axis=1)

#### create new alter columns and remove unnecessary string characters
friends_current_df, friends_past_df and ego networks

In [None]:
# friends_current_df
# Derive connection, alter and alter_id from the 'name' column.
# alter_df_func also lowercases friends_current_df['name'] in place.
alter_df = alter_df_func(friends_current_df)

# copied back column by column (aligned on the shared index)
friends_current_df['connection'] = alter_df['connection']
friends_current_df['alter'] = alter_df['alter']
friends_current_df['alter_id'] = alter_df['alter_id']


In [None]:
# friends_past_df
# Derive connection, alter and alter_id from the 'name' column.
# alter_df_func also lowercases friends_past_df['name'] in place.
alter_df = alter_df_func(friends_past_df)

# copied back column by column (aligned on the shared index)
friends_past_df['connection'] = alter_df['connection']
friends_past_df['alter'] = alter_df['alter']
friends_past_df['alter_id'] = alter_df['alter_id']


In [None]:
# do the same for each alter column in your ego network dataset
# first alter of each pair: parsed from 'name1'
alter_df = alter_df_func(third_party_df, 'name1', 'alter1', 'alter1_id', 'alter1_connection')

third_party_df['alter1_connection'] = alter_df['alter1_connection']
third_party_df['alter1'] = alter_df['alter1']
third_party_df['alter1_id'] = alter_df['alter1_id']


In [None]:
# second alter of each pair in the ego network: parsed from 'name2'
alter_df = alter_df_func(third_party_df, 'name2', 'alter2', 'alter2_id', 'alter2_connection')

third_party_df['alter2_connection'] = alter_df['alter2_connection']
third_party_df['alter2'] = alter_df['alter2']
third_party_df['alter2_id'] = alter_df['alter2_id']


#### remove unnecessary columns in ego network dataset

In [None]:
# remove ego network columns: 'name1', 'name2', 'firstname', 'lastname', 'timestamp'
# (the raw name columns are replaced by the alter columns created above)
third_party_df = third_party_df.drop(columns = ['name1', 'name2', 'firstname', 'lastname', 'timestamp'])

# remove current and past columns: 'name', 'firstname', 'lastname', 'timestamp', 'response_text'
friends_past_df = friends_past_df.drop(columns = ['name', 'firstname', 'lastname', 'timestamp', 'response_text'])
friends_current_df = friends_current_df.drop(columns = ['name', 'firstname', 'lastname', 'timestamp', 'response_text'])


#### rename each question for friends_past_df and friends_current_df

In [None]:
# Replace the long question texts by short labels; rows without a question
# are dropped. True adds the labels that only exist for current friends.
friends_current_df = rename_strings(friends_current_df, True)
friends_past_df = rename_strings(friends_past_df)


#### Create specification dataset

In [None]:
# Create specification csv file
# Free-text "specification" answers of the current-friends dataset, without
# empty rows, "0" placeholders and "never"/"none" style answers.
spec_columns = ['fmri_wave1', 'ego', 'dorm_wing', 'id', 'question', 'specification', 'connection', 'alter', 'alter_id']
placeholder_answers = ['never', 'none', 'None', 'None ']

specification_df = friends_current_df[spec_columns].copy()

# blank out "0" cells, then drop every row that has any empty cell
specification_df = specification_df.where(specification_df != str(0)).dropna()

# keep rows whose specification is present and is not a placeholder answer
has_text = specification_df['specification'].notnull()
is_placeholder = specification_df['specification'].isin(placeholder_answers)
specification_df = specification_df[has_text & ~is_placeholder]


#### Pivot your friends_past_df and friends_current_df

In [None]:
# Pivot your dataframes
# One row per ego/alter pair and one column per (renamed) question, holding
# the response. reset_index turns the identifying columns back into regular
# columns.
friends_current_df = friends_current_df.pivot(index = ['ego', 'dorm_wing', 'fmri_wave1', 'id', 'connection', 'alter_id', 'alter'], 
                               columns = ['question'], values = 'response')
friends_current_df = friends_current_df.reset_index(drop = False)

friends_past_df = friends_past_df.pivot(index = ['ego', 'dorm_wing', 'fmri_wave1', 'id', 'connection', 'alter_id', 'alter'], 
                               columns = ['question'], values = 'response')
friends_past_df = friends_past_df.reset_index(drop = False)


#### Create CSV files for each dataset

In [None]:
# Write the Step 4 datasets to BASE_DIR/step4/ without the index.
os.chdir(BASE_DIR + "step4/")

third_party_df.to_csv(s4_thrd_party_filename, index = False)
friends_current_df.to_csv(s4_friends_current_filename, index = False)
friends_past_df.to_csv(s4_friends_past_filename, index = False)
likert_df.to_csv(s4_likert_filename, index = False)
specification_df.to_csv(s4_specification_filename, index = False)

## Step 5: Revise alters
#### Revisions include: misspellings, nicknames, incorrect connection category
##### Following datasets are revised: 
S2_edgelist_aspire.csv, S2_edgelist_social_current.csv, S2_edgelist_social_media.csv, S2_edgelist_social_precovid.csv, S2_edgelist_virtual_current.csv, S4_all_ego_network.csv, S4_all_friends_current.csv, S4_all_friends_past.csv, S4_specification_all_friends_current.csv

In [None]:
# LOAD YOUR FILES to revise
# Each group below first changes the working directory, so the relative
# filenames given to pd.read_csv are resolved inside that folder.

# Edgelists
os.chdir(BASE_DIR + "step2/")

aspire_edgelist = pd.read_csv('S2_edgelist_aspire.csv')
social_current_edgelist = pd.read_csv('S2_edgelist_social_current.csv')
social_media_edgelist = pd.read_csv('S2_edgelist_social_media.csv')
social_precovid_edgelist = pd.read_csv('S2_edgelist_social_precovid.csv')
virtual_current_edgelist = pd.read_csv('S2_edgelist_virtual_current.csv')

# Other datasets
os.chdir(BASE_DIR + "step4/")

ego_network_df = pd.read_csv('S4_all_ego_network.csv')
friends_current_df = pd.read_csv('S4_all_friends_current.csv')
friends_past_df = pd.read_csv('S4_all_friends_past.csv')
specification_df = pd.read_csv('S4_specification_all_friends_current.csv')

# Revision csv
os.chdir(BASE_DIR + "working_directory/")

# revisions_df is later passed to revise_my_alters / revise_my_ego_netwrk_alters;
# robert_revisions is later applied to the social media edgelist.
revisions_df = pd.read_csv('revisions.csv')
robert_revisions = pd.read_csv('s002_social_media_additions.csv')


In [None]:
# Create filenames
# Output names for the step-5 CSV files; they are relative paths, so the files
# land in whatever directory is current when to_csv is called.

# edgelists
s5_aspire_edgelist_filename = 'S5_aspire_edgelist.csv'
s5_social_current_edgelist_filename = 'S5_social_current_edgelist.csv'
s5_social_media_edgelist_filename = 'S5_social_media_edgelist.csv'
s5_social_precovid_edgelist_filename = 'S5_social_precovid_edgelist.csv'
s5_virtual_current_edgelist_filename = 'S5_virtual_current_edgelist.csv'

# other datasets
s5_ego_network_filename = 'S5_all_ego_network.csv'
s5_friends_current_filename = 'S5_all_friends_current.csv'
s5_friends_past_filename = 'S5_all_friends_past.csv'
s5_specification_filename = 'S5_specification_all_friends_current.csv'


In [None]:
# add robert's alters to social media dataframe
# NOTE(review): DataFrame.update changes social_media_edgelist in place and aligns on
# index and column labels. It overwrites matching cells with the non-NA values from
# robert_revisions and does not append rows. If the additions file is meant to
# contribute new rows, confirm that this call does what is intended.
social_media_edgelist.update(robert_revisions)


In [None]:
# for each edgelist, create alter_id column, rename dorm_wing_x column and remove unnecessary columns

def _finalize_edgelist(edgelist_df):
    """Attach anonymized alter ids to an edgelist and tidy its columns.

    Parameters
    ----------
    edgelist_df : pandas.DataFrame
        One name-generator edgelist. It must contain the columns
        'dorm_wing_x', 'firstname_x' and 'lastname_x'. The 'alter_id'
        column is written onto this object in place before the tidy-up.

    Returns
    -------
    pandas.DataFrame
        A new frame with 'alter_id' added, 'dorm_wing_x' renamed to
        'dorm_wing', and the 'firstname_x' / 'lastname_x' columns dropped.
    """
    # alter_anonymizer_df returns a frame whose 'alter_id' column is copied over
    # (assignment aligns on the index).
    edgelist_df['alter_id'] = alter_anonymizer_df(edgelist_df)['alter_id']
    edgelist_df = edgelist_df.rename(columns={'dorm_wing_x': 'dorm_wing'})
    return edgelist_df.drop(columns=['firstname_x', 'lastname_x'])


aspire_edgelist = _finalize_edgelist(aspire_edgelist)
social_current_edgelist = _finalize_edgelist(social_current_edgelist)
social_media_edgelist = _finalize_edgelist(social_media_edgelist)
social_precovid_edgelist = _finalize_edgelist(social_precovid_edgelist)
virtual_current_edgelist = _finalize_edgelist(virtual_current_edgelist)

In [None]:
# revise all the alters
# Every table below is passed through revise_my_alters with the same revisions
# table; results are bound back to the same names, in the order listed.
(
    social_precovid_edgelist,
    social_current_edgelist,
    virtual_current_edgelist,
    social_media_edgelist,
    aspire_edgelist,
    specification_df,
    friends_current_df,
    friends_past_df,
) = [
    revise_my_alters(alter_table, revisions_df)
    for alter_table in (
        social_precovid_edgelist,
        social_current_edgelist,
        virtual_current_edgelist,
        social_media_edgelist,
        aspire_edgelist,
        specification_df,
        friends_current_df,
        friends_past_df,
    )
]


In [None]:
# revise alters in ego networks
# The ego-network table goes through its own helper (revise_my_ego_netwrk_alters)
# instead of revise_my_alters, using the same revisions table.
ego_network_df = revise_my_ego_netwrk_alters(ego_network_df, revisions_df)

In [None]:
# All step-5 outputs are written with relative filenames inside the step5 folder.
os.chdir(BASE_DIR + "step5/")

# (table, output filename) pairs, written in this order without the index column.
step5_outputs = [
    # edgelists
    (aspire_edgelist, s5_aspire_edgelist_filename),
    (social_current_edgelist, s5_social_current_edgelist_filename),
    (social_media_edgelist, s5_social_media_edgelist_filename),
    (social_precovid_edgelist, s5_social_precovid_edgelist_filename),
    (virtual_current_edgelist, s5_virtual_current_edgelist_filename),
    # other datasets
    (friends_current_df, s5_friends_current_filename),
    (friends_past_df, s5_friends_past_filename),
    (ego_network_df, s5_ego_network_filename),
    (specification_df, s5_specification_filename),
]
for step5_table, step5_filename in step5_outputs:
    step5_table.to_csv(step5_filename, index=False)

## Step 6: Boolean Datasets
#### Create a dataset w/ boolean values stating whether each alter was entered for each name generator and whether the ego completed that name generator
##### COLUMNS: 
ego, dorm_wing, alter, alter_id, connection, aspire, social_current, social_media, social_precovid, virtual_current


In [None]:
# LOAD YOUR Edgelists
# Reads the step-5 outputs back from disk; filenames are relative to the step5 folder.

os.chdir(BASE_DIR + "step5/")

aspire_edgelist = pd.read_csv('S5_aspire_edgelist.csv')
social_current_edgelist = pd.read_csv('S5_social_current_edgelist.csv')
social_media_edgelist = pd.read_csv('S5_social_media_edgelist.csv')
social_precovid_edgelist = pd.read_csv('S5_social_precovid_edgelist.csv')
virtual_current_edgelist = pd.read_csv('S5_virtual_current_edgelist.csv')

# NOTE(review): ego_network_df is not referenced directly by the remaining Step 6 cells;
# confirm whether a helper relies on it before removing this load.
ego_network_df = pd.read_csv('S5_all_ego_network.csv')

In [None]:
# Create filenames
# Output name for the alter-level boolean table.
# NOTE(review): the Step 6 write cell passes the literal 'S6_alter_boolean.csv'
# to to_csv rather than this variable.
S6_alter_boolean_filename = 'S6_alter_boolean.csv'

##### Begin alter_boolean dataset

In [None]:
# Create stacked edgelist containing all unique alters entered for each ego
edgelist_list = [
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
]

# Concatenate the five edgelists, then keep only the last row for each
# (dorm_wing, ego, alter_id, connection) combination.
stacked_edgelist = pd.concat(edgelist_list).drop_duplicates(
    subset=['dorm_wing', 'ego', 'alter_id', 'connection'],
    keep='last',
)


In [None]:
# Build the alter boolean table once per dorm wing. Both calls receive the stacked
# edgelist followed by the five name-generator edgelists in the same positional order.
name_generator_edgelists = (
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
)

south_alter_df = create_alter_boolean_df(stacked_edgelist, *name_generator_edgelists, 'south')

north_alter_df = create_alter_boolean_df(stacked_edgelist, *name_generator_edgelists, 'north')


In [None]:
# merge both datasets
# No join keys are given, so the outer merge matches on every column the two frames share.
alter_df = north_alter_df.merge(south_alter_df, how='outer')

In [None]:
# Keep only necessary columns and fill w/ 0s
alter_boolean_columns = [
    'ego', 'alter', 'connection', 'fmri_wave1', 'dorm_wing', 'id', 'alter_id',
    'alter_NG1', 'alter_NG2', 'alter_NG3', 'alter_NG4', 'alter_NG5',
]
# Work on a copy so the in-place fill below does not touch alter_df.
alter_boolean_df = alter_df[alter_boolean_columns].copy()
alter_boolean_df.fillna(0, inplace=True)

#### Create ego boolean dataset
This dataset reports whether an ego entered a name for each name generator

In [None]:
# Ego-level columns: identifiers plus one completion flag per name generator.
ego_boolean_columns = [
    'ego', 'fmri_wave1', 'dorm_wing', 'id',
    'NG1_comp', 'NG2_comp', 'NG3_comp', 'NG4_comp', 'NG5_comp',
]

# alter_df has one row per alter, so collapse identical rows and renumber the index.
ego_boolean_df = alter_df[ego_boolean_columns].copy().drop_duplicates(ignore_index=True)
ego_boolean_df.fillna(0, inplace=True)

#### Rename your columns

In [None]:
# Replace the NG1..NG5 column labels with the name-generator names.
ego_column_names = {
    'NG1_comp': 'aspire',
    'NG2_comp': 'social_current',
    'NG3_comp': 'social_media',
    'NG4_comp': 'social_precovid',
    'NG5_comp': 'virtual_current',
}
ego_boolean_df = ego_boolean_df.rename(columns=ego_column_names)

alter_column_names = {
    'alter_NG1': 'aspire',
    'alter_NG2': 'social_current',
    'alter_NG3': 'social_media',
    'alter_NG4': 'social_precovid',
    'alter_NG5': 'virtual_current',
}
alter_boolean_df = alter_boolean_df.rename(columns=alter_column_names)

#### Create your files

In [None]:
# Both boolean tables are written into the step6 folder without the index column.
os.chdir(BASE_DIR + "step6/")

step6_outputs = [
    (ego_boolean_df, 'S6_ego_boolean.csv'),
    (alter_boolean_df, 'S6_alter_boolean.csv'),
]
for step6_table, step6_filename in step6_outputs:
    step6_table.to_csv(step6_filename, index=False)

## Step 7: Final clean up
Check if there are any alters that might actually be different people

Average ratings that are for the same alter

In [None]:
# LOAD YOUR DATASETS
# Step 7 reads the step-5 CSV files; filenames are relative to the step5 folder.
os.chdir(BASE_DIR + "step5/")

aspire_edgelist = pd.read_csv('S5_aspire_edgelist.csv')
social_current_edgelist = pd.read_csv('S5_social_current_edgelist.csv')
social_media_edgelist = pd.read_csv('S5_social_media_edgelist.csv')
social_precovid_edgelist = pd.read_csv('S5_social_precovid_edgelist.csv')
virtual_current_edgelist = pd.read_csv('S5_virtual_current_edgelist.csv')

friends_current_df = pd.read_csv('S5_all_friends_current.csv')
friends_past_df = pd.read_csv('S5_all_friends_past.csv')

# Note the different variable name: this step uses ego_net_df, not ego_network_df.
ego_net_df = pd.read_csv('S5_all_ego_network.csv')

In [None]:
# Create filenames
# Output names for the step-7 CSV files (relative paths).

# EDGELISTS
aspire_edgelist_filename = 'S7_edgelist_aspire.csv'
social_current_filename = 'S7_edgelist_social_current.csv'
social_media_filename = 'S7_edgelist_social_media.csv'
social_precovid_filename = 'S7_edgelist_social_precovid.csv'
virtual_current_filename = 'S7_edgelist_virtual_current.csv'

# RATINGS
# NOTE(review): friendship_ratings_filename is only referenced by a commented-out
# to_csv call in the step-7 write cell.
friendship_ratings_filename = 'S7_friendship_ratings.csv'
unmerged_friends_current_filename = 'S7_friendship_current_ratings.csv'
unmerged_friends_past_filename = 'S7_friendship_past_ratings.csv'

# EGO NETWORK
ego_net_df_filename = 'S7_ego_network.csv'



##### Average ratings for alters that are the same but were entered w/ misspellings

In [None]:
# average ratings for same alters in closeness/interaction/modality ratings
# Rows sharing these key values are treated as the same ego-alter pair.
same_alter_keys = ['dorm_wing', 'ego', 'connection', 'alter_id']

# Every column from position 7 onward is replaced by its within-group mean,
# so each row in a group ends up carrying the same averaged value.
for current_rating in friends_current_df.columns[7:]:
    current_means = friends_current_df.groupby(same_alter_keys)[current_rating].transform('mean')
    friends_current_df[current_rating] = current_means

for past_rating in friends_past_df.columns[7:]:
    past_means = friends_past_df.groupby(same_alter_keys)[past_rating].transform('mean')
    friends_past_df[past_rating] = past_means


In [None]:
# One row per (dorm_wing, ego, connection, alter_id); the first occurrence is kept.
unmerged_friends_past_df = friends_past_df.drop_duplicates(
    subset=['dorm_wing', 'ego', 'connection', 'alter_id']
)



In [None]:
# One row per (dorm_wing, ego, connection, alter_id); the first occurrence is kept.
unmerged_friends_current_df = friends_current_df.drop_duplicates(
    subset=['dorm_wing', 'ego', 'connection', 'alter_id']
)


##### Delete duplicates

In [None]:
# Print the (rows, columns) shape of each table before duplicates are dropped.
# friends_current_df and friends_past_df are not reported here.
tables_before_dedup = (
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
    ego_net_df,
)
for table_before in tables_before_dedup:
    print(table_before.shape)

In [None]:
# drop duplicates in each dataset
# Edgelists: keep the last row for each ego/alter_id/connection combination per wing.
# friends_current_df and friends_past_df are not de-duplicated in this cell.
edge_dedup_keys = ['dorm_wing', 'ego', 'alter_id', 'connection']

aspire_edgelist = aspire_edgelist.drop_duplicates(subset=edge_dedup_keys, keep='last')
social_current_edgelist = social_current_edgelist.drop_duplicates(subset=edge_dedup_keys, keep='last')
social_media_edgelist = social_media_edgelist.drop_duplicates(subset=edge_dedup_keys, keep='last')
social_precovid_edgelist = social_precovid_edgelist.drop_duplicates(subset=edge_dedup_keys, keep='last')
virtual_current_edgelist = virtual_current_edgelist.drop_duplicates(subset=edge_dedup_keys, keep='last')

# Ego network: a row is identified by both alters and both connection labels.
ego_net_dedup_keys = ['dorm_wing', 'ego', 'alter1_id', 'alter2_id',
                      'alter1_connection', 'alter2_connection']
ego_net_df = ego_net_df.drop_duplicates(subset=ego_net_dedup_keys, keep='last')

In [None]:
# print out resulting shape
# A blank separator is printed first, then the shapes in the same order as before.
# friends_current_df and friends_past_df are not reported here.
print('\n')
tables_after_dedup = (
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
    ego_net_df,
)
for table_after in tables_after_dedup:
    print(table_after.shape)

##### Create your final files

In [None]:
# All step-7 outputs are written with relative filenames inside the step7 folder.
os.chdir(BASE_DIR + "step7/")

# (table, output filename) pairs, written in this order without the index column.
# No file is written under friendship_ratings_filename in this cell.
step7_outputs = [
    # EDGELISTS
    (aspire_edgelist, aspire_edgelist_filename),
    (social_current_edgelist, social_current_filename),
    (social_media_edgelist, social_media_filename),
    (social_precovid_edgelist, social_precovid_filename),
    (virtual_current_edgelist, virtual_current_filename),
    # RATINGS
    (unmerged_friends_current_df, unmerged_friends_current_filename),
    (unmerged_friends_past_df, unmerged_friends_past_filename),
    # EGO NETWORK
    (ego_net_df, ego_net_df_filename),
]
for step7_table, step7_filename in step7_outputs:
    step7_table.to_csv(step7_filename, index=False)

## Step 7b: Average the ratings in a merged dataset
Manually combine both the current and long friendship datasets from step 5 and store the result in the working_directory folder. Then run the R script "step7_friendship_ratings.R" to produce your final merged friendship ratings.

### EXTRA: Checking long friendship datasets

In [None]:
# Load the datasets
# Re-read the step-5 friendship tables from disk, replacing the in-memory versions
# that were modified during step 7.
os.chdir(BASE_DIR + "step5/")

friends_current_df = pd.read_csv('S5_all_friends_current.csv')
friends_past_df = pd.read_csv('S5_all_friends_past.csv')


In [None]:
# Merge both datasets
# Outer-join current and past ratings on the full ego/alter key; rating columns
# present in both tables get the '_current' / '_past' suffixes.
rating_merge_keys = ['ego', 'dorm_wing', 'fmri_wave1', 'id', 'connection', 'alter_id', 'alter']
new_df = friends_current_df.merge(
    friends_past_df,
    how='outer',
    on=rating_merge_keys,
    suffixes=('_current', '_past'),
)

# Flag every row whose key, ignoring alter_id, appears more than once.
conflict_keys = ['ego', 'dorm_wing', 'fmri_wave1', 'id', 'connection', 'alter']
new_df['conflict_ratings'] = new_df.duplicated(subset=conflict_keys, keep=False)

# Concatenated lookup keys: one built from the raw alter entry, one from alter_id.
ego_part = new_df['ego']
tail_part_sources = (new_df['connection'], new_df['dorm_wing'])
new_df['alter_lookup'] = ego_part + new_df['alter'] + tail_part_sources[0] + tail_part_sources[1]
new_df['alter_id_lookup'] = ego_part + new_df['alter_id'] + tail_part_sources[0] + tail_part_sources[1]


In [None]:
# Reorder the columns so their labels are in sorted order.
new_df = new_df.reindex(sorted(new_df.columns), axis=1)

In [None]:
# Write the merged comparison table into the extra folder, without the index column.
# NOTE(review): the filename says UNORDERED_COLUMNS, but the preceding cell sorts the
# columns before this write; confirm the name is still the intended one.
os.chdir(BASE_DIR + "extra/")
new_df.to_csv('merged_friendship_rating_comparisons_UNORDERED_COLUMNS.csv', index = False)

In [None]:
# Print a label, then the number of distinct lookup keys: first the alter_id-based
# key, then the raw-alter-based key.
print('merged')
print(len(np.unique(new_df['alter_id_lookup'])))
print(len(np.unique(new_df['alter_lookup'])))

### EXTRA: Create ego and alter boolean datasets based on raw entries (column "alter")

In [None]:
# LOAD YOUR Edgelists
# Reads the step-5 outputs again; filenames are relative to the step5 folder.

os.chdir(BASE_DIR + "step5/")

aspire_edgelist = pd.read_csv('S5_aspire_edgelist.csv')
social_current_edgelist = pd.read_csv('S5_social_current_edgelist.csv')
social_media_edgelist = pd.read_csv('S5_social_media_edgelist.csv')
social_precovid_edgelist = pd.read_csv('S5_social_precovid_edgelist.csv')
virtual_current_edgelist = pd.read_csv('S5_virtual_current_edgelist.csv')

# NOTE(review): ego_network_df is not referenced directly by the remaining cells of
# this section; confirm whether a helper relies on it before removing this load.
ego_network_df = pd.read_csv('S5_all_ego_network.csv')

In [None]:
# Create filenames
# Output name for the alter boolean table built from the raw "alter" entries.
extra_alter_boolean_filename = 'extra_alter_boolean.csv'


In [None]:
# Create stacked edgelist containing all unique alters entered for each ego
edgelist_list = [
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
]

# Concatenate the five edgelists, then keep only the last row for each
# (dorm_wing, ego, alter, connection) combination, keyed on the raw alter entry.
stacked_edgelist = pd.concat(edgelist_list).drop_duplicates(
    subset=['dorm_wing', 'ego', 'alter', 'connection'],
    keep='last',
)


In [None]:
# Build the alter boolean table once per dorm wing. Both calls receive the stacked
# edgelist, the five name-generator edgelists in the same positional order, the wing
# label, and 'alter' as the final argument.
raw_name_generator_edgelists = (
    aspire_edgelist,
    social_current_edgelist,
    social_media_edgelist,
    social_precovid_edgelist,
    virtual_current_edgelist,
)

south_alter_df = create_alter_boolean_df(stacked_edgelist, *raw_name_generator_edgelists, 'south', 'alter')

north_alter_df = create_alter_boolean_df(stacked_edgelist, *raw_name_generator_edgelists, 'north', 'alter')


In [None]:
# merge both datasets
# No join keys are given, so the outer merge matches on every column the two frames share.
alter_df = north_alter_df.merge(south_alter_df, how='outer')

# Keep only necessary columns and fill w/ 0s
raw_alter_columns = [
    'ego', 'alter', 'connection', 'fmri_wave1', 'dorm_wing', 'id', 'alter_id',
    'alter_NG1', 'alter_NG2', 'alter_NG3', 'alter_NG4', 'alter_NG5',
]
alter_df = alter_df[raw_alter_columns].copy()
alter_df.fillna(0, inplace=True)

In [None]:
# Replace the alter_NG1..alter_NG5 column labels with the name-generator names.
raw_alter_column_names = {
    'alter_NG1': 'aspire',
    'alter_NG2': 'social_current',
    'alter_NG3': 'social_media',
    'alter_NG4': 'social_precovid',
    'alter_NG5': 'virtual_current',
}
alter_df = alter_df.rename(columns=raw_alter_column_names)

In [None]:
# Create new files
# The raw-alter boolean table is written into the extra folder without the index column.
os.chdir(BASE_DIR + "extra/")

alter_df.to_csv(extra_alter_boolean_filename, index = False)

In [None]:
# Signal that the last cell has run.
print('Done!')