# Load packages

# In [1]:
import json
import os
import re
from glob import glob
from itertools import product

import pandas as pd
from Levenshtein import distance as levenshtein_distance

# Clean data (export wide form data for CDER group)

# In [8]:
def is_raw_export(path):
    """Return True if *path* looks like a raw survey export.

    Files whose name contains 'wide' or 'edgelist' are outputs written by
    this notebook into the same folder; reading them back as raw input would
    fail on a rerun because they lack the raw survey columns.
    """
    name = os.path.basename(path)
    return 'wide' not in name and 'edgelist' not in name


def widen_responses(df):
    """Collapse long-form survey rows into one record per respondent.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'name', 'question_content' (a list of dicts
        with 'type' and 'value' keys) and 'answers' (a dict), with the two
        JSON columns already parsed.

    Returns
    -------
    list of dict
        One dict per distinct user_id, in order of first appearance, holding
        'user_id', 'name' and one entry per text question mapped to its answer.
    """
    records = []
    # sort=False keeps respondents in the order they first appear in the file.
    for _, rows in df.groupby('user_id', sort=False):
        first_row = rows.iloc[0]
        record = {'user_id': first_row['user_id'], 'name': first_row['name']}
        for questions, answers in zip(rows['question_content'], rows['answers']):
            prompts = [q['value'] for q in questions if q['type'] == 'text']
            # NOTE(review): answers are paired with the text prompts purely by
            # position; this assumes both come in the same order -- confirm
            # against the survey export format.
            record.update(zip(prompts, answers.values()))
        records.append(record)
    return records


files = [f for f in glob('C:/Users/Cole/Documents/DATA/Network_DATA/*') if is_raw_export(f)]
for f in files:
    df = pd.read_csv(f)
    df['question_content'] = df['question_content'].apply(json.loads)
    df['answers'] = df['answers'].apply(json.loads)

    # splitext copes with any extension length, unlike slicing off 4 characters.
    wide_path = os.path.splitext(f)[0] + '_wide.csv'
    pd.DataFrame(widen_responses(df)).to_csv(wide_path, index=False)

# Load and clean network data

# In [7]:
cols = ['user_id', 'name', 
        'Please list any students in this physics class that you think are particularly strong in the lecture/discussion section material.', 
        'Please list any students in this physics class that you had a meaningful interaction* with about other aspects of the course this week.',
        'Please list any students in this physics class that you think are particularly strong in the lab material.',
        'Please list any students in this physics class that you had a meaningful interaction* with about lab material this week.']
# Short column names, positionally aligned with `cols`.
short_cols = ['user_id', 'name', 'strong_lect_disc', 'meaning_other', 'strong_lab', 'meaning_lab']
wide_files = glob('C:/Users/Cole/Documents/DATA/Network_DATA/*wide.csv')

# Raw string: the previous non-raw pattern contained the invalid escape '\.'.
# Splits on a single space, comma, newline, semicolon or period.
token_splitter = re.compile(r'[ ,\n;.]')
# Every path in wide_files ends with this suffix (guaranteed by the glob above).
wide_suffix_len = len('wide.csv')


def tokenize(sentence):
    """Split a free-text answer into its non-empty tokens."""
    return [t for t in token_splitter.split(sentence) if t != '']


def add_name_keys(df):
    """Add 'first_name', 'last_name' and 'full_name' columns to *df* in place.

    A first or last name shared by more than one row is blanked, so that a
    bare first or last name only ever identifies one student. 'full_name' is
    the name with its spaces removed. Returns *df*.
    """
    name_parts = df['name'].str.split(' ')
    df['first_name'] = name_parts.str[0]
    df['last_name'] = name_parts.str[-1]
    for col in ('first_name', 'last_name'):
        df.loc[df.duplicated(subset=col, keep=False), col] = ''
    df['full_name'] = df['name'].str.replace(' ', '', regex=False)
    return df


def group_names(keys, names):
    """Map each non-empty key to the list of names that carry it."""
    lookup = {}
    for key, name in zip(keys, names):
        if key != '':
            lookup.setdefault(key, []).append(name)
    return lookup


def exact_matches(tokens, lookup):
    """Return the names whose key is exactly equal to one of *tokens*."""
    matched = set()
    for token in tokens:
        matched.update(lookup.get(token, ()))
    return matched


def fuzzy_full_name_matches(tokens, full_lookup, tolerance=0.3):
    """Return the names whose space-less full name is close to two adjacent tokens.

    Adjacent tokens are concatenated and compared with every full name; a
    full name matches when the Levenshtein distance is at most
    ``tolerance * len(full_name)``.
    """
    paired = {first + second for first, second in zip(tokens, tokens[1:])}
    matched = set()
    if not paired:
        return matched
    for full_name, names in full_lookup.items():
        limit = tolerance * len(full_name)
        if any(levenshtein_distance(pair, full_name) <= limit for pair in paired):
            matched.update(names)
    return matched


def extract_edges(df, attrs):
    """Build the (From, To, attr) edge list for the answer columns *attrs*.

    *df* must already carry the columns added by ``add_name_keys``. For each
    row and attribute, an edge runs from the row's name to every student
    matched by first name, last name or fuzzy full name in that answer.
    Edges are de-duplicated and sorted within each attribute.
    """
    # The roster does not change while scanning answers, so index it once.
    first_lookup = group_names(df['first_name'], df['name'])
    last_lookup = group_names(df['last_name'], df['name'])
    full_lookup = group_names(df['full_name'], df['name'])

    per_attr = []
    for attr in attrs:
        edges = set()
        for source, sentence in zip(df['name'], df[attr]):
            tokens = tokenize(sentence)
            targets = exact_matches(tokens, first_lookup)
            targets |= exact_matches(tokens, last_lookup)
            targets |= fuzzy_full_name_matches(tokens, full_lookup)
            edges.update((source, target) for target in targets)
        edgelist = pd.DataFrame(sorted(edges), columns=['From', 'To'])
        edgelist['attr'] = attr
        per_attr.append(edgelist)
    return pd.concat(per_attr, axis=0).reset_index(drop=True)


def anonymize_edges(edges):
    """Return a copy of *edges* with names replaced by integer ids.

    Ids are the 1-based position of each name in the sorted set of all names
    appearing in 'From' or 'To' (equivalent to a dense rank). The input frame
    is not modified.
    """
    labels = sorted(set(edges['From']).union(edges['To']))
    ids = {label: rank for rank, label in enumerate(labels, start=1)}
    anon = edges.copy()
    for col in ('From', 'To'):
        anon[col] = edges[col].map(ids)
    return anon


for f in wide_files:
    df = pd.read_csv(f, usecols=cols)[cols].rename(columns=dict(zip(cols, short_cols))).fillna('')
    # Lower-case the name and every answer so matching is case-insensitive.
    for col in short_cols[1:]:
        df[col] = df[col].str.lower()
    df = add_name_keys(df)

    out_prefix = f[:-wide_suffix_len]
    df_out = extract_edges(df, short_cols[2:])
    df_out.to_csv(out_prefix + 'edgelist.csv', index=False)

    df_out = anonymize_edges(df_out)
    df_out.to_csv(out_prefix + 'edgelist_anon.csv', index=False)