In [2]:
import common
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import seaborn.objects as so
import statsmodels.api as sm
import statsmodels.formula.api as smf
from IPython.display import display

from analysis import data

In [5]:
df = pd.read_parquet('../data/interim/linearity-metrics.parq')
correctness = common.load_correctness_data().set_index(['pid', 'tid'])
metadata = pd.read_parquet("../data/processed/experiment_metadata.parq")


df= df.join(correctness).join(metadata[['years_of_experience', 'perceived_difficulty']])
annotations = data.load_yaml('../data/raw/annotations.yaml')
linearity_scores = {k:v['metrics']['linearity'] for k, v in annotations.items()}
df['linearity'] = df['snippet'].map(linearity_scores.get)
df.loc[df['linearity'] <= 1, 'linearity_bin'] = 'A'
df.loc[(df['linearity'] > 1) & (df['linearity'] <= 2), 'linearity_bin'] = 'B'
df.loc[(df['linearity'] > 2) & (df['linearity'] <= 10), 'linearity_bin'] = 'C'
df.loc[(df['linearity'] > 10) & (df['linearity'] <= 20), 'linearity_bin'] = 'D'
complexity = {'numbers_hrn': "D",
 'numbers_hrd': "D",
 'graph_utils': "B",
 'calculation': "B",
 'insertion_sort': "C",
 'money_class': "A",
 'number_checker': "B",
 'rectangle': "A"}

df['complexity'] = df['snippet'].map(complexity.get)
df['linearity_bin'] = df['linearity_bin'].astype('category')
open_source_snippets = ['numbers_hrn', 'numbers_hrd', 'graph_utils']
df['snippet'] = df['snippet'].cat.remove_unused_categories()
df.loc[df['snippet'].isin(open_source_snippets), 'snippet_type'] = 'open_source'
df.loc[~df['snippet'].isin(open_source_snippets), 'snippet_type'] = 'synthesized'

df.to_parquet('../data/interim/linearity-metrics-for-r.parq')
display(df.head())


Unnamed: 0_level_0,Unnamed: 1_level_0,story_order,execution_order,regression_rate,line_regression_rate,horizontal_later,vertical_later,vertical_next,snippet,correct,years_of_experience,perceived_difficulty,linearity,linearity_bin,complexity,snippet_type
pid,tid,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
p100,t2,-0.545932,-0.587927,0.485437,0.184466,0.199029,0.694175,0.762136,rectangle,False,2.0,Extremely easy,20.0,D,A,synthesized
p100,t3,-0.445833,-0.45,0.40367,0.082569,0.183486,0.669725,0.605505,calculation,False,2.0,Neither easy nor difficult,1.64,B,B,synthesized
p100,t4,-0.593567,-0.611111,0.47486,0.156425,0.206704,0.675978,0.782123,insertion_sort,False,2.0,Somewhat easy,3.24,C,C,synthesized
p101,t1,-0.472381,-0.592381,0.486166,0.13834,0.16996,0.648221,0.675889,number_checker,True,2.0,Extremely easy,9.9,C,B,synthesized
p101,t2,-0.511327,-0.543689,0.444444,0.124183,0.202614,0.673203,0.784314,money_class,True,2.0,Extremely easy,0.0,A,A,synthesized


In [10]:
fixations = pd.read_parquet('../data/processed/fixations-fixed.parq')
correctness = common.load_correctness_data().set_index(['pid', 'tid'])

fixations = fixations.join(correctness)

In [19]:
fixations.reset_index()[['pid', 'tid', 'correct', 'snippet']].drop_duplicates().groupby([ 'snippet', 'correct',]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,pid,tid
snippet,correct,Unnamed: 2_level_1,Unnamed: 3_level_1
calculation,False,4,4
calculation,True,13,13
distance,False,0,0
distance,True,2,2
graph_utils,False,16,16
graph_utils,True,6,6
insertion_sort,False,9,9
insertion_sort,True,6,6
money_class,False,1,1
money_class,True,14,14


In [6]:

metrics = ['vertical_next', 'vertical_later', 'regression_rate', 'line_regression_rate' ]
for metric in metrics[:1]:
    print(f"\n\nMetric: {metric}")
    print("=" * 80)
    modelling_data = df.reset_index()
    model = smf.mixedlm(
        f"{metric} ~ linearity_bin + years_of_experience ",
        modelling_data.dropna(),
        groups=modelling_data['pid']
    )
    mdf = model.fit()
    print(mdf.summary())
    print(sm.stats.anova_lm(mdf))



Metric: vertical_next
            Mixed Linear Model Regression Results
Model:             MixedLM  Dependent Variable:  vertical_next
No. Observations:  155      Method:              REML         
No. Groups:        63       Scale:               0.0021       
Min. group size:   1        Log-Likelihood:      213.2420     
Max. group size:   4        Converged:           Yes          
Mean group size:   2.5                                        
--------------------------------------------------------------
                    Coef.  Std.Err.   z    P>|z| [0.025 0.975]
--------------------------------------------------------------
Intercept            0.770    0.017 45.057 0.000  0.737  0.804
linearity_bin[T.B]  -0.041    0.014 -2.889 0.004 -0.069 -0.013
linearity_bin[T.C]  -0.037    0.012 -3.069 0.002 -0.060 -0.013
linearity_bin[T.D]  -0.002    0.010 -0.235 0.814 -0.021  0.017
years_of_experience -0.001    0.006 -0.259 0.796 -0.013  0.010
Group Var            0.001    0.012         



AttributeError: 'MixedLMResults' object has no attribute 'ssr'

In [66]:
sm.stats.anova_lm(mdf, )

AttributeError: 'MixedLMResults' object has no attribute 'ssr'

In [70]:
pietek_data = pd.read_csv("../data/interim/pietek_data.csv")

pietek_data = pietek_data.astype({
    'Participant': 'category',
    'Snippet': 'category',
    'Linearity': 'category',
})


In [81]:
model = smf.mixedlm("VerticalLater ~ Expert + Scrambled + Linearity", pietek_data, groups=pietek_data['Participant'])
mf = model.fit()

print(mf.summary())


           Mixed Linear Model Regression Results
Model:            MixedLM Dependent Variable: VerticalLater
No. Observations: 298     Method:             REML         
No. Groups:       31      Scale:              0.0019       
Min. group size:  7       Log-Likelihood:     469.7906     
Max. group size:  10      Converged:          Yes          
Mean group size:  9.6                                      
-----------------------------------------------------------
                  Coef. Std.Err.   z    P>|z| [0.025 0.975]
-----------------------------------------------------------
Intercept         0.273    0.009 30.344 0.000  0.255  0.290
Expert[T.True]    0.026    0.009  2.942 0.003  0.009  0.044
Scrambled[T.True] 0.010    0.005  1.890 0.059 -0.000  0.021
Linearity[T.B]    0.016    0.008  1.980 0.048  0.000  0.032
Linearity[T.C]    0.026    0.008  3.229 0.001  0.010  0.042
Linearity[T.D]    0.045    0.008  5.646 0.000  0.029  0.061
Linearity[T.E]    0.038    0.008  4.741 0.000  0.02



In [91]:
pietek_data.query('Expert == False').groupby("Snippet", )[
    [
        "Regression",
        "LineRegression",
        "HorizontalLater",
        "VerticalLater",
        "VerticalNext",
    ]
].mean()


Unnamed: 0_level_0,Regression,LineRegression,HorizontalLater,VerticalLater,VerticalNext
Snippet,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Calculation,0.265912,0.222289,0.234604,0.268426,0.149005
CheckIfLettersOnly,0.28248,0.173297,0.214809,0.320922,0.180971
InsertSort,0.285159,0.177069,0.230764,0.301973,0.154367
MoneyClass,0.244933,0.175114,0.258978,0.265339,0.194153
Rectangle,0.283792,0.174884,0.214827,0.31837,0.130282
SignChecker,0.289466,0.181399,0.213975,0.308666,0.147924
Street,0.28638,0.183483,0.214217,0.303661,0.142305
Student,0.271118,0.153101,0.207558,0.351026,0.161233
SumArray,0.313773,0.17681,0.21788,0.269351,0.136512
Vehicle,0.283914,0.160934,0.237637,0.311975,0.145954


1