# Linear Mixed-Effects Model

In [15]:
import pandas as pd

# load an Excel file
file_path = './Participant Questionnaire- cleaning.xlsx'  # replace the correct file path
sheet_name = 'Participant Questionnaire_Decem'  # replace the correct sheet name
data = pd.read_excel(file_path, sheet_name=sheet_name)

# define column name mapping
question_names = [
    "Mental Demand",
    "Physical Demand",
    "Hurried or Rushed",
    "Successful",
    "Effort",
    "Negative feeling"
]

# traverse through task columns Q13-Q18, each task has 6 questions
for task_num in range(13, 19):  # Q13 to Q18
    for question_num in range(1, 7):  # 6 questions per task
        old_column_name = f"Q{task_num}_{question_num}"
        new_column_name = f"Task{task_num-12}_{question_names[question_num-1]}"
        if old_column_name in data.columns:
            data.rename(columns={old_column_name: new_column_name}, inplace=True)

# save the modified file
output_file_path = 'Participant Questionnaire - Renamed.xlsx'
data.to_excel(output_file_path, index=False)
print(f"文件已保存为 {output_file_path}")

文件已保存为 Participant Questionnaire - Renamed.xlsx


weight


In [20]:
import pandas as pd

# load data
file_path = './Participant Questionnaire - Renamed.xlsx'  # replace the correct file path
sheet_name = 'Sheet1'  # replace the correct sheet name
data = pd.read_excel(file_path, sheet_name=sheet_name)

# define the weight
weights = {
    "Mental Demand": 0.45,
    "Physical Demand": 0.01,
    "Hurried or Rushed": 0.01,
    "Successful": 0.06,
    "Effort": 0.27,
    "Negative feeling": 0.20  # replace 'Discouraged, Irritated, or Stressed'
}

# calculate the weighted score for each task
for task_num in range(1, 7):  # corresponding to Task1-Task6
    # extract the relevant columns
    task_columns = [f"Task{task_num}_{dimension}" for dimension in weights.keys()]
    # check if the column exists
    task_columns = [col for col in task_columns if col in data.columns]
    # calculate the weighted score
    data[f"Task{task_num}_WeightedScore"] = data[task_columns].apply(
        lambda row: sum(row[dimension] * weights[dimension.split('_', 1)[-1]] for dimension in task_columns),
        axis=1
    )

# define task mapping
task_mapping = {
    1: ("Easy", "Title"),
    2: ("Easy", "All"),
    3: ("Medium", "Title"),
    4: ("Medium", "All"),
    5: ("Hard", "Title"),
    6: ("Hard", "All")
}

# change the name of the weighted score column
for task_num, (difficulty, text_case) in task_mapping.items():
    old_column_name = f"Task{task_num}_WeightedScore"  # formal column name
    new_column_name = f"{difficulty}_{text_case}_WeightedScore"  # new column name
    if old_column_name in data.columns:
        data.rename(columns={old_column_name: new_column_name}, inplace=True)

# save the result
output_file_path = 'Participant Questionnaire - Renamed and Weighted.xlsx'
data.to_excel(output_file_path, index=False)
print(f"列名已更新并保存为 {output_file_path}")


列名已更新并保存为 Participant Questionnaire - Renamed and Weighted.xlsx


In [41]:
file_path = 'Participant Questionnaire - Renamed and Weighted.xlsx'
data = pd.read_excel(file_path)
print(data.columns)

Index(['StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', 'Q1', 'Q2', 'Q3', 'Q3_5_TEXT',
       'Q4', 'Q5', 'Q6', 'LanguageBackground', 'Q8', 'Q9', 'Q10', 'Q10_4_TEXT',
       'Q11', 'Q12', 'Task1_Mental Demand', 'Task1_Physical Demand',
       'Task1_Hurried or Rushed', 'Task1_Successful', 'Task1_Effort',
       'Task1_Negative feeling', 'Task2_Mental Demand',
       'Task2_Physical Demand', 'Task2_Hurried or Rushed', 'Task2_Successful',
       'Task2_Effort', 'Task2_Negative feeling', 'Task3_Mental Demand',
       'Task3_Physical Demand', 'Task3_Hurried or Rushed', 'Task3_Successful',
       'Task3_Effort', 'Task3_Negative feeling', 'Task4_Mental Demand',
       'Task4_Physical Demand', 'Task4_Hurried or Rushed', 'Task4_Suc

In [49]:
from statsmodels.formula.api import mixedlm
import pandas as pd

# load data
file_path = 'Participant Questionnaire - Renamed and Weighted.xlsx'
data = pd.read_excel(file_path)


# create the 'Format' and 'Difficulty' columns
# iterate through each weighted score column
weighted_score_columns = [
    "Easy_Title_WeightedScore",
    "Easy_All_WeightedScore",
    "Medium_Title_WeightedScore",
    "Medium_All_WeightedScore",
    "Hard_Title_WeightedScore",
    "Hard_All_WeightedScore"
]

formatted_data = []

for col in weighted_score_columns:
    temp_df = data[['Participant', col]].copy()
    temp_df = temp_df.rename(columns={col: "WeightedScore"})
    
    # parse the 'Format' and 'Difficulty
    if "Title" in col:
        temp_df["Format"] = "Title Case"
    elif "All" in col:
        temp_df["Format"] = "All Upper Case"
    
    if "Easy" in col:
        temp_df["Difficulty"] = "Easy"
    elif "Medium" in col:
        temp_df["Difficulty"] = "Medium"
    elif "Hard" in col:
        temp_df["Difficulty"] = "Hard"
    
    formatted_data.append(temp_df)

# merge the parsed data.
formatted_data = pd.concat(formatted_data, ignore_index=True)

# ensure the 'Format' and 'Difficulty' columns are categorical variables
formatted_data['Format'] = pd.Categorical(formatted_data['Format'], categories=['Title Case', 'All Upper Case'])
formatted_data['Difficulty'] = pd.Categorical(formatted_data['Difficulty'], categories=['Easy', 'Medium', 'Hard'])
formatted_data['LanguageBackground'] = data['LanguageBackground']
# define the overall analysis function
def overall_analysis(data):
    print("\n=== Overall Analysis ===")
    model = mixedlm("WeightedScore ~ Format", data, groups="Participant", re_formula="~1")
    result = model.fit()
    coef = result.params.get("Format[T.All Upper Case]", None)
    p_value = result.pvalues.get("Format[T.All Upper Case]", None)
    print(f"Coefficient (Format[T.All Upper Case]): {coef:.4f}")
    print(f"P-value (Format[T.All Upper Case]): {p_value:.4f}")
    if p_value < 0.05:
        if coef > 0:
            print("Result: All Upper Case significantly increases WeightedScore.")
        else:
            print("Result: All Upper Case significantly decreases WeightedScore.")

# define the task difficulty analysis function
def analyze_by_difficulty(data):
    print("\n=== Analysis by Task Difficulty ===")
    difficulties = data['Difficulty'].unique()
    for difficulty in difficulties:
        print(f"\nTask Difficulty: {difficulty}")
        filtered_data = data[data['Difficulty'] == difficulty]
        model = mixedlm("WeightedScore ~ Format", filtered_data, groups="Participant", re_formula="~1")
        result = model.fit()
        coef = result.params.get("Format[T.All Upper Case]", None)
        p_value = result.pvalues.get("Format[T.All Upper Case]", None)
        print(f"Coefficient (Format[T.All Upper Case]): {coef:.4f}")
        print(f"P-value (Format[T.All Upper Case]): {p_value:.4f}")
        if p_value < 0.05:
            if coef > 0:
                print(f"Result: For {difficulty} tasks, All Upper Case significantly increases WeightedScore.")
            else:
                print(f"Result: For {difficulty} tasks, All Upper Case significantly decreases WeightedScore.")

# perform analysis
overall_analysis(formatted_data)
analyze_by_difficulty(formatted_data)


=== Overall Analysis ===
Coefficient (Format[T.All Upper Case]): 0.2623
P-value (Format[T.All Upper Case]): 0.0365
Result: All Upper Case significantly increases WeightedScore.

=== Analysis by Task Difficulty ===

Task Difficulty: Easy
Coefficient (Format[T.All Upper Case]): 0.1061
P-value (Format[T.All Upper Case]): 0.1612

Task Difficulty: Medium
Coefficient (Format[T.All Upper Case]): 0.3878
P-value (Format[T.All Upper Case]): 0.0024
Result: For Medium tasks, All Upper Case significantly increases WeightedScore.

Task Difficulty: Hard
Coefficient (Format[T.All Upper Case]): 0.2930
P-value (Format[T.All Upper Case]): 0.0439
Result: For Hard tasks, All Upper Case significantly increases WeightedScore.


In [50]:
print(data.columns)

Index(['Participant', 'StartDate', 'EndDate', 'Status', 'IPAddress',
       'Progress', 'Duration (in seconds)', 'Finished', 'RecordedDate',
       'ResponseId', 'RecipientLastName', 'RecipientFirstName',
       'RecipientEmail', 'ExternalReference', 'LocationLatitude',
       'LocationLongitude', 'DistributionChannel', 'UserLanguage', 'Q1', 'Q2',
       'Q3', 'Q3_5_TEXT', 'Q4', 'Q5', 'Q6', 'LanguageBackground', 'Q8', 'Q9',
       'Q10', 'Q10_4_TEXT', 'Q11', 'Q12', 'Task1_Mental Demand',
       'Task1_Physical Demand', 'Task1_Hurried or Rushed', 'Task1_Successful',
       'Task1_Effort', 'Task1_Negative feeling', 'Task2_Mental Demand',
       'Task2_Physical Demand', 'Task2_Hurried or Rushed', 'Task2_Successful',
       'Task2_Effort', 'Task2_Negative feeling', 'Task3_Mental Demand',
       'Task3_Physical Demand', 'Task3_Hurried or Rushed', 'Task3_Successful',
       'Task3_Effort', 'Task3_Negative feeling', 'Task4_Mental Demand',
       'Task4_Physical Demand', 'Task4_Hurried or Rush

In [56]:
# initialize formatted_data as an empty list
formatted_data = []

#loop through each weighted score column and parse the Format and Difficulty information
for col in weighted_score_columns:
    # create a temporary DataFrame, containing participant and the current column
    temp_df = data[['Participant', col]].copy()
    temp_df = temp_df.rename(columns={col: "WeightedScore"})
    
    # parse the 'Format' and 'Difficulty
    if "Title" in col:
        temp_df["Format"] = "Title Case"
    elif "All" in col:
        temp_df["Format"] = "All Upper Case"
    
    if "Easy" in col:
        temp_df["Difficulty"] = "Easy"
    elif "Medium" in col:
        temp_df["Difficulty"] = "Medium"
    elif "Hard" in col:
        temp_df["Difficulty"] = "Hard"
    
    # add the parsed DataFrame to the list
    formatted_data.append(temp_df)

# merge all DataFrame
formatted_data = pd.concat(formatted_data, ignore_index=True)

# ensure the Format and Difficulty columns are categorical variables
formatted_data['Format'] = pd.Categorical(formatted_data['Format'], categories=['Title Case', 'All Upper Case'])
formatted_data['Difficulty'] = pd.Categorical(formatted_data['Difficulty'], categories=['Easy', 'Medium', 'Hard'])

# fix the LanguageBackground column
if 'LanguageBackground' in data.columns:
    # correctly repeat the values of the LanguageBackground column to match the number of rows in formatted_data
    formatted_data['LanguageBackground'] = data['LanguageBackground'].repeat(len(weighted_score_columns)).reset_index(drop=True)
else:
    raise ValueError("The dataset does not contain a 'LanguageBackground' column.")

# fix Participant column
formatted_data['Participant'] = data['Participant'].repeat(len(weighted_score_columns)).reset_index(drop=True)

# check data integrity
print(formatted_data[['Participant', 'LanguageBackground', 'Format', 'Difficulty']].head())
print(f"Total rows: {len(formatted_data)}")

   Participant LanguageBackground      Format Difficulty
0            1                  1  Title Case       Easy
1            1                  1  Title Case       Easy
2            1                  1  Title Case       Easy
3            1                  1  Title Case       Easy
4            1                  1  Title Case       Easy
Total rows: 138


In [57]:
# define language background analysis function
def analyze_by_language_background(data):
    print("\n=== Analysis by Language Background ===")
    # create mixed-effects model
    model = mixedlm("WeightedScore ~ Format * LanguageBackground", data, groups="Participant", re_formula="~1")
    result = model.fit()
    
    # extract the coefficients and p-values of the main effects and interaction effects
    coef_language = result.params.get("LanguageBackground[T.2]", None)
    coef_interaction = result.params.get("Format[T.All Upper Case]:LanguageBackground[T.2]", None)
    p_value_language = result.pvalues.get("LanguageBackground[T.2]", None)
    p_value_interaction = result.pvalues.get("Format[T.All Upper Case]:LanguageBackground[T.2]", None)
    
    # print the result
    print(f"Coefficient (LanguageBackground[T.2]): {coef_language:.4f}")
    print(f"P-value (LanguageBackground[T.2]): {p_value_language:.4f}")
    print(f"Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): {coef_interaction:.4f}")
    print(f"P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): {p_value_interaction:.4f}")
    
    # explain the result
    if p_value_language < 0.05:
        if coef_language > 0:
            print("Result: Language Group 2 has significantly higher WeightedScore.")
        else:
            print("Result: Language Group 2 has significantly lower WeightedScore.")
    else:
        print("Result: Language background has no significant main effect.")
    
    if p_value_interaction < 0.05:
        if coef_interaction > 0:
            print("Result: Language Group 2 is more sensitive to All Upper Case format.")
        else:
            print("Result: Language Group 1 is more sensitive to All Upper Case format.")
    else:
        print("Result: No significant interaction between Language Background and Format.")

# perform the language background analysis.
analyze_by_language_background(formatted_data)


=== Analysis by Language Background ===
Coefficient (LanguageBackground[T.2]): -0.0425
P-value (LanguageBackground[T.2]): 0.8972
Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.0461
P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.9109
Result: Language background has no significant main effect.
Result: No significant interaction between Language Background and Format.


Analyze the differences in responses to different text formats by users from different language backgrounds across varying task difficulties

In [None]:
from statsmodels.formula.api import mixedlm

# define function: Comprehensive analysis (interaction effects of language background, task difficulty, and text format)
def comprehensive_analysis(data):
    print("\n=== Comprehensive Analysis ===")
    # create mixed-effects model
    model = mixedlm("WeightedScore ~ Format * LanguageBackground * Difficulty", data, groups="Participant", re_formula="~1")
    result = model.fit()
    print(result.summary())
    
    # extract p-values and coefficients
    p_format = result.pvalues.get("Format[T.All Upper Case]", None)
    p_interaction_lg = result.pvalues.get("Format[T.All Upper Case]:LanguageBackground[T.2]", None)
    p_interaction_dif = result.pvalues.get("Format[T.All Upper Case]:Difficulty[T.Hard]", None)
    p_interaction_all = result.pvalues.get("Format[T.All Upper Case]:LanguageBackground[T.2]:Difficulty[T.Hard]", None)
    
    # print the result
    print(f"P-value (Format[T.All Upper Case]): {p_format:.4f}")
    print(f"P-value (Format[T.All Upper Case] * LanguageBackground[T.2]): {p_interaction_lg:.4f}")
    print(f"P-value (Format[T.All Upper Case] * Difficulty[T.Hard]): {p_interaction_dif:.4f}")
    print(f"P-value (Format[T.All Upper Case] * LanguageBackground[T.2] * Difficulty[T.Hard]): {p_interaction_all:.4f}")

# define function: Group analysis by task difficulty and language background
def analyze_by_task_and_language(data, difficulties):
    print(f"\n=== Analyzing Weighted Scores by Task Difficulty and Language Background ===")
    results = {}
    for difficulty in difficulties:
        print(f"\nTask Difficulty: {difficulty}")
        # filter data for the current task difficulty
        filtered_data = data[data['Difficulty'] == difficulty]
        
        # create mixed-effects model
        model = mixedlm("WeightedScore ~ Format * LanguageBackground", filtered_data, groups="Participant", re_formula="~1")
        result = model.fit()
        results[difficulty] = result
        
        # extract p-values and coefficients
        coef_format = result.params.get("Format[T.All Upper Case]", None)
        coef_interaction = result.params.get("Format[T.All Upper Case]:LanguageBackground[T.2]", None)
        p_format = result.pvalues.get("Format[T.All Upper Case]", None)
        p_interaction = result.pvalues.get("Format[T.All Upper Case]:LanguageBackground[T.2]", None)
        
        # print the result
        print(f"Coefficient (Format[T.All Upper Case]): {coef_format:.4f}")
        print(f"P-value (Format[T.All Upper Case]): {p_format:.4f}")
        print(f"Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): {coef_interaction:.4f}")
        print(f"P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): {p_interaction:.4f}")
        
        # explain the result
        if p_interaction < 0.05:
            if coef_interaction > 0:
                print(f"Result: For {difficulty} tasks, LanguageBackground 2 is more sensitive to All Upper Case format.")
            else:
                print(f"Result: For {difficulty} tasks, LanguageBackground 1 is more sensitive to All Upper Case format.")
    return results




# define task difficulty
difficulties = ["Easy", "Medium", "Hard"]

# comprehensive analysis
comprehensive_analysis(formatted_data)

# analyze by task difficulty and language background
analyze_by_task_and_language(formatted_data, difficulties)


=== Comprehensive Analysis ===
                                     Mixed Linear Model Regression Results
Model:                               MixedLM                  Dependent Variable:                  WeightedScore
No. Observations:                    138                      Method:                              REML         
No. Groups:                          23                       Scale:                               0.5246       
Min. group size:                     6                        Log-Likelihood:                      -156.9820    
Max. group size:                     6                        Converged:                           Yes          
Mean group size:                     6.0                                                                        
----------------------------------------------------------------------------------------------------------------
                                                                      Coef.  Std.Err.   z    P>|z| [0.

  dat = dat.applymap(lambda x: _formatter(x, float_format))


Coefficient (Format[T.All Upper Case]): 0.1764
P-value (Format[T.All Upper Case]): 0.3576
Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): -0.2742
P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.0145
Result: For Easy tasks, LanguageBackground 1 is more sensitive to All Upper Case format.

Task Difficulty: Medium
Coefficient (Format[T.All Upper Case]): 0.4098
P-value (Format[T.All Upper Case]): 0.3149
Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.0157
P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.9783

Task Difficulty: Hard
Coefficient (Format[T.All Upper Case]): 0.4378
P-value (Format[T.All Upper Case]): 0.5207
Coefficient (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): -0.0083
P-value (Interaction: Format[T.All Upper Case] * LanguageBackground[T.2]): 0.9927


{'Easy': <statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x7ff0ba944d00>,
 'Medium': <statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x7ff0ea559c40>,
 'Hard': <statsmodels.regression.mixed_linear_model.MixedLMResultsWrapper at 0x7ff0ba94f9a0>}