In [87]:
import pandas as pd
import numpy as np
import random as rd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [88]:
# load data: one DataFrame per session CSV, for sessions 2 through 6
session_paths = [
    f'../EPM_dataset/Data/Seyoung/variables_session_{session_no}.csv'
    for session_no in range(2, 7)
]
var_list = [pd.read_csv(csv_path) for csv_path in session_paths]

# verify: preview the first loaded session
var_list[0].head()

Unnamed: 0,ID,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,KS_Aulaweb,...,MM_TextEditor,MW_Aulaweb,MW_Blank,MW_Deeds,MW_Diagram,MW_Other,MW_Study,MW_TextEditor,MWC_Aulaweb,MWC_Study
0,1,1108.0,15296.0,33635659.0,8685148.0,39820.0,3493.0,100878318.0,118425808.0,0.0,...,97360.0,0.0,5.0,54.0,0.0,55.0,176.0,1191.0,0.0,0.0
1,2,13063.0,27628.0,152742872.0,898633.0,419913.0,294227.0,14462337.0,347521629.0,0.0,...,34849.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,6891.0,1894304.0,64732143.0,779097.0,3677630.0,93.0,118419039.0,424103241.0,0.0,...,37932.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,37073.0,538424.0,126805821.0,255194.0,3453213.0,1170.0,1972457.0,196191366.0,29.0,...,34394.0,12.0,5.0,95.0,0.0,51.0,157.0,2022.0,0.0,0.0
4,5,8813.0,11555.0,79309264.0,347358.0,49656.0,9502.0,48292987.0,275261872.0,0.0,...,87666.0,2.0,0.0,106.0,0.0,4.0,150.0,1280.0,0.0,0.0


In [89]:
# Scaler: standardize the float64 columns of every session frame.
# Each frame is copied first so the raw values in var_list are left untouched;
# scaled_var_list then holds independent frames instead of aliases of the raw
# ones, and re-running this cell always starts from the unscaled data.
scaled_var_list = []
for var in var_list:
    scaled = var.copy()
    # Only float64 columns are scaled; other dtypes pass through unchanged
    numerical = scaled.select_dtypes(include='float64').columns
    # A fresh scaler per session: each session is standardized on its own statistics
    scaled.loc[:, numerical] = StandardScaler().fit_transform(scaled.loc[:, numerical])
    scaled_var_list.append(scaled)

# verify
scaled_var_list[0]

Unnamed: 0,ID,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,KS_Aulaweb,...,MM_TextEditor,MW_Aulaweb,MW_Blank,MW_Deeds,MW_Diagram,MW_Other,MW_Study,MW_TextEditor,MWC_Aulaweb,MWC_Study
0,1,-0.180077,-0.223501,-0.731890,1.869931,-0.321309,-0.187415,-0.021704,-0.748815,-0.225401,...,0.521447,-0.504441,-0.096924,0.091200,-0.289216,-0.024701,-0.136857,0.078659,-0.227076,-0.283103
1,2,-0.179064,-0.222049,0.070617,-0.176423,-0.286618,7.977765,-0.568710,0.277439,-0.225401,...,-0.452700,-0.504441,-0.141128,-0.669317,-0.289216,-0.226519,-0.579112,-0.681333,-0.227076,-0.283103
2,3,-0.179587,-0.002275,-0.522372,-0.207838,0.010717,-0.282903,0.089327,0.620493,-0.225401,...,-0.404656,-0.504441,-0.141128,-0.669317,-0.289216,-0.226519,-0.579112,-0.681333,-0.227076,-0.283103
3,4,-0.177029,-0.161910,-0.104139,-0.345523,-0.009766,-0.252656,-0.647769,-0.400458,0.439615,...,-0.459791,-0.127260,-0.096924,0.668630,-0.289216,-0.039379,-0.184600,0.608930,-0.227076,-0.283103
4,5,-0.179424,-0.223941,-0.424156,-0.321302,-0.320411,-0.018654,-0.354565,-0.046255,-0.225401,...,0.370379,-0.441578,-0.141128,0.823550,-0.289216,-0.211841,-0.202190,0.135451,-0.227076,-0.283103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,101,-0.150673,-0.220802,-0.749819,-0.410013,-0.140200,-0.284588,-0.586811,-0.386493,-0.225401,...,0.110008,-0.504441,-0.141128,-0.669317,-0.289216,-0.226519,-0.579112,-0.681333,-0.227076,-0.283103
78,102,-0.179400,-0.221470,-0.634870,-0.401461,-0.303834,-0.285515,-0.408132,-0.519668,-0.225401,...,-0.995773,-0.504441,-0.141128,-0.669317,-0.289216,-0.226519,-0.579112,-0.681333,-0.227076,-0.283103
79,103,-0.176630,-0.224005,-0.398235,0.414184,-0.303077,0.620722,0.361131,1.119425,-0.225401,...,-0.766990,1.130009,-0.105765,-0.641149,-0.289216,-0.222849,1.109501,0.139280,-0.227076,-0.283103
80,104,-0.155397,-0.216145,0.127902,-0.050916,-0.302861,-0.179523,0.793276,-0.373707,-0.225401,...,-0.216795,-0.504441,-0.141128,-0.458062,-0.289216,-0.087081,-0.347933,-0.219339,-0.227076,-0.283103


In [90]:
# reduce dimensionality
# For every scaled session frame, fit a PCA on all columns except 'ID' and keep
# the projected coordinates, so the reduced data is actually retained.
N_COMPONENTS = 12  # number of principal components kept per session

dim_label_list = []  # per session: the first N_COMPONENTS input column names
dim_list = []        # per session: DataFrame with 'ID' + principal-component scores
for scaled_var in scaled_var_list:
    features = scaled_var.drop(columns=['ID'])
    pca = PCA(n_components=N_COMPONENTS)
    # Fit the components and project the rows onto them in one step
    pca_data = pca.fit_transform(features)
    print(pca.feature_names_in_)
    # Percentage of the total variance explained by each kept component
    per_var = np.round(pca.explained_variance_ratio_*100, decimals=1)
    print(per_var)

    # Keep the reduced representation, aligned row-for-row with the input frame
    pc_names = ['PC' + str(k) for k in range(1, pca_data.shape[1] + 1)]
    scores = pd.DataFrame(pca_data, columns=pc_names, index=scaled_var.index)
    scores.insert(0, 'ID', scaled_var['ID'])
    dim_list.append(scores)

    # NOTE(review): this is a plain slice of the input column names in their
    # original order; it is not a set of features chosen by the PCA. The
    # PCA-reduced data lives in dim_list. Kept as-is because later cells read it.
    dim_label_list.append(pca.feature_names_in_[:N_COMPONENTS])
    print("\n")
    

['DUR_Aulaweb' 'DUR_Blank' 'DUR_Deeds' 'DUR_Diagram' 'DUR_Other'
 'DUR_Properties' 'DUR_Study' 'DUR_TextEditor' 'KS_Aulaweb' 'KS_Blank'
 'KS_Deeds' 'KS_Diagram' 'KS_Other' 'KS_Properties' 'KS_Study'
 'KS_TextEditor' 'MCL_Aulaweb' 'MCL_Blank' 'MCL_Deeds' 'MCL_Diagram'
 'MCL_Other' 'MCL_Properties' 'MCL_Study' 'MCL_TextEditor' 'MCR_Aulaweb'
 'MCR_Blank' 'MCR_Deeds' 'MCR_Diagram' 'MCR_Other' 'MCR_Study'
 'MCR_TextEditor' 'MM_Aulaweb' 'MM_Blank' 'MM_Deeds' 'MM_Diagram'
 'MM_Other' 'MM_Properties' 'MM_Study' 'MM_TextEditor' 'MW_Aulaweb'
 'MW_Blank' 'MW_Deeds' 'MW_Diagram' 'MW_Other' 'MW_Study' 'MW_TextEditor'
 'MWC_Aulaweb' 'MWC_Study']
[17.   7.8  7.1  6.3  5.4  5.   4.2  3.8  3.7  3.5  3.   2.7]


['DUR_Aulaweb' 'DUR_Blank' 'DUR_Deeds' 'DUR_Diagram' 'DUR_Other'
 'DUR_Properties' 'DUR_Study' 'DUR_TextEditor' 'KS_Aulaweb' 'KS_Blank'
 'KS_Deeds' 'KS_Diagram' 'KS_Other' 'KS_Properties' 'KS_Study'
 'KS_TextEditor' 'MCL_Aulaweb' 'MCL_Blank' 'MCL_Deeds' 'MCL_Diagram'
 'MCL_Other' 'MCL_Properties

In [91]:
# Inspect the column labels recorded for each session by the PCA cell
dim_label_list

[array(['DUR_Aulaweb', 'DUR_Blank', 'DUR_Deeds', 'DUR_Diagram',
        'DUR_Other', 'DUR_Properties', 'DUR_Study', 'DUR_TextEditor',
        'KS_Aulaweb', 'KS_Blank', 'KS_Deeds', 'KS_Diagram'], dtype=object),
 array(['DUR_Aulaweb', 'DUR_Blank', 'DUR_Deeds', 'DUR_Diagram',
        'DUR_Other', 'DUR_Properties', 'DUR_Study', 'DUR_TextEditor',
        'KS_Aulaweb', 'KS_Blank', 'KS_Deeds', 'KS_Diagram'], dtype=object),
 array(['DUR_Aulaweb', 'DUR_Blank', 'DUR_Deeds', 'DUR_Diagram',
        'DUR_Other', 'DUR_Properties', 'DUR_Study', 'DUR_TextEditor',
        'KS_Aulaweb', 'KS_Blank', 'KS_Deeds', 'KS_Diagram'], dtype=object),
 array(['DUR_Aulaweb', 'DUR_Blank', 'DUR_Deeds', 'DUR_Diagram',
        'DUR_Other', 'DUR_Properties', 'DUR_Study', 'DUR_TextEditor',
        'KS_Aulaweb', 'KS_Blank', 'KS_Deeds', 'KS_Diagram'], dtype=object),
 array(['DUR_Aulaweb', 'DUR_Blank', 'DUR_Deeds', 'DUR_Diagram', 'DUR_FSM',
        'DUR_Other', 'DUR_Properties', 'DUR_Study', 'DUR_TextEditor',
        'KS_Aul

In [92]:
# test: select 'ID' plus the first session's recorded labels from its scaled frame
labels = ['ID', *dim_label_list[0]]
scaled_var_list[0][labels]

Unnamed: 0,ID,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,KS_Aulaweb,KS_Blank,KS_Deeds,KS_Diagram
0,1,-0.180077,-0.223501,-0.731890,1.869931,-0.321309,-0.187415,-0.021704,-0.748815,-0.225401,-0.409872,-0.359030,-0.365649
1,2,-0.179064,-0.222049,0.070617,-0.176423,-0.286618,7.977765,-0.568710,0.277439,-0.225401,-0.409872,-0.163113,-0.365649
2,3,-0.179587,-0.002275,-0.522372,-0.207838,0.010717,-0.282903,0.089327,0.620493,-0.225401,-0.409872,-0.268846,-0.365649
3,4,-0.177029,-0.161910,-0.104139,-0.345523,-0.009766,-0.252656,-0.647769,-0.400458,0.439615,-0.409872,-0.038721,1.171952
4,5,-0.179424,-0.223941,-0.424156,-0.321302,-0.320411,-0.018654,-0.354565,-0.046255,-0.225401,0.608598,3.083515,-0.365649
...,...,...,...,...,...,...,...,...,...,...,...,...,...
77,101,-0.150673,-0.220802,-0.749819,-0.410013,-0.140200,-0.284588,-0.586811,-0.386493,-0.225401,-0.409872,-0.853488,-0.365649
78,102,-0.179400,-0.221470,-0.634870,-0.401461,-0.303834,-0.285515,-0.408132,-0.519668,-0.225401,-0.409872,-0.415006,-0.365649
79,103,-0.176630,-0.224005,-0.398235,0.414184,-0.303077,0.620722,0.361131,1.119425,-0.225401,-0.409872,0.290918,6.435277
80,104,-0.155397,-0.216145,0.127902,-0.050916,-0.302861,-0.179523,0.793276,-0.373707,-0.225401,0.155945,2.974673,2.886968


In [93]:
# Write one CSV per session containing 'ID' plus that session's recorded label
# columns, taken from the scaled frame. Session numbering starts at 2.
# NOTE(review): the exported columns are scaled input variables selected by
# name via dim_label_list, not principal-component scores.
for idx, frame in enumerate(scaled_var_list):
    keep_cols = ['ID', *dim_label_list[idx]]
    out_path = f"../EPM_dataset/Data/Seyoung/scaled_variables_session_{idx + 2}.csv"
    frame[keep_cols].to_csv(out_path, index=False)