In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import spearmanr,pearsonr, stats
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing


In [2]:
# Excel workbook the analysis reads. Alternative input: "../data/gr3/gr3_features.xlsx"
feature_path = "../SoR_Alberta.Shared.Data.and.Codebook.xlsx"
# Column every measure below is correlated against.
score_name = "G3.Gates.RC.raw"

# Set to True to also draw a raw and a z-scored histogram for each column.
show_graph = False
feature_names = ['G3.PPVT.Vocab.raw',
                 'G3.Elision.PA.raw',
                 'G3.Syn.GramCorrect.raw',
                 'G3.TOWRE.SWE.raw',
                 'G3.TOWRE.PDE.raw',
                 'G3.WordID.raw',
                 'G3.OL.Spell.Total',
                 'G3.OL.OrthoChoice.1.2.Total',
                 'G3.DigitSpan.raw',
                 'G3.Gates.RC.raw',
                 'G4.Gates.RC.raw',
                 'G5.Gates.RC.raw']

df = pd.read_excel(feature_path)


def _plot_histogram(values, title):
    """Draw a frequency histogram of `values`, titled `title`, and show it."""
    counts, _, _ = plt.hist(x=values, bins='auto', color='#0504aa',
                            alpha=0.7, rwidth=0.85)
    plt.grid(axis='y', alpha=0.75)
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.title(title)
    maxfreq = counts.max()
    # Set a clean upper y-axis limit.
    plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
    plt.show()


for col in feature_names:
    # Only entries passing `>= 0` count as observed; negative codes and NaN
    # both fail that comparison and are left out.
    observed = df[col] >= 0
    test_var = df.loc[observed, col].to_numpy()

    # A correlation needs both members of every pair, so the score column is
    # screened with the same rule instead of pairing a valid measure with a
    # missing score.
    paired = observed & (df[score_name] >= 0)
    paired_var = df.loc[paired, col].to_numpy()
    paired_score = df.loc[paired, score_name].to_numpy()

    print(col)
    print('Mean:', np.mean(test_var))
    print('Median:', np.median(test_var))
    print('Mode:', stats.mode(test_var)[0])
    print('Standard Deviation:', np.std(test_var))
    print('Variance:',np.var(test_var),'\n')
    print("Spearman's correlation", spearmanr(paired_var, paired_score)[0])
    print("Pearson's correlation", pearsonr(paired_var, paired_score)[0])

    if show_graph:
        # Histogram of the raw values, then of the same values as z-scores.
        _plot_histogram(test_var, col)
        _plot_histogram(stats.zscore(test_var), col)
        print("-"*80)

G3.PPVT.Vocab.raw
Mean: 31.8705035971223
Median: 32.0
Mode: [33]
Standard Deviation: 4.9817429736732155
Variance: 24.817763055742454 

Spearman's correlation 0.4037221735829947
Peterson's correlation 0.41202237293021426
G3.Elision.PA.raw
Mean: 22.949640287769785
Median: 25.0
Mode: [26]
Standard Deviation: 6.101448985832974
Variance: 37.22767972672222 

Spearman's correlation 0.46828359536373737
Peterson's correlation 0.4390721662991758
G3.Syn.GramCorrect.raw
Mean: 9.565217391304348
Median: 10.0
Mode: [10]
Standard Deviation: 3.254776916906591
Variance: 10.593572778827976 

Spearman's correlation 0.6194247017421407
Peterson's correlation 0.6209022540675055
G3.TOWRE.SWE.raw
Mean: 56.64748201438849
Median: 59.0
Mode: [62]
Standard Deviation: 14.794935605099099
Variance: 218.89011955902905 

Spearman's correlation 0.7118902786600517
Peterson's correlation 0.6826267866489794
G3.TOWRE.PDE.raw
Mean: 24.294964028776977
Median: 24.0
Mode: [18]
Standard Deviation: 12.878208261256253
Variance: 16

## Linear regression

In [12]:
def lin_regress(feature_names, target_name, data=None):
    """Fit a linear regression of `target_name` on `feature_names` and print a summary.

    A row is used only when every feature column and the target column hold a
    value that passes `>= 0`; negative codes and NaN both fail that test and
    drop the row. The feature matrix is rescaled column-wise with
    `preprocessing.normalize(norm='max', axis=0)` before fitting, so the
    printed betas refer to the rescaled features, not the raw scores.

    Prints the shapes of the feature matrix and target vector, the R squared
    score on the fitting data, and one coefficient per feature.

    Parameters
    ----------
    feature_names : sequence of str
        Columns used as predictors, in the order the betas are printed.
    target_name : str
        Column used as the outcome.
    data : pandas.DataFrame, optional
        Table to read the columns from. Defaults to the notebook-level `df`.

    Raises
    ------
    ValueError
        If no row has a usable value in every requested column.
    """
    frame = df if data is None else data
    predictors = list(feature_names)

    # dict.fromkeys keeps order while dropping a repeated label, e.g. when the
    # target is also listed among the features.
    required = list(dict.fromkeys(predictors + [target_name]))
    complete = (frame[required] >= 0).all(axis=1)
    if not complete.any():
        raise ValueError(
            "no rows with non-negative values in all of: " + ", ".join(required)
        )

    X = frame.loc[complete, predictors].to_numpy(dtype=float)
    y = frame.loc[complete, target_name].to_numpy(dtype=float)
    print(X.shape)
    print(y.shape)

    X = preprocessing.normalize(X, norm='max',axis=0)

    reg = LinearRegression().fit(X, y)
    print('R squared score:', reg.score(X,y))

    print('betas:')
    for name, beta in zip(predictors, reg.coef_):
        print(f"{name : <30}\t| {beta}")



### G3 skill -> G3 score

In [13]:
# Predictors: the nine G3-prefixed skill columns. Outcome: the G3 Gates RC raw score.
feature_names = [
    'G3.PPVT.Vocab.raw',
    'G3.Elision.PA.raw',
    'G3.Syn.GramCorrect.raw',
    'G3.TOWRE.SWE.raw',
    'G3.TOWRE.PDE.raw',
    'G3.WordID.raw',
    'G3.OL.Spell.Total',
    'G3.OL.OrthoChoice.1.2.Total',
    'G3.DigitSpan.raw',
]
target_name = 'G3.Gates.RC.raw'

lin_regress(feature_names=feature_names, target_name=target_name)


(136, 9)
(136,)
R squared score: 0.6108535121020096
betas:
G3.PPVT.Vocab.raw             	| 14.461784749356575
G3.Elision.PA.raw             	| -13.205177234760757
G3.Syn.GramCorrect.raw        	| 14.96886308853719
G3.TOWRE.SWE.raw              	| 13.472330275948865
G3.TOWRE.PDE.raw              	| 11.342358924125712
G3.WordID.raw                 	| 13.87467332442043
G3.OL.Spell.Total             	| 5.207432731425861
G3.OL.OrthoChoice.1.2.Total   	| -4.094793822909369
G3.DigitSpan.raw              	| -0.9701382571421564


### G3 skill, score -> G4 score

In [28]:
# Predictors for the G4 Gates RC raw score.
# NOTE(review): the TOWRE and WordID columns below carry the G4 prefix, while
# the section heading mentions only G3 skills and score; confirm which grade
# is intended here.
feature_names = [
    'G3.PPVT.Vocab.raw',
    'G3.Elision.PA.raw',
    'G3.Syn.GramCorrect.raw',
    'G4.TOWRE.SWE.raw',
    'G4.TOWRE.PDE.raw',
    'G4.WordID.raw',
    'G3.OL.Spell.Total',
    'G3.OL.OrthoChoice.1.2.Total',
    'G3.DigitSpan.raw',
    'G3.Gates.RC.raw',
]

# Reduced four-column alternative, currently disabled:
# feature_names = [
#     'G3.PPVT.Vocab.raw',
#     'G3.DigitSpan.raw',
#     'G4.WordID.raw',
#     'G3.Gates.RC.raw',
# ]

target_name = 'G4.Gates.RC.raw'

lin_regress(feature_names=feature_names, target_name=target_name)

(121, 10)
(121,)
R squared score: 0.6746618527472914
betas:
G3.PPVT.Vocab.raw             	| 14.153147678996383
G3.Elision.PA.raw             	| -2.1531565812785045
G3.Syn.GramCorrect.raw        	| 1.8450342535249065
G4.TOWRE.SWE.raw              	| 6.283483694025226
G4.TOWRE.PDE.raw              	| 6.146615781795408
G4.WordID.raw                 	| 22.18495012765397
G3.OL.Spell.Total             	| 4.949416924502472
G3.OL.OrthoChoice.1.2.Total   	| -8.406110642015168
G3.DigitSpan.raw              	| 9.446877905747879
G3.Gates.RC.raw               	| 9.808129531521855


### G3 skill, score, G4 skill -> G4 score

In [18]:
# Predictors for the G4 Gates RC raw score: G3 skill columns, the G3 score,
# and the G4-prefixed TOWRE / WordID columns.
# NOTE(review): this active list is the same as the one in the preceding cell,
# and the output saved under this cell has shape (124, 4) with only the four
# columns of the disabled subset below. Re-run the cell or decide which list
# it should use.
feature_names = [
    'G3.PPVT.Vocab.raw',
    'G3.Elision.PA.raw',
    'G3.Syn.GramCorrect.raw',
    'G4.TOWRE.SWE.raw',
    'G4.TOWRE.PDE.raw',
    'G4.WordID.raw',
    'G3.OL.Spell.Total',
    'G3.OL.OrthoChoice.1.2.Total',
    'G3.DigitSpan.raw',
    'G3.Gates.RC.raw',
]
# Reduced four-column alternative, currently disabled:
# feature_names = [
#     'G3.PPVT.Vocab.raw',
#     'G3.DigitSpan.raw',
#     'G4.WordID.raw',
#     'G3.Gates.RC.raw',
# ]
target_name = 'G4.Gates.RC.raw'

lin_regress(feature_names=feature_names, target_name=target_name)


(124, 4)
(124,)
R squared score: 0.6486181954591843
betas:
G3.PPVT.Vocab.raw             	| 13.139814281201026
G3.DigitSpan.raw              	| 7.772500029111724
G4.WordID.raw                 	| 33.09758860578388
G3.Gates.RC.raw               	| 12.731319262106453


In [25]:
# Predictors for the G5 Gates RC raw score: G3-prefixed skill columns, the
# G5-prefixed TOWRE / WordID columns, and the G4 Gates RC raw score.
feature_names = [
    'G3.PPVT.Vocab.raw',
    'G3.Elision.PA.raw',
    'G3.Syn.GramCorrect.raw',
    'G5.TOWRE.SWE.raw',
    'G5.TOWRE.PDE.raw',
    'G5.WordID.raw',
    'G3.OL.Spell.Total',
    'G3.OL.OrthoChoice.1.2.Total',
    'G3.DigitSpan.raw',
    'G4.Gates.RC.raw',
]
target_name = 'G5.Gates.RC.raw'

lin_regress(feature_names=feature_names, target_name=target_name)

(106, 10)
(106,)
R squared score: 0.700983713554834
betas:
G3.PPVT.Vocab.raw             	| 6.528764414541382
G3.Elision.PA.raw             	| -9.461589790834024
G3.Syn.GramCorrect.raw        	| 5.837470568025636
G5.TOWRE.SWE.raw              	| 2.7984397333267954
G5.TOWRE.PDE.raw              	| 1.505274555478004
G5.WordID.raw                 	| 16.182367081780033
G3.OL.Spell.Total             	| 3.2938588277735357
G3.OL.OrthoChoice.1.2.Total   	| -2.5597020005785422
G3.DigitSpan.raw              	| -2.1585582817535345
G4.Gates.RC.raw               	| 24.89173539628724
