In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics



In [2]:
# Both exports are ';'-separated and use the token '(NA)' for missing values.
desktop_data = pd.read_csv(
    'data/desktop.csv',
    sep=';',
    na_values=['(NA)'],
)
tablet_data = pd.read_csv(
    'data/tablet.csv',
    sep=';',
    na_values=['(NA)'],
)

In [3]:
# Display the desktop frame (pandas abbreviates long/wide frames with '...').
desktop_data

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,No,Yes,7,10,10,0,10,1.0,0.0,...,0,0.000000,0.00,17,2,0,2,0.117647,0.000000,No
1,Female,Yes,Yes,13,12,12,0,12,1.0,0.0,...,4,0.114286,0.00,26,2,2,2,0.076923,0.076923,Yes
2,Female,No,Yes,7,6,6,0,6,1.0,0.0,...,4,0.114286,0.00,26,1,3,1,0.038462,0.115385,No
3,Female,No,Yes,7,0,0,0,0,0.0,0.0,...,0,0.000000,0.00,1,0,0,0,0.000000,0.000000,No
4,Female,No,Yes,8,4,4,0,4,1.0,0.0,...,1,25.000000,0.05,26,2,2,2,0.076923,0.076923,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,Male,No,No,10,7,7,0,7,1.0,0.0,...,2,0.670000,0.33,4,1,3,1,0.250000,0.750000,Yes
3640,Female,No,Yes,15,9,9,0,9,1.0,0.0,...,3,0.750000,0.25,4,2,2,2,0.500000,0.500000,No
3641,Female,No,Yes,15,11,11,0,11,1.0,0.0,...,3,0.600000,0.40,4,2,2,2,0.500000,0.500000,No
3642,Female,No,Yes,15,10,10,0,10,1.0,0.0,...,3,0.750000,0.25,4,3,1,3,0.750000,0.250000,No


In [4]:
def clean_data(data):
    """Encode the categorical columns as integers and mean-impute gaps, in place.

    The frame is modified in place and nothing is returned, so callers keep
    working with the object they passed in.

    Encoding applied:
        Gender:                          'Male' -> 1, 'Female' -> 2
        Dyslexia, Nativelang, Otherlang: 'No'   -> 0, 'Yes'    -> 1

    Any other value found in those columns becomes NaN and is then imputed
    like every other missing value: with the mean of its column.

    The call is idempotent. A column that is already numeric is treated as
    already encoded and is skipped; pushing integer codes through the
    string-keyed mappings again would match nothing and turn the whole column
    into NaN, which mean-imputation cannot repair.

    NOTE(review): values are not range-checked; rate columns holding values
    above 1 pass through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing at least the columns 'Gender', 'Dyslexia',
        'Nativelang' and 'Otherlang'.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the four categorical columns is missing.
    """
    # Convert categorical values to numeric codes.
    encodings = {
        'Gender': {'Male': 1, 'Female': 2},
        'Dyslexia': {'No': 0, 'Yes': 1},
        'Nativelang': {'No': 0, 'Yes': 1},
        'Otherlang': {'No': 0, 'Yes': 1},
    }
    for column, mapping in encodings.items():
        # Already numeric means already encoded: skip so a re-run is harmless.
        if pd.api.types.is_numeric_dtype(data[column]):
            continue
        data[column] = data[column].map(mapping)

    # numeric_only keeps the imputation working even if the frame still
    # carries an unrelated text column (pandas >= 2 raises otherwise).
    data.fillna(data.mean(numeric_only=True), inplace=True)

In [5]:
# clean_data works in place and returns None, so there is nothing to assign here.
clean_data(desktop_data)
clean_data(tablet_data)

In [6]:
# Re-display the desktop frame; once the cleaning cell has run, the
# categorical columns show integer codes instead of text.
desktop_data

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,0,1,7,10,10,0,10,1.0,0.0,...,0,0.000000,0.00,17,2,0,2,0.117647,0.000000,0
1,2,1,1,13,12,12,0,12,1.0,0.0,...,4,0.114286,0.00,26,2,2,2,0.076923,0.076923,1
2,2,0,1,7,6,6,0,6,1.0,0.0,...,4,0.114286,0.00,26,1,3,1,0.038462,0.115385,0
3,2,0,1,7,0,0,0,0,0.0,0.0,...,0,0.000000,0.00,1,0,0,0,0.000000,0.000000,0
4,2,0,1,8,4,4,0,4,1.0,0.0,...,1,25.000000,0.05,26,2,2,2,0.076923,0.076923,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,1,0,0,10,7,7,0,7,1.0,0.0,...,2,0.670000,0.33,4,1,3,1,0.250000,0.750000,1
3640,2,0,1,15,9,9,0,9,1.0,0.0,...,3,0.750000,0.25,4,2,2,2,0.500000,0.500000,0
3641,2,0,1,15,11,11,0,11,1.0,0.0,...,3,0.600000,0.40,4,2,2,2,0.500000,0.500000,0
3642,2,0,1,15,10,10,0,10,1.0,0.0,...,3,0.750000,0.25,4,3,1,3,0.750000,0.250000,0


In [7]:
def get_common_columns():
    """Return the hard-coded list of column names used to subset a frame.

    The list starts with 'Gender', 'Nativelang', 'Otherlang', 'Age' and
    'Dyslexia', followed by six measure columns (Clicks, Hits, Misses, Score,
    Accuracy, Missrate) for each retained number: 1-12, 14-17, 22, 23 and 30.
    Presumably these are the groups that are comparable across devices; the
    selection itself is fixed in code.

    Returns
    -------
    list of str
        119 column names, grouped by number in ascending order.
    """
    leading_columns = ['Gender', 'Nativelang', 'Otherlang', 'Age', 'Dyslexia']
    measures = ('Clicks', 'Hits', 'Misses', 'Score', 'Accuracy', 'Missrate')
    # 1-based suffixes of the measure groups that are kept.
    kept_numbers = [*range(1, 13), *range(14, 18), 22, 23, 30]
    measure_columns = [
        f'{measure}{number}'
        for number in kept_numbers
        for measure in measures
    ]
    return leading_columns + measure_columns

In [8]:
# Restrict both frames to the same column list, in the same order.
common_columns = get_common_columns()
reduced_desktop_data, reduced_tablet_data = (
    desktop_data[common_columns],
    tablet_data[common_columns],
)



In [9]:
# Desktop frame restricted to the columns listed in common_columns.
reduced_desktop_data

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Dyslexia,Clicks1,Hits1,Misses1,Score1,Accuracy1,...,Misses23,Score23,Accuracy23,Missrate23,Clicks30,Hits30,Misses30,Score30,Accuracy30,Missrate30
0,1,0,1,7,0,10,10,0,10,1.0,...,1,3,0.750000,0.250000,14,1,2,1,0.071429,0.142857
1,2,1,1,13,1,12,12,0,12,1.0,...,1,5,0.833333,0.166667,17,2,2,2,0.117647,0.117647
2,2,0,1,7,0,6,6,0,6,1.0,...,2,3,0.600000,0.400000,17,1,3,1,0.058824,0.176471
3,2,0,1,7,0,0,0,0,0,0.0,...,1,3,0.750000,0.250000,8,0,1,0,0.000000,125.000000
4,2,0,1,8,0,4,4,0,4,1.0,...,3,2,0.400000,0.600000,17,0,4,0,0.000000,0.235294
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,1,0,0,10,1,7,7,0,7,1.0,...,2,1,0.330000,0.670000,4,0,4,0,0.000000,1.000000
3640,2,0,1,15,0,9,9,0,9,1.0,...,1,7,0.880000,0.130000,4,2,2,2,0.500000,0.500000
3641,2,0,1,15,0,11,11,0,11,1.0,...,2,6,0.750000,0.250000,4,0,4,0,0.000000,1.000000
3642,2,0,1,15,0,10,10,0,10,1.0,...,1,9,0.900000,0.100000,4,3,1,3,0.750000,0.250000


In [10]:
# Tablet frame restricted to the columns listed in common_columns.
reduced_tablet_data

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Dyslexia,Clicks1,Hits1,Misses1,Score1,Accuracy1,...,Misses23,Score23,Accuracy23,Missrate23,Clicks30,Hits30,Misses30,Score30,Accuracy30,Missrate30
0,1,1,0,7,0,6,6,0,6,1.00,...,0.0,4.0,1.000000,0.000000,21.0,2.0,2.0,21.0,0.095238,0.095238
1,2,1,0,7,0,7,7,0,7,1.00,...,4.0,4.0,0.000000,1.000000,13.0,0.0,1.0,13.0,0.000000,0.076923
2,2,1,0,7,0,6,6,0,6,1.00,...,0.0,4.0,1.000000,0.000000,21.0,1.0,3.0,21.0,0.047619,0.142857
3,1,1,0,7,0,5,5,0,5,1.00,...,1.0,3.0,0.666667,0.333333,21.0,1.0,3.0,21.0,0.047619,0.142857
4,1,1,0,7,0,8,6,2,8,0.75,...,1.0,4.0,0.750000,0.250000,21.0,1.0,3.0,21.0,0.047619,0.142857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,1,1,0,17,0,13,13,0,13,1.00,...,0.0,7.0,1.000000,0.000000,21.0,4.0,0.0,21.0,0.190476,0.000000
1391,2,1,1,17,0,9,9,0,9,1.00,...,1.0,6.0,0.833333,0.166667,21.0,3.0,1.0,21.0,0.142857,0.047619
1392,1,1,1,17,0,10,10,0,10,1.00,...,0.0,8.0,1.000000,0.000000,21.0,3.0,1.0,21.0,0.142857,0.047619
1393,2,1,1,17,0,11,11,0,11,1.00,...,0.0,7.0,1.000000,0.000000,21.0,3.0,1.0,21.0,0.142857,0.047619


In [11]:
# (rows, columns) of the full, unreduced desktop frame.
desktop_data.shape

(3644, 197)

In [12]:
# Column labels of the full, unreduced desktop frame.
desktop_data.columns

Index(['Gender', 'Nativelang', 'Otherlang', 'Age', 'Clicks1', 'Hits1',
       'Misses1', 'Score1', 'Accuracy1', 'Missrate1',
       ...
       'Score31', 'Accuracy31', 'Missrate31', 'Clicks32', 'Hits32', 'Misses32',
       'Score32', 'Accuracy32', 'Missrate32', 'Dyslexia'],
      dtype='object', length=197)

In [13]:
# Column labels of the full, unreduced tablet frame.
tablet_data.columns

Index(['Gender', 'Nativelang', 'Otherlang', 'Age', 'Clicks1', 'Hits1',
       'Misses1', 'Score1', 'Accuracy1', 'Missrate1',
       ...
       'Score31', 'Accuracy31', 'Missrate31', 'Clicks32', 'Hits32', 'Misses32',
       'Score32', 'Accuracy32', 'Missrate32', 'Dyslexia'],
      dtype='object', length=197)

In [14]:
# Features (X) are every selected column except the label; the label (y) is 'Dyslexia'.
X_desktop, y_desktop = (
    reduced_desktop_data.drop(columns='Dyslexia'),
    reduced_desktop_data['Dyslexia'],
)
X_tablet, y_tablet = (
    reduced_tablet_data.drop(columns='Dyslexia'),
    reduced_tablet_data['Dyslexia'],
)

In [15]:
# Hold out 33% of the desktop rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_desktop,
    y_desktop,
    test_size=0.33,
    random_state=42,
)


In [16]:
def train_and_evaluate_model(X_train, X_test, y_train, y_test, random_state=42):
    """Fit a random forest on the training set and return its test accuracy.

    Parameters
    ----------
    X_train, y_train
        Features and labels the classifier is fitted on.
    X_test, y_test
        Features and labels the fitted classifier is scored on.
    random_state : int or None, default 42
        Seed handed to RandomForestClassifier so repeated runs report the
        same accuracy. Pass None for the previous unseeded behaviour, where
        the score varies from run to run.

    Returns
    -------
    float
        Fraction of test rows whose predicted label equals y_test.

    NOTE(review): accuracy is the only metric computed; if the label is
    imbalanced it can look high for a model that mostly predicts the majority
    class -- worth confirming with a per-class metric.
    """
    rfc = RandomForestClassifier(random_state=random_state)
    rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    return metrics.accuracy_score(y_test, y_pred)

In [17]:
# Within-device evaluation: fit on the desktop training split, score on the held-out desktop split.
desktop_accuracy = train_and_evaluate_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
print("Accuracy on desktop data:", desktop_accuracy)


Accuracy on desktop data: 0.8877805486284289


In [18]:
# Cross-device evaluation: fit on the entire desktop set, score on the tablet set.
tablet_accuracy = train_and_evaluate_model(
    X_train=X_desktop,
    X_test=X_tablet,
    y_train=y_desktop,
    y_test=y_tablet,
)
print("Accuracy on tablet data:", tablet_accuracy)

Accuracy on tablet data: 0.8939068100358423
