In [1]:
import csv
from collections import defaultdict

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Both exports are semicolon-delimited (the csv-based loader in this notebook reads
# them with delimiter=';'), so the separator has to be passed explicitly; with the
# default comma separator the rows are not split into the expected columns.
# NOTE(review): index_col=0 is kept from the original call -- confirm that the first
# field really is a row index and not the first feature column.
data = pd.read_csv("./Dyt-desktop.csv", sep=';', index_col=0, na_values=['(NA)'])
testData = pd.read_csv("./Dyt-tablet.csv", sep=';', index_col=0, na_values=['(NA)'])

In [3]:
#A function to separate data columns
def SeparateColumns(dataSetName, delimiter=';'):
    """Read a delimited text file into a ``{header: [values, ...]}`` dict.

    The first row is taken as the header row. Every following non-blank row
    contributes one raw string value per header, in file order.

    Parameters
    ----------
    dataSetName : str or path-like
        Path of the file to read.
    delimiter : str, default ';'
        Field separator handed to ``csv.reader``.

    Returns
    -------
    dict
        Maps each header to the list of string values found in that column.
        An empty file yields ``{}``; a file with only a header row yields every
        header mapped to an empty list, so the column names are not lost.

    Raises
    ------
    IndexError
        If a non-blank data row has fewer fields than the header row.
    """
    # newline='' leaves newline handling to the csv module, as its documentation
    # recommends (quoted fields may legitimately contain line breaks).
    with open(dataSetName, 'r', newline='') as f:
        reader = csv.reader(f, delimiter=delimiter)
        headers = next(reader, None)
        if headers is None:
            # Completely empty file: nothing to separate.
            return {}
        # Pre-create every column so that headers survive even without data rows.
        columns = {header: [] for header in headers}
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline).
                continue
            for i, header in enumerate(headers):
                columns[header].append(row[i])
    return columns

In [4]:
def cleanData(data):
    """Convert a frame of raw string columns to numeric features, in place.

    Every column is first cast to the pandas ``string`` dtype and then to
    ``float``; columns that cannot be parsed as float keep their string values
    (``errors='ignore'``). Afterwards the four categorical columns are replaced
    by integer codes:

    * ``Gender``: ``'Male'`` -> 1, ``'Female'`` -> 2
    * ``Dyslexia``, ``Nativelang``, ``Otherlang``: ``'No'`` -> 0, ``'Yes'`` -> 1

    Any value that is not a key of the corresponding mapping becomes NaN
    (``Series.map`` semantics), so calling this function a second time on an
    already cleaned frame blanks out those four columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is modified in place and must contain the columns
        ``Gender``, ``Dyslexia``, ``Nativelang`` and ``Otherlang``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned so the call can be assigned or chained.
    """
    for col in data.columns.values:
        # string -> float in one step per column; unparsable columns stay strings.
        data[col] = data[col].astype('string').astype('float', errors='ignore')

    yes_no_codes = {'No': 0, 'Yes': 1}
    data['Gender'] = data.Gender.map({'Male': 1, 'Female': 2})
    data['Dyslexia'] = data.Dyslexia.map(yes_no_codes)
    data['Nativelang'] = data.Nativelang.map(yes_no_codes)
    data['Otherlang'] = data.Otherlang.map(yes_no_codes)
    return data

In [5]:
# Parse the desktop CSV into per-column lists of strings, then build the frame from them.
columns = SeparateColumns('Dyt-desktop.csv')
data = pd.DataFrame(columns)

data

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,No,Yes,7,10,10,0,10,1,0,...,0,0,0,17,2,0,2,0.117647,0,No
1,Female,Yes,Yes,13,12,12,0,12,1,0,...,4,0.114286,0,26,2,2,2,0.0769231,0.0769231,Yes
2,Female,No,Yes,7,6,6,0,6,1,0,...,4,0.114286,0,26,1,3,1,0.0384615,0.115385,No
3,Female,No,Yes,7,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,No
4,Female,No,Yes,8,4,4,0,4,1,0,...,1,25,0.05,26,2,2,2,0.0769231,0.0769231,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3639,Male,No,No,10,7,7,0,7,1,0,...,2,0.67,0.33,4,1,3,1,0.25,0.75,Yes
3640,Female,No,Yes,15,9,9,0,9,1,0,...,3,0.75,0.25,4,2,2,2,0.5,0.5,No
3641,Female,No,Yes,15,11,11,0,11,1,0,...,3,0.6,0.4,4,2,2,2,0.5,0.5,No
3642,Female,No,Yes,15,10,10,0,10,1,0,...,3,0.75,0.25,4,3,1,3,0.75,0.25,No


In [6]:
# Preview the first five rows of the desktop frame (values are still raw strings here).
data.head()

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,No,Yes,7,10,10,0,10,1,0,...,0,0.0,0.0,17,2,0,2,0.117647,0.0,No
1,Female,Yes,Yes,13,12,12,0,12,1,0,...,4,0.114286,0.0,26,2,2,2,0.0769231,0.0769231,Yes
2,Female,No,Yes,7,6,6,0,6,1,0,...,4,0.114286,0.0,26,1,3,1,0.0384615,0.115385,No
3,Female,No,Yes,7,0,0,0,0,0,0,...,0,0.0,0.0,1,0,0,0,0.0,0.0,No
4,Female,No,Yes,8,4,4,0,4,1,0,...,1,25.0,0.05,26,2,2,2,0.0769231,0.0769231,No


In [7]:
# Convert the desktop frame to numeric values; cleanData modifies `data` in place.
cleanData(data)

# Spot-check one converted column.
data['Clicks29']

0       4.0
1       5.0
2       5.0
3       1.0
4       4.0
       ... 
3639    2.0
3640    2.0
3641    2.0
3642    2.0
3643    2.0
Name: Clicks29, Length: 3644, dtype: float64

In [8]:
# The tablet dataset has no recorded data for question 29, so the six
# question-29 columns are removed from the desktop frame here.
cols_with_missing = [
    metric + '29'
    for metric in ('Clicks', 'Hits', 'Misses', 'Score', 'Accuracy', 'Missrate')
]
reduced_data = data.drop(columns=cols_with_missing)

In [9]:
# The dataset mixes participants from different age ranges. Following the "Results"
# section of the article, only the questions suitable for every age range are used
# for the model: questions 1-12, 14-17, 22, 23 and 30.
selected_questions = [*range(1, 13), *range(14, 18), 22, 23, 30]
metric_prefixes = ('Clicks', 'Hits', 'Misses', 'Score', 'Accuracy', 'Missrate')

# Demographic columns and the label first, then the six metrics of each kept question.
temp = ['Gender', 'Nativelang', 'Otherlang', 'Age', 'Dyslexia']
temp.extend(
    prefix + str(question)
    for question in selected_questions
    for prefix in metric_prefixes
)

reduced_data = reduced_data.loc[:, temp]


In [10]:
# Target vector: the encoded Dyslexia label.
y = reduced_data['Dyslexia']
# Feature matrix: every other selected column.
X = reduced_data.drop(columns='Dyslexia')

In [11]:
# Hold out 33% of the desktop rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [12]:
# Standardise the features. The scaler is fitted on the training split only, and the
# same fitted parameters are then applied to the validation split.
# (StandardScaler is imported with the other imports at the top of the notebook.)
# NOTE: this cell overwrites its own inputs, so re-running it without re-running the
# split cell would fit the scaler on already-scaled data.
sc_x = StandardScaler()
X_train = sc_x.fit_transform(X_train)
X_test = sc_x.transform(X_test)

In [13]:
#----RandomForestClassifier
# A fixed random_state makes the forest's internal randomness reproducible, so the
# reported accuracy is stable across re-runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8902743142144638


In [14]:
# Load the tablet CSV the same way as the desktop one and turn the literal
# "NULL" markers into NaN so they count as missing values.
columns = SeparateColumns('Dyt-tablet.csv')
testData = pd.DataFrame(columns).replace(["NULL"], np.nan)

testData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,Male,Yes,No,7,6,6,0,6,1,0,...,,,,,,,,,,No
1,Female,Yes,No,7,7,7,0,7,1,0,...,,,,,,,,,,No
2,Female,Yes,No,7,6,6,0,6,1,0,...,,,,,,,,,,No
3,Male,Yes,No,7,5,5,0,5,1,0,...,,,,,,,,,,No
4,Male,Yes,No,7,8,6,2,8,0.75,0.25,...,,,,,,,,,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,Male,Yes,No,17,13,13,0,13,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No
1391,Female,Yes,Yes,17,9,9,0,9,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No
1392,Male,Yes,Yes,17,10,10,0,10,1,0,...,35,0.11428571428571,0,27,3,2,27,0.11111111111111,0.074074074074074,No
1393,Female,Yes,Yes,17,11,11,0,11,1,0,...,35,0.11428571428571,0,26,4,0,26,0.15384615384615,0,No


In [15]:
# Convert the tablet frame to numeric values; cleanData modifies `testData` in place.
cleanData(testData)
testData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,,,,,,,,,,0
1,2,1,0,7.0,7.0,7.0,0.0,7.0,1.00,0.00,...,,,,,,,,,,0
2,2,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,,,,,,,,,,0
3,1,1,0,7.0,5.0,5.0,0.0,5.0,1.00,0.00,...,,,,,,,,,,0
4,1,1,0,7.0,8.0,6.0,2.0,8.0,0.75,0.25,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,1,1,0,17.0,13.0,13.0,0.0,13.0,1.00,0.00,...,35.0,0.114286,0.0,26.0,4.0,0.0,26.0,0.153846,0.000000,0
1391,2,1,1,17.0,9.0,9.0,0.0,9.0,1.00,0.00,...,35.0,0.114286,0.0,26.0,4.0,0.0,26.0,0.153846,0.000000,0
1392,1,1,1,17.0,10.0,10.0,0.0,10.0,1.00,0.00,...,35.0,0.114286,0.0,27.0,3.0,2.0,27.0,0.111111,0.074074,0
1393,2,1,1,17.0,11.0,11.0,0.0,11.0,1.00,0.00,...,35.0,0.114286,0.0,26.0,4.0,0.0,26.0,0.153846,0.000000,0


In [16]:
# Mean-impute every column that has at least one missing value.
stateOfNUll = testData.isnull().any()
null_columns = stateOfNUll[stateOfNUll].index

if len(null_columns) > 0:
    # Per-column means, rounded to 4 decimals. A column that is entirely missing
    # has a NaN mean and therefore stays NaN.
    column_means = testData[null_columns].mean().round(4)
    # Assign the filled columns back to the frame. Calling fillna(inplace=True) on
    # testData[col] acts on an intermediate object: pandas warns about that chained
    # pattern, and with Copy-on-Write enabled it leaves testData unchanged.
    testData[null_columns] = testData[null_columns].fillna(column_means)

testData

Unnamed: 0,Gender,Nativelang,Otherlang,Age,Clicks1,Hits1,Misses1,Score1,Accuracy1,Missrate1,...,Score31,Accuracy31,Missrate31,Clicks32,Hits32,Misses32,Score32,Accuracy32,Missrate32,Dyslexia
0,1,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
1,2,1,0,7.0,7.0,7.0,0.0,7.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
2,2,1,0,7.0,6.0,6.0,0.0,6.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
3,1,1,0,7.0,5.0,5.0,0.0,5.0,1.00,0.00,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
4,1,1,0,7.0,8.0,6.0,2.0,8.0,0.75,0.25,...,46.8333,0.386300,0.5439,52.51,2.7851,7.9719,52.51,0.970900,2.225400,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1390,1,1,0,17.0,13.0,13.0,0.0,13.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0
1391,2,1,1,17.0,9.0,9.0,0.0,9.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0
1392,1,1,1,17.0,10.0,10.0,0.0,10.0,1.00,0.00,...,35.0000,0.114286,0.0000,27.00,3.0000,2.0000,27.00,0.111111,0.074074,0
1393,2,1,1,17.0,11.0,11.0,0.0,11.0,1.00,0.00,...,35.0000,0.114286,0.0000,26.00,4.0000,0.0000,26.00,0.153846,0.000000,0


In [17]:
# Collect the columns that still contain missing values; the original note lists
# 'Clicks29', 'Hits29', 'Misses29', 'Score29', 'Accuracy29', 'Missrate29' here.
cols_with_missing = testData.columns[testData.isnull().any()].tolist()
# Remove those columns from the tablet data.
reduced_testData = testData.drop(columns=cols_with_missing)

In [18]:
# Keep exactly the columns used for training: `temp` is the column list built
# earlier in this notebook when the desktop frame was reduced (demographics,
# the Dyslexia label and the six metrics of each selected question).
reduced_testData = reduced_testData[temp]

In [19]:
# (Disabled) Alternative approach: replace the missing values of the question-29 columns with 0
# testData['Clicks29'].replace(np.nan, 0, inplace = True)
# testData['Hits29'].replace(np.nan, 0, inplace = True)
# testData['Misses29'].replace(np.nan, 0, inplace = True)
# testData['Score29'].replace(np.nan, 0, inplace = True)
# testData['Accuracy29'].replace(np.nan, 0, inplace = True)
# testData['Missrate29'].replace(np.nan, 0, inplace = True)

# #print any columns with null value
# stateOfNUll= testData.isnull().any()
# t = 0
# for state in stateOfNUll : 
#     if(state) : 
#         print(stateOfNUll.index[t])
#     t = t+1      


In [20]:
# Tablet target vector: the encoded Dyslexia label.
yTest = reduced_testData['Dyslexia']
# Tablet feature matrix: every other selected column.
XTest = reduced_testData.drop(columns='Dyslexia')

In [21]:
# Normalize the tablet features with sc_x, the scaler already fitted on the desktop training split.
# NOTE: this cell overwrites its own input, so re-running it would scale XTest a second time.
XTest = sc_x.transform(XTest)

In [22]:
# Train a forest on the desktop training split and evaluate it on the tablet data.
# A fixed random_state keeps the reported accuracy reproducible across re-runs.
rfc2 = RandomForestClassifier(random_state=42)
rfc2.fit(X_train, y_train)
y_pred = rfc2.predict(XTest)
print("Accuracy:", metrics.accuracy_score(yTest, y_pred))

Accuracy: 0.8939068100358423
