# CS 5665 - Introduction to Data Science

## Preprocessing

The data used in this project is from [this](https://www.kaggle.com/c/linking-writing-processes-to-writing-quality). 

### Introducing the dataset and packages

In [18]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [122]:
# Import dataset
train_df = pd.read_csv('../data/train_logs.csv')
train_scores = pd.read_csv('../data/train_scores.csv')
test_logs = pd.read_csv('../data/test_logs.csv')

In [132]:
train_df.head(10)

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1
5,001519c8,6,107296,107400,104,Input,q,q,q,3,1
6,001519c8,7,107469,107596,127,Input,q,q,q,4,1
7,001519c8,8,107659,107766,107,Input,q,q,q,5,1
8,001519c8,9,107743,107852,109,Input,q,q,q,6,1
9,001519c8,10,107840,107978,138,Input,Space,Space,,7,1


### Cleaning Data and Feature Engineering
Before we can split, we need to convert the keystroke measurements to the appropriate metrics for each user. Due to these metric types originally being event dependent, we will be dropping 'cursor_position', 'down_event', 'up_event', and 'text_change.' 

In [137]:
cleaned_data = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum() 
mins_data = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby("id").min()
maxes_data = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby("id").max()

cleaned_data['min_down_time'] = mins_data['down_time']
cleaned_data['max_up_time'] = maxes_data['up_time']
cleaned_data['min_action_time'] = mins_data['action_time']
cleaned_data['max_action_time'] = maxes_data['action_time']
cleaned_data = cleaned_data.merge(train_scores, on='id')


cleaned_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum()
mins_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby("id").min() 
maxes_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby("id").max()

cleaned_test['min_down_time'] = mins_test['down_time']
cleaned_test['max_up_time'] = maxes_test['up_time']
cleaned_test['min_action_time'] = mins_test['action_time']
cleaned_test['max_action_time'] = maxes_test['action_time']
cleaned_test = cleaned_test.merge(train_scores, on='id')

cleaned_data.describe()

  cleaned_data = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum()
  cleaned_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum()


Unnamed: 0,event_id,down_time,up_time,action_time,word_count,min_down_time,max_up_time,min_action_time,max_action_time,space_count,score
count,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,0.0,2471.0
mean,7033772.0,2699550000.0,2699883000.0,333667.5,787415.1,42393.87,1769537.0,0.116957,2989.163497,,3.711251
std,7095160.0,1727236000.0,1727350000.0,157520.2,729865.4,165828.4,276028.3,1.289195,13084.145614,,1.024937
min,34453.0,17238960.0,17256510.0,13452.0,5044.0,106.0,229548.0,0.0,153.0,,0.5
25%,2406818.0,1552195000.0,1552490000.0,211148.0,294527.5,2131.5,1754901.0,0.0,515.0,,3.0
50%,4750903.0,2315883000.0,2316161000.0,304951.0,562901.0,21149.0,1792315.0,0.0,1278.0,,4.0
75%,9251451.0,3441110000.0,3441537000.0,424814.0,1018560.0,45925.0,1807774.0,0.0,2648.5,,4.5
max,82902130.0,24244300000.0,24244940000.0,1210508.0,6546254.0,7452424.0,8313707.0,32.0,447470.0,,6.0


In order to make the model as simple and linear as possible, each essay's event IDs, down/up times, action times, word counts, and scores have been made into one table by combining the metrics by essay ID (some by max, min, or sum, which is functionally the same as mean in these cases). This will result in massive losses in precision capability but should still produce reasonably good results while being massively faster to work with.

### Preprocessing



In [21]:
# Identify numeric feature statistics
cleaned_data.info()
cleaned_data.describe(include='all')

# Checking for missing values
cleaned_data.describe(include="all").loc['count']
cleaned_data["action_time"].describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2471 entries, 0 to 2470
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               2471 non-null   object 
 1   event_id         2471 non-null   int64  
 2   down_time        2471 non-null   int64  
 3   up_time          2471 non-null   int64  
 4   action_time      2471 non-null   int64  
 5   word_count       2471 non-null   int64  
 6   min_down_time    2471 non-null   int64  
 7   max_up_time      2471 non-null   int64  
 8   min_action_time  2471 non-null   int64  
 9   max_action_time  2471 non-null   int64  
 10  score            2471 non-null   float64
dtypes: float64(1), int64(9), object(1)
memory usage: 231.7+ KB


count    2.471000e+03
mean     3.336675e+05
std      1.575202e+05
min      1.345200e+04
25%      2.111480e+05
50%      3.049510e+05
75%      4.248140e+05
max      1.210508e+06
Name: action_time, dtype: float64

In [63]:
drop_variables = ['id']
passthrough_variables = ['score']
scale_variables = ['event_id', 'down_time', 'up_time', 'action_time', 'word_count',
                   'min_down_time', 'max_up_time', 'min_action_time', 'max_action_time']

In [64]:
preprocessor = make_column_transformer(
    ('drop', drop_variables),
    ("passthrough", passthrough_variables),
    (StandardScaler(), scale_variables)
)

In [65]:
transformed = preprocessor.fit_transform(cleaned_data)
column_names = passthrough_variables + scale_variables

In [66]:
X_trained_transformed = pd.DataFrame(transformed, columns=column_names)
X_trained_transformed

Unnamed: 0,score,event_id,down_time,up_time,action_time,word_count,min_down_time,max_up_time,min_action_time,max_action_time
0,3.5,-0.530521,-0.307346,-0.307347,-0.231284,-0.630137,-0.228402,0.117518,-0.090739,-0.055817
1,3.5,-0.566908,-0.825925,-0.825904,-0.370037,-0.464609,-0.070997,0.070412,-0.090739,-0.094115
2,6.0,0.214492,0.421043,0.421065,0.555810,0.024893,-0.228915,0.007724,-0.090739,0.001211
3,2.0,-0.820785,-0.855492,-0.855519,-0.914808,-0.858118,-0.006025,-1.322843,-0.090739,-0.166889
4,4.0,-0.539847,-0.517724,-0.517701,-0.126774,-0.645222,0.217595,-0.387956,-0.090739,-0.174916
...,...,...,...,...,...,...,...,...,...,...
2466,3.5,0.591743,0.456797,0.456863,1.054063,0.585768,-0.120190,0.080123,-0.090739,0.025520
2467,4.0,-0.513420,-0.293632,-0.293682,-0.758447,-0.283245,-0.124623,0.107390,-0.090739,-0.141051
2468,1.5,-0.330046,0.616673,0.616573,-0.648223,-0.417585,-0.114068,0.687843,-0.090739,-0.185389
2469,5.0,-0.250485,-0.480910,-0.480904,-0.280837,-0.164217,-0.135763,-0.945867,-0.090739,-0.122399


### Splitting Data

In [67]:
# for run time consistency, set seed
np.random.seed(42)
train_df, test_df = train_test_split(X_trained_transformed, test_size=0.2, random_state=42)
y_train = train_df['score']
X_train = train_df.drop(columns='score')
y_test = test_df['score']
X_test = test_df.drop(columns='score')



### Dummy Submission

In [27]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_df.drop(columns=['score']), train_df['score'])

In [28]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction ['score']= dummy_clf.predict(cleaned_test)
print(prediction.info())

prediction.to_csv('../data/prediction.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      0 non-null      object 
 1   score   0 non-null      float64
dtypes: float64(1), object(1)
memory usage: 0.0+ bytes
None


## Dummy Submission Checkpoint 2 (1)

In [29]:
dummy_clf_2_1 = DummyClassifier(strategy="uniform")
dummy_clf_2_1.fit(train_df.drop(columns=['score']), train_df['score'])

In [30]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction ['score']= dummy_clf.predict(cleaned_test)

prediction.to_csv('../data/prediction_2_1.csv', index=False)

prediction

Unnamed: 0,id,score


## Dummy Submission Checkpoint 2 (2)

In [31]:
dummy_clf_2_2 = DummyClassifier(strategy="prior")
dummy_clf_2_2.fit(train_df.drop(columns=['score']), train_df['score'])

In [32]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction['score'] = dummy_clf.predict(cleaned_test)

prediction.to_csv('../data/prediction_2_2.csv', index=False)

prediction

Unnamed: 0,id,score


# Model Selection
We will compare the efficiency of each of the following regression models:
- Ordinary Least Squares
- Ridge Regression
- Support Vector
- Decision Tree
- Random Forest

In [106]:
from sklearn import svm
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
models = {
    linear_model.LinearRegression(),
    linear_model.Ridge(),
    DecisionTreeRegressor(max_depth=15),
    RandomForestRegressor(n_estimators=50, max_depth=7, random_state=42),
    svm.SVR(),
    KNeighborsRegressor(n_neighbors=13),
    AdaBoostRegressor(random_state=42),
    GradientBoostingRegressor(),
}


In [107]:
print("MSE:")
for model in models:
    model.fit(X_train, y_train)
    print(type(model).__name__, 1 - model.score(X_test, y_test))

Ridge 0.5983881484888586
KNeighborsRegressor 0.526243120665214
GradientBoostingRegressor 0.4749293740233622
DecisionTreeRegressor 1.0753392078342814
RandomForestRegressor 0.4544798640913419
SVR 0.482053007288904
AdaBoostRegressor 0.5173005645207474
LinearRegression 0.5984132917496133


In [118]:
forests = [
        RandomForestRegressor(n_estimators=50, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=60, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=70, max_depth=7, random_state=42), #Best RFR so far
        RandomForestRegressor(n_estimators=80, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=20, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=25, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=30, max_depth=7, random_state=42),
        RandomForestRegressor(n_estimators=40, max_depth=7, random_state=42),
]

In [120]:
print("MSE, lower is better")
for i in range(len(forests)):
    f = forests[i]
    f.fit(X_train, y_train)
    print(i, 1 - f.score(X_test, y_test))

MSE, lower is better
0 0.4543232772860202
1 0.4544449942671681
2 0.4537383120839711
3 0.45409134540232
4 0.4661741217305799
5 0.46521232275502866
6 0.4585330277949369
7 0.45657349260505287


A MSE of .454 when the scale is from 0 to 6 is not too bad considering how few features have been used, and especially since it was reduced from time-series to per-essay. 