# CS 5665 - Introduction to Data Science

## Preprocessing

The data used in this project is from [this](https://www.kaggle.com/c/linking-writing-processes-to-writing-quality). 

### Introducing the dataset and packages

In [130]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


In [131]:
# Import dataset
train_df = pd.read_csv('../data/train_logs.csv')
train_scores = pd.read_csv('../data/train_scores.csv')
test_logs = pd.read_csv('../data/test_logs.csv')

### Cleaning Data
Before we can split, we need to convert the keystroke measurements to the appropriate metrics for each user. Due to these metric types originally being event dependent, we will be dropping 'cursor_position', 'down_event', 'up_event', and 'text_change.' 

In [132]:
cleaned_train = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum()
cleaned_train = cleaned_train.merge(train_scores, on='id')

cleaned_prediction_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum().reset_index()

  cleaned_train = train_df.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum()
  cleaned_prediction_test = test_logs.drop(columns=['cursor_position','down_event', 'up_event','text_change']).groupby('id').sum().reset_index()


### Preprocessing



In [133]:
# Identify numeric feature statistics
cleaned_train.info()
cleaned_train.describe(include='all')

# Checking for missing values
cleaned_train.describe(include="all").loc['count']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2471 entries, 0 to 2470
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           2471 non-null   object 
 1   event_id     2471 non-null   int64  
 2   down_time    2471 non-null   int64  
 3   up_time      2471 non-null   int64  
 4   action_time  2471 non-null   int64  
 5   word_count   2471 non-null   int64  
 6   score        2471 non-null   float64
dtypes: float64(1), int64(5), object(1)
memory usage: 154.4+ KB


id               2471
event_id       2471.0
down_time      2471.0
up_time        2471.0
action_time    2471.0
word_count     2471.0
score          2471.0
Name: count, dtype: object

In [134]:
passthrough_variables = ['id', 'score']
scale_variables = ['event_id', 'down_time', 'up_time', 'action_time', 'word_count']

In [135]:
preprocessor = make_column_transformer(
    ("passthrough", passthrough_variables),
    (StandardScaler(), scale_variables)
)

In [136]:
transformed = preprocessor.fit_transform(cleaned_train)
column_names = passthrough_variables + scale_variables

In [137]:
X_trained_transformed = pd.DataFrame(transformed, columns=column_names)
X_trained_transformed

Unnamed: 0,id,score,event_id,down_time,up_time,action_time,word_count
0,001519c8,3.5,-0.530521,-0.307346,-0.307347,-0.231284,-0.630137
1,0022f953,3.5,-0.566908,-0.825925,-0.825904,-0.370037,-0.464609
2,0042269b,6.0,0.214492,0.421043,0.421065,0.55581,0.024893
3,0059420b,2.0,-0.820785,-0.855492,-0.855519,-0.914808,-0.858118
4,0075873a,4.0,-0.539847,-0.517724,-0.517701,-0.126774,-0.645222
...,...,...,...,...,...,...,...
2466,ffb8c745,3.5,0.591743,0.456797,0.456863,1.054063,0.585768
2467,ffbef7e5,4.0,-0.51342,-0.293632,-0.293682,-0.758447,-0.283245
2468,ffccd6fd,1.5,-0.330046,0.616673,0.616573,-0.648223,-0.417585
2469,ffec5b38,5.0,-0.250485,-0.48091,-0.480904,-0.280837,-0.164217


### Splitting Data

In [138]:
# for run time consistency, set seed
np.random.seed(42)
train_df, test_df = train_test_split(X_trained_transformed, test_size=0.2, random_state=42)


In [139]:
# Scaling prediction test data
passthrough_variables = ['id']
scale_variables = ['event_id', 'down_time', 'up_time', 'action_time', 'word_count']

preprocessor = make_column_transformer(
    ("passthrough", passthrough_variables),
    (StandardScaler(), scale_variables)
)

transformed = preprocessor.fit_transform(cleaned_prediction_test)
column_names = passthrough_variables + scale_variables

prediction_test_transformed = pd.DataFrame(transformed, columns=column_names)

### Dummy Submission

In [140]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_df.drop(columns=['score']), train_df['score'])

In [141]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_prediction_test.index
prediction ['score']= dummy_clf.predict(cleaned_prediction_test)
print(prediction.info())

prediction.to_csv('../data/prediction.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      3 non-null      int64  
 1   score   3 non-null      float64
dtypes: float64(1), int64(1)
memory usage: 180.0 bytes
None


## Non-Dummy Submission Checkpoint 2 (1)

In [142]:
clf_2_1 = LinearRegression()
clf_2_1.fit(train_df.drop(columns=['score', 'id']), train_df['score'])

In [143]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = prediction_test_transformed['id']
prediction ['score'] = clf_2_1.predict(prediction_test_transformed.drop(columns=['id']))

prediction.to_csv('../data/prediction_2_1.csv', index=False)

prediction

Unnamed: 0,id,score
0,0000aaaa,3.481461
1,2222bbbb,3.907209
2,4444cccc,3.731804


## Dummy Submission Checkpoint 2 (2)

In [144]:
clf_2_2 = RandomForestRegressor(random_state=48)
clf_2_2.fit(train_df.drop(columns=['score', 'id']), train_df['score'])

In [145]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = prediction_test_transformed['id']
prediction ['score'] = clf_2_2.predict(prediction_test_transformed.drop(columns=['id']))

prediction.to_csv('../data/prediction_2_2.csv', index=False)

prediction

Unnamed: 0,id,score
0,0000aaaa,2.28
1,2222bbbb,4.5
2,4444cccc,4.27
