# CS 5665 - Introduction to Data Science

## Preprocessing

The data used in this project is from [this](https://www.kaggle.com/c/linking-writing-processes-to-writing-quality). 

### Introducing the dataset and packages

In [11]:
# Import packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, cross_validate, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [12]:
# Import dataset
train_df = pd.read_csv('../data/train_logs.csv')
train_scores = pd.read_csv('../data/train_scores.csv')
test_logs = pd.read_csv('../data/test_logs.csv')

### Cleaning Data
Before we can split, we need to convert the keystroke measurements to the appropriate metrics for each user. Due to these metric types originally being event dependent, we will be dropping 'cursor_position', 'down_event', 'up_event', and 'text_change.' 

In [15]:
cleaned_data = train_df.drop(columns=["cursor_position"]).groupby('id').mean()
cleaned_data = cleaned_data.merge(train_scores, on='id')

cleaned_test = test_logs.drop(columns=['cursor_position']).groupby('id').mean()

  cleaned_data = train_df.drop(columns=["cursor_position"]).groupby('id').mean()
  cleaned_test = test_logs.drop(columns=['cursor_position']).groupby('id').mean()


### Preprocessing



In [20]:
# Identify numeric feature statistics
cleaned_data.info()
cleaned_data.describe(include='all')

# Checking for missing values
cleaned_data.describe(include="all").loc['count']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2471 entries, 0 to 2470
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           2471 non-null   object 
 1   event_id     2471 non-null   float64
 2   down_time    2471 non-null   float64
 3   up_time      2471 non-null   float64
 4   action_time  2471 non-null   float64
 5   word_count   2471 non-null   float64
 6   score        2471 non-null   float64
dtypes: float64(6), object(1)
memory usage: 154.4+ KB


id               2471
event_id       2471.0
down_time      2471.0
up_time        2471.0
action_time    2471.0
word_count     2471.0
score          2471.0
Name: count, dtype: object

In [21]:
passthrough_variables = ['id']
scale_variables = ['event_id', 'down_time', 'up_time', 'action_time', 'word_count', 'score']

In [22]:
preprocessor = make_column_transformer(
    ("passthrough", passthrough_variables),
    (StandardScaler(), scale_variables)
)

In [24]:
transformed = preprocessor.fit_transform(cleaned_data)
column_names = passthrough_variables + scale_variables

In [25]:
X_trained_transformed = pd.DataFrame(transformed, columns=column_names)
X_trained_transformed

Unnamed: 0,id,event_id,down_time,up_time,action_time,word_count,score
0,001519c8,-0.535194,0.271578,0.271636,0.675947,-0.813732,-0.206152
1,0022f953,-0.600445,-0.886269,-0.886231,0.508968,-0.184298,-0.206152
2,0042269b,0.465103,0.202355,0.202362,0.078257,-0.04529,2.233515
3,0059420b,-1.169328,0.051144,0.051221,0.908302,-1.096145,-1.669952
4,0075873a,-0.551665,-0.202448,-0.202364,0.995227,-0.848699,0.281781
...,...,...,...,...,...,...,...
2466,ffb8c745,0.847104,-0.122471,-0.122452,0.22759,0.664639,-0.206152
2467,ffbef7e5,-0.50542,0.249728,0.249668,-0.733582,0.280276,0.281781
2468,ffccd6fd,-0.214643,1.61052,1.610443,-1.009862,-0.473954,-2.157886
2469,ffec5b38,-0.101246,-0.683536,-0.683577,-0.44273,0.083186,1.257648


### Splitting Data

In [26]:
# for run time consistency, set seed
np.random.seed(42)
train_df, test_df = train_test_split(X_trained_transformed, test_size=0.2, random_state=42)


### Dummy Submission

In [27]:
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(train_df.drop(columns=['score']), train_df['score'])

In [28]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction ['score']= dummy_clf.predict(cleaned_test)
print(prediction.info())

prediction.to_csv('../data/prediction.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      3 non-null      object 
 1   score   3 non-null      float64
dtypes: float64(1), object(1)
memory usage: 180.0+ bytes
None


## Dummy Submission Checkpoint 2 (1)

In [29]:
dummy_clf_2_1 = DummyClassifier(strategy="uniform")
dummy_clf_2_1.fit(train_df.drop(columns=['score']), train_df['score'])

In [37]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction ['score']= dummy_clf.predict(cleaned_test)

prediction.to_csv('../data/prediction_2_1.csv', index=False)

prediction

Unnamed: 0,id,score
0,0000aaaa,0.281781
1,2222bbbb,0.281781
2,4444cccc,0.281781


## Dummy Submission Checkpoint 2 (2)

In [38]:
dummy_clf_2_2 = DummyClassifier(strategy="prior")
dummy_clf_2_2.fit(train_df.drop(columns=['score']), train_df['score'])

In [40]:
# Predict
prediction = pd.DataFrame()
prediction['id'] = cleaned_test.index
prediction ['score']= dummy_clf.predict(cleaned_test)

prediction.to_csv('../data/prediction_2_2.csv', index=False)

prediction

Unnamed: 0,id,score
0,0000aaaa,0.281781
1,2222bbbb,0.281781
2,4444cccc,0.281781
