NOTES/TODO:
1. Standardize essay sets (each essay set should have mean 0 and standard deviation of 1)
2. Which criterion do we use for evaluating our classifier?  I vote kappa.

In [26]:
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# TODO if needed, include the words and stopwords imports
# HOWEVER, to use them, you will need to download nltk stuff first if not done already
# To do so, open a python shell (i.e. go to terminal and enter python), and then type
#
# import nltk
# nltk.download()

# After this, select the words and stopwords corpuses, and download them

#import nltk
#from nltk.corpus import stopwords
#from nltk.corpus import words

# Regular expressions might be useful
import re

# Beautiful soup might be useful
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [27]:
# Read in training data
# Note that for essay set 2, score becomes average of 2 domain scores
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('data/training_set_rel3.tsv', delimiter='\t', usecols=train_cols,
                       converters={'domain1_score': float})

# Rows that carry a second domain score; the assertion checks they all belong to essay set 2
has_domain2 = train_df['domain2_score'].notnull()
assert (train_df.loc[has_domain2, 'essay_set'] == 2).all()

# Vectorized replacement for the per-row get_value/set_value loop (that API was
# removed from pandas): average the two domain scores where a second one exists,
# otherwise keep domain1_score untouched.
domain_avg = (train_df['domain1_score'] + train_df['domain2_score']) / 2.0
train_df['domain1_score'] = train_df['domain1_score'].where(~has_domain2, domain_avg)

train_df = train_df.drop('domain2_score', axis=1)
train_df = train_df.rename(columns={'domain1_score': 'score'})
print(train_df.head())

   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   score  
0    8.0  
1    9.0  
2    7.0  
3   10.0  
4    8.0  


In [28]:
# Sanity check: report whether any cell of the training frame is null
train_has_nulls = train_df.isnull().values.any()
print('Data is missing!' if train_has_nulls else 'No missing data!')

No missing data!


In [29]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('data/valid_set.tsv', delimiter='\t', usecols=valid_cols)

# scores are stored in separate data set, we'll put them in same one
valid_scores = pd.read_csv('data/valid_sample_submission_5_column.csv', delimiter=',',
                          converters={'predicted_score': float})

# Build a prediction_id -> predicted_score lookup once instead of scanning
# valid_scores for every essay. If an id appears more than once, the first
# occurrence wins, matching the original's use of row.index[0].
score_by_id = (valid_scores.drop_duplicates(subset='prediction_id')
                           .set_index('prediction_id')['predicted_score'])

dom1_score = valid_df['domain1_predictionid'].map(score_by_id)
# Fail loudly (as the original lookup did) if any domain-1 id has no score
assert dom1_score.notnull().all(), 'domain1 prediction id missing from valid_scores'

# put each score in our data set, and make sure to handle essay set 2
has_domain2 = valid_df['domain2_predictionid'].notnull()
assert (valid_df.loc[has_domain2, 'essay_set'] == 2).all()
dom2_score = valid_df['domain2_predictionid'].map(score_by_id)
assert dom2_score[has_domain2].notnull().all(), 'domain2 prediction id missing from valid_scores'

# Average the two domain scores where a second prediction id exists,
# otherwise use the domain-1 score as is.
valid_df['score'] = dom1_score.where(~has_domain2, (dom1_score + dom2_score) / 2.0)

valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)
print(valid_df.head())

   essay_id  essay_set                                              essay  \
0      1788          1  Dear @ORGANIZATION1, @CAPS1 more and more peop...   
1      1789          1  Dear @LOCATION1 Time @CAPS1 me tell you what I...   
2      1790          1  Dear Local newspaper, Have you been spending a...   
3      1791          1  Dear Readers, @CAPS1 you imagine how life woul...   
4      1792          1  Dear newspaper, I strongly believe that comput...   

   score  
0    7.0  
1    8.0  
2    9.0  
3    9.0  
4    9.0  


In [30]:
# Sanity check: report whether any cell of the validation frame is null
valid_has_nulls = valid_df.isnull().values.any()
print('Data is missing!' if valid_has_nulls else 'No missing data!')

No missing data!


In [31]:
# returns a copy of old_df, with essays cleaned for count vectorizer
def vectorizer_clean(old_df):
    """Return a copy of ``old_df`` whose 'essay' column is normalised text.

    Each essay has every character that is not a letter (a-z, A-Z), a digit
    or whitespace removed, is lower-cased, and has runs of whitespace
    collapsed to single spaces (leading/trailing whitespace is dropped).

    The input frame is not modified. The cleaning is applied element-wise
    through the Series itself, so it works for any index (not only a
    0..n-1 range index) and leaves the index and other columns unchanged.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame with an 'essay' column of strings.

    Returns
    -------
    pandas.DataFrame
        A copy of ``old_df`` with the cleaned 'essay' column.
    """
    def _clean(essay):
        # Strip disallowed characters, then lower-case and re-join on single spaces
        stripped = re.sub(r'[^a-zA-Z\d\s]', '', essay)
        return " ".join(stripped.lower().split())

    new_df = old_df.copy()
    new_df['essay'] = new_df['essay'].map(_clean)
    return new_df

In [32]:
# Clean the training essays for the vectorizer and preview the first rows
# (each essay becomes lowercase words separated by single spaces)
vectorizer_train = vectorizer_clean(train_df)
print(vectorizer_train.head())

   essay_id  essay_set                                              essay  \
0         1          1  dear local newspaper i think effects computers...   
1         2          1  dear caps1 caps2 i believe that using computer...   
2         3          1  dear caps1 caps2 caps3 more and more people us...   
3         4          1  dear local newspaper caps1 i have found that m...   
4         5          1  dear location1 i know having computers has a p...   

   score  
0    8.0  
1    9.0  
2    7.0  
3   10.0  
4    8.0  


In [33]:
# Clean the validation essays for the vectorizer and preview the first rows
# (each essay becomes lowercase words separated by single spaces)
vectorizer_valid = vectorizer_clean(valid_df)
print(vectorizer_valid.head())

   essay_id  essay_set                                              essay  \
0      1788          1  dear organization1 caps1 more and more people ...   
1      1789          1  dear location1 time caps1 me tell you what i t...   
2      1790          1  dear local newspaper have you been spending a ...   
3      1791          1  dear readers caps1 you imagine how life would ...   
4      1792          1  dear newspaper i strongly believe that compute...   

   score  
0    7.0  
1    8.0  
2    9.0  
3    9.0  
4    9.0  
