NOTES/TODO:
1. How do we deal with standardized scores in model evaluation?
2. Which criterion do we use for evaluating our classifier? (Quadratic weighted kappa or Spearman's rank correlation)

In [76]:

import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# TODO: if needed, enable the `words` and `stopwords` imports below.
# To use them, the NLTK corpora must be downloaded first (if not done already).
# To do so, open a Python shell (i.e. go to a terminal and enter `python`), then type:
#
# import nltk
# nltk.download()

# After this, select the `words` and `stopwords` corpora and download them.

#import nltk
#from nltk.corpus import stopwords
#from nltk.corpus import words

# Regular expressions might be useful
import re

# Beautiful soup might be useful
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

# for modeling
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import LogisticRegressionCV as LogRegCV
from sklearn.cross_validation import cross_val_predict 
from sklearn import cross_validation
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
%matplotlib inline


In [77]:
def append_regularized_scores(old_df):
    """Return a copy of ``old_df`` with a per-essay-set ``std_score`` column.

    Within each ``essay_set`` group the score is mean-centered and divided by
    the group's score range: ``(score - mean) / (max - min)``.

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame with ``essay_set`` and ``score`` columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``old_df`` with an added ``std_score`` column, aligned to
        the original row index.
    """
    def _center_and_scale(scores):
        centered = scores - scores.mean()
        spread = scores.max() - scores.min()
        # When every score in the set is identical, ``centered`` is already
        # all zeros; returning it avoids a 0/0 division that would yield NaN.
        if spread == 0:
            return centered
        return centered / spread

    new_df = old_df.copy()
    # ``transform`` always returns a result aligned to the original index, so
    # the assignment does not depend on how ``apply`` labels its output.
    new_df['std_score'] = new_df.groupby('essay_set')['score'].transform(_center_and_scale)
    return new_df

In [82]:
# Read in training data.
# Only some rows carry a second domain score (asserted below to be essay set 2);
# for those rows the combined score is the SUM of the two domain scores.
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('data/training_set_rel3.tsv', delimiter='\t', usecols=train_cols)

# Rows that have a second domain score must all belong to essay set 2.
has_domain2 = train_df['domain2_score'].notnull()
assert (train_df.loc[has_domain2, 'essay_set'] == 2).all()

# Vectorized replacement for the row-by-row get_value/set_value loop:
# a missing domain2 score contributes 0, and the result is cast back to the
# original domain1 dtype so the NaNs in domain2 do not turn scores into floats.
combined_score = train_df['domain1_score'] + train_df['domain2_score'].fillna(0)
train_df['domain1_score'] = combined_score.astype(train_df['domain1_score'].dtype)

train_df = train_df.drop('domain2_score', axis=1)
train_df = train_df.rename(columns={'domain1_score': 'score'})
train_df = append_regularized_scores(train_df)
print(train_df.head())

   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   score  std_score  
0      8  -0.052832  
1      9   0.047168  
2      7  -0.152832  
3     10   0.147168  
4      8  -0.052832  


In [83]:
# Report whether any cell of the training frame is null
if not train_df.isnull().values.any():
    print('No missing training data!')
else:
    print('Training data is missing!')

No missing training data!


In [86]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('data/valid_set.tsv', delimiter='\t', usecols=valid_cols)

# Scores are stored in a separate file, keyed by prediction id.
# NOTE(review): the file name suggests a sample submission rather than true
# labels -- confirm these are the scores we want to evaluate against.
valid_scores = pd.read_csv('data/valid_sample_submission_5_column.csv', delimiter=',')

# Build the prediction_id -> predicted_score lookup once instead of filtering
# valid_scores for every essay. Keeping the first row per id matches the
# previous `row.index[0]` behaviour.
score_by_prediction_id = (
    valid_scores.drop_duplicates(subset='prediction_id')
                .set_index('prediction_id')['predicted_score']
)

dom1_scores = valid_df['domain1_predictionid'].map(score_by_prediction_id)
dom2_scores = valid_df['domain2_predictionid'].map(score_by_prediction_id)

# Rows that have a second prediction id must all belong to essay set 2.
has_domain2 = valid_df['domain2_predictionid'].notnull()
assert (valid_df.loc[has_domain2, 'essay_set'] == 2).all()

# Every referenced prediction id must exist in the score file (the old loop
# raised on a missing id; fail loudly here too instead of silently using NaN/0).
assert dom1_scores.notnull().all()
assert dom2_scores[has_domain2].notnull().all()

# Combined score: domain 1 plus domain 2 where present (essay set 2 only).
# Cast to int64 to match the integer score column the loop used to fill.
valid_df['score'] = (dom1_scores + dom2_scores.fillna(0)).astype('int64')

valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)
print(valid_df.head())

   essay_id  essay_set                                              essay  \
0      1788          1  Dear @ORGANIZATION1, @CAPS1 more and more peop...   
1      1789          1  Dear @LOCATION1 Time @CAPS1 me tell you what I...   
2      1790          1  Dear Local newspaper, Have you been spending a...   
3      1791          1  Dear Readers, @CAPS1 you imagine how life woul...   
4      1792          1  Dear newspaper, I strongly believe that comput...   

   score  
0      7  
1      8  
2      9  
3      9  
4      9  


In [87]:
# Report whether any cell of the validation frame is null
if not valid_df.isnull().values.any():
    print('No missing validation data!')
else:
    print('Validation data is missing!')

No missing validation data!


In [88]:
def vectorizer_clean(old_df):
    """Return a copy of ``old_df`` with the ``essay`` column cleaned for vectorizing.

    Each essay has every character other than ASCII letters, digits and
    whitespace removed, is lower-cased, and has runs of whitespace collapsed
    to single spaces (leading/trailing whitespace is dropped).

    Parameters
    ----------
    old_df : pandas.DataFrame
        Frame with a string ``essay`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``old_df`` with the same index and a cleaned ``essay`` column.
    """
    new_df = old_df.copy()
    # Build the whole column at once: the assignment is positional, so this
    # works for any index (the old loop read by position but wrote by label).
    new_df['essay'] = [
        " ".join(re.sub(r'[^a-zA-Z\d\s]', '', essay).lower().split())
        for essay in new_df['essay']
    ]
    return new_df

In [89]:
# Clean the training essays (lowercase words separated by single spaces)
# and preview the first rows of the result
vectorizer_train = vectorizer_clean(train_df)
print(vectorizer_train.head())

   essay_id  essay_set                                              essay  \
0         1          1  dear local newspaper i think effects computers...   
1         2          1  dear caps1 caps2 i believe that using computer...   
2         3          1  dear caps1 caps2 caps3 more and more people us...   
3         4          1  dear local newspaper caps1 i have found that m...   
4         5          1  dear location1 i know having computers has a p...   

   score  std_score  
0      8  -0.052832  
1      9   0.047168  
2      7  -0.152832  
3     10   0.147168  
4      8  -0.052832  


In [90]:
# Clean the validation essays (lowercase words separated by single spaces)
# and preview the first rows of the result
vectorizer_valid = vectorizer_clean(valid_df)
print(vectorizer_valid.head())

   essay_id  essay_set                                              essay  \
0      1788          1  dear organization1 caps1 more and more people ...   
1      1789          1  dear location1 time caps1 me tell you what i t...   
2      1790          1  dear local newspaper have you been spending a ...   
3      1791          1  dear readers caps1 you imagine how life would ...   
4      1792          1  dear newspaper i strongly believe that compute...   

   score  
0      7  
1      8  
2      9  
3      9  
4      9  


In [91]:
# TF-IDF features over the essay text, with English stop words removed
vectorizer = TfidfVectorizer(stop_words='english')

# Cleaned essay strings, one per training row
train_essays = vectorizer_train['essay'].values

# Fit the vocabulary on the training essays, then convert the resulting
# TF-IDF weight matrix (not raw word counts) to a dense array
train_vectors = vectorizer.fit_transform(train_essays)
train_vectors = train_vectors.toarray()

# Encode the normalized scores as 6-byte strings so they can serve as class labels.
# NOTE(review): this truncates (does not round) the decimal text, and negative
# values keep one digit fewer because of the sign -- confirm this is intended.
train_std_scores = vectorizer_train['std_score'].values.astype("|S6")
print(train_std_scores[:5])

['-0.052' '0.0471' '-0.152' '0.1471' '-0.052']


In [57]:
# Fit an L2-penalized logistic regression on the training TF-IDF features,
# using the string-encoded normalized scores as class labels.
# NOTE(review): n_jobs may have no effect with the liblinear solver -- confirm
# against the installed scikit-learn version.
logistic = LogReg(penalty='l2', solver='liblinear', n_jobs=4).fit(train_vectors, train_std_scores)

# Project the cleaned validation essays with the vocabulary fitted on the training set
valid_vectors = vectorizer.transform(vectorizer_valid['essay'].values)
valid_vectors = valid_vectors.toarray()

# Predictions are still normalized-score labels; they will likely need to be
# mapped back to raw scores before computing quadratic weighted kappa
valid_pred_std_scores = logistic.predict(valid_vectors)

In [58]:
# Quadratic weighted kappa can be taken from Ben Hamner's publicly available metrics script:
# https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/quadratic_weighted_kappa.py
# Thanks and credit to Ben Hamner.
# However, it relies on integer ratings, so it is unclear whether we can apply it to our
# normalized scores directly; we may need to convert predictions back to raw integer scores first.