NOTES/TODO:
1. Standardize essay sets (each essay set should have mean 0 and st.dev of 1)
2. Which crieterion do we use for evaluating our classifier?  I vote kappa.

In [52]:
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# for modeling
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import LogisticRegressionCV as LogRegCV
from sklearn.cross_validation import cross_val_predict 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cross_validation
from sklearn import random_projection
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score

# TODO if needed, include the words and stopwords imports
# HOWEVER, to use them, you will need to download nltk stuff first if not done already
# To do so, open a python shell (i.e. go to terminal and enter python), and then type
#
# import nltk
# nltk.download()

# After this, select the words and stopwords corpuses, and download them

#import nltk
#from nltk.corpus import stopwords
#from nltk.corpus import words

# Regular expressions might be useful
import re

# Beautiful soup might be useful
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
%matplotlib inline

In [53]:
# Read in training data
# TODO For now, only includes domain1_score, i.e. ignores essay set 2's dual-score system
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score']
train_df = pd.read_csv('data/training_set_rel3.tsv', delimiter='\t', usecols=train_cols)
train_df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",8
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",9
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",7
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",10
4,5,1,"Dear @LOCATION1, I know having computers has a...",8


In [54]:
# Show nothing is empty in training set
if train_df.isnull().any().any():
    print 'Data is missing!'
else:
    print 'No missing data!'

No missing data!


In [56]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('data/valid_set.tsv', delimiter='\t', usecols=valid_cols)
valid_df['score'] = pd.Series([0.0] * valid_df.shape[0], index=valid_df.index)

# scores are stored in separate data set, we'll put them in same one
valid_scores = pd.read_csv('data/valid_sample_submission_5_column.csv', delimiter=',',
                          converters={'predicted_score': lambda x: float(x)})

# put each score in our data set, and make sure to handle essay set 2
for i in xrange(valid_df.shape[0]):
    dom1_predid = valid_df.get_value(i, 'domain1_predictionid')
    row = valid_scores[valid_scores['prediction_id'] == dom1_predid]
    score = row.get_value(row.index[0], 'predicted_score')
    
    dom2_predid = valid_df.get_value(i, 'domain2_predictionid')
    if not np.isnan(dom2_predid):
        assert valid_df.get_value(i, 'essay_set') == 2
        rowB = valid_scores[valid_scores['prediction_id'] == dom2_predid]
        scoreB = rowB.get_value(rowB.index[0], 'predicted_score')
        score = (score + scoreB) / 2.0
        
    valid_df.set_value(i, 'score', score)
        
valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)
#print valid_df[valid_df['essay_set'] == 2].head()
valid_df.head()

Unnamed: 0,essay_id,essay_set,essay,score
0,1788,1,"Dear @ORGANIZATION1, @CAPS1 more and more peop...",7.0
1,1789,1,Dear @LOCATION1 Time @CAPS1 me tell you what I...,8.0
2,1790,1,"Dear Local newspaper, Have you been spending a...",9.0
3,1791,1,"Dear Readers, @CAPS1 you imagine how life woul...",9.0
4,1792,1,"Dear newspaper, I strongly believe that comput...",9.0


In [57]:
# Show nothing is empty in validation set
if valid_df.isnull().any().any():
    print 'Data is missing!'
else:
    print 'No missing data!'

No missing data!


In [58]:
# returned a copy of old_df, with essays cleaned for count vectorizer
def vectorizer_clean(old_df):
    new_df = old_df.copy()
    for i in xrange(new_df.shape[0]):
        new_df.set_value(i, 'essay', " ".join(re.sub('[^a-zA-Z\d\s]', '', new_df['essay'].iloc[i]).lower().split())) 
    return new_df

In [59]:
# print essays cleaned for vectorizer (essay is now just lowercase words separated by space) 
vectorizer_train = vectorizer_clean(train_df)
vectorizer_train.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,dear local newspaper i think effects computers...,8
1,2,1,dear caps1 caps2 i believe that using computer...,9
2,3,1,dear caps1 caps2 caps3 more and more people us...,7
3,4,1,dear local newspaper caps1 i have found that m...,10
4,5,1,dear location1 i know having computers has a p...,8


In [60]:
x_array = vectorizer_train.values[:, 2]

#normalizing for y
norm_y = vectorizer_train.groupby(['essay_set'])[['domain1_score']].apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))

norm_y.head()
y = np.asarray(norm_y, dtype="|S6")
y_array = [x[0] for x in y]

In [61]:
from sklearn.cross_validation import train_test_split
count_vectorizer = CountVectorizer(stop_words = 'english')
tfidf = TfidfTransformer()

#Get all the text from data
corpus = vectorizer_train['essay'].values

vectorizer = CountVectorizer(stop_words='english', min_df=1)
#Turn each text into an array of word counts
x = vectorizer.fit_transform(corpus)
x = x.toarray()

n_samples = len(x)
train_indices = np.random.uniform(size=n_samples) > 1. / 3.  #Select two thirds for train

x_train = x[train_indices]
y_train = y[train_indices]

x_test = x[~train_indices]
y_test = y[~train_indices]

In [None]:
logistic = LogReg(penalty='l2', 
                    solver='liblinear', 
                    n_jobs=4)
logistic.fit(x_train, y_train)

#Print results
y_pred = logistic.predict(x_test)

  y = column_or_1d(y, warn=True)


In [50]:
print 'Accuracy on overall test set:', cohen_kappa_score(y_pred, y_test)

 Accuracy on overall test set: 0.39287456151


In [51]:
tfidf = TfidfTransformer()

#normalizing for y
norm_y = vectorizer_train.groupby(['essay_set'])[['domain1_score']].apply(lambda x: (x - np.mean(x)) / (np.max(x) - np.min(x)))
y = np.asarray(norm_y, dtype="|S6")
y_array = [x[0] for x in y]

x_train, x_test, y_train, y_test = train_test_split(corpus, y_array, test_size=0.33, random_state=42)

tfidf_model = Pipeline([('counts', vectorizer),
                   ('tfidf', tfidf),
                   ('regression', logistic), ])
tfidf_model.fit(x_train, y_train)
#tfidf_model.score(x_test, y_test)

y_pred = tfidf_model.predict(x_test)
print 'Accuracy on overall test set:', cohen_kappa_score(y_pred, y_test)

Accuracy on overall test set: 0.424848658862
