NOTES/TODO:
1. How do we deal with standardized scores in model evaluation?
2. Which criterion do we use for evaluating our classifier? (Quadratic weighted kappa or spearman's correlation)

In [1]:
import matplotlib
import matplotlib.pyplot as plt

import pandas as pd
import numpy as np

# TODO if needed, include the words and stopwords imports
# HOWEVER, to use them, you will need to download nltk stuff first if not done already
# To do so, open a python shell (i.e. go to terminal and enter python), and then type
#
# import nltk
# nltk.download()

# After this, select the words and stopwords corpuses, and download them

#import nltk
#from nltk.corpus import stopwords
#from nltk.corpus import words

# Regular expressions might be useful
import re

# Beautiful soup might be useful
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer

# for modeling
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.linear_model import LogisticRegressionCV as LogRegCV
# from sklearn.cross_validation import cross_val_predict 
# from sklearn import cross_validation
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import cohen_kappa_score
%matplotlib inline


In [2]:
def append_regularized_scores(old_df):
    new_df = old_df.copy()
    new_df['std_score'] = new_df.groupby(['essay_set'])[['score']].apply(lambda x: (x - np.mean(x)) / (np.std(x)))
    return new_df

def create_regularization_data(old_df):
    #getting the number of datasets
    max_essay_set = max(old_df['essay_set'])
    #list of the regularized values
    regularization_data = []
    for i in range(max_essay_set+1):
        mean = np.mean((old_df[old_df['essay_set'] == i + 1])['score'])
        std = np.std((old_df[old_df['essay_set'] == i + 1])['score'])
        regularization_data.append([i + 1, mean, std])
    return regularization_data

In [3]:
# Read in training data
# Note that for essay set 2, score becomes average of 2 domain scores
train_cols = ['essay_id', 'essay_set', 'essay', 'domain1_score', 'domain2_score']
train_df = pd.read_csv('data/training_set_rel3.tsv', delimiter='\t', usecols=train_cols)
for i in xrange(train_df.shape[0]):
    if not np.isnan(train_df.get_value(i, 'domain2_score')):
        assert train_df.get_value(i, 'essay_set') == 2
        new_val = train_df.get_value(i, 'domain1_score') + train_df.get_value(i, 'domain2_score')
        train_df.set_value(i, 'domain1_score', new_val) 
train_df = train_df.drop('domain2_score', axis=1)
train_df = train_df.rename(columns={'domain1_score': 'score'})

################
regularization_data = create_regularization_data(train_df)
train_df = append_regularized_scores(train_df)

print "The regularized data for each essay set = ", regularization_data
print "\n"

#print train_df[train_df['essay_set'] == 2].head()
print train_df.head()
print "\n"

#validate that the standardization works
max_essay_set = max(train_df['essay_set'])
for i in range (max_essay_set):
    valid = train_df[train_df["essay_set"] == i + 1]["std_score"]
    print "mean and standard deviation of essay set " + str(i + 1) + " = ", np.mean(valid), ",", np.std(valid)
################

The regularized data for each essay set =  [[1, 8.528323051037576, 1.5381336495587767], [2, 6.749444444444444, 1.3844371990179603], [3, 1.8482039397450754, 0.8149207612821795], [4, 1.4322033898305084, 0.9395167668768533], [5, 2.4088642659279778, 0.9705520523317599], [6, 2.72, 0.970360757656664], [7, 16.062460165710643, 4.583888354164165], [8, 36.95020746887967, 5.749521294509325], [9, nan, nan]]


   essay_id  essay_set                                              essay  \
0         1          1  Dear local newspaper, I think effects computer...   
1         2          1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3          1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4          1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5          1  Dear @LOCATION1, I know having computers has a...   

   score  std_score  
0      8  -0.343483  
1      9   0.306655  
2      7  -0.993622  
3     10   0.956794  
4      8  -0.343483  


me

In [4]:
# Show nothing is empty in training set
if train_df.isnull().any().any():
    print 'Training data is missing!'
else:
    print 'No missing training data!'

No missing training data!


In [5]:
# Read in validation data
valid_cols = ['essay_id', 'essay_set', 'essay', 'domain1_predictionid', 'domain2_predictionid']
valid_df = pd.read_csv('data/valid_set.tsv', delimiter='\t', usecols=valid_cols)
valid_df['score'] = pd.Series([0] * valid_df.shape[0], index=valid_df.index)

# scores are stored in separate data set, we'll put them in same one
valid_scores = pd.read_csv('data/valid_sample_submission_5_column.csv', delimiter=',')

# put each score in our data set, and make sure to handle essay set 2
for i in xrange(valid_df.shape[0]):
    dom1_predid = valid_df.get_value(i, 'domain1_predictionid')
    row = valid_scores[valid_scores['prediction_id'] == dom1_predid]
    score = row.get_value(row.index[0], 'predicted_score')
    
    dom2_predid = valid_df.get_value(i, 'domain2_predictionid')
    if not np.isnan(dom2_predid):
        assert valid_df.get_value(i, 'essay_set') == 2
        rowB = valid_scores[valid_scores['prediction_id'] == dom2_predid]
        scoreB = rowB.get_value(rowB.index[0], 'predicted_score')
        score += scoreB
        
    valid_df.set_value(i, 'score', score)
        
valid_df = valid_df.drop(['domain1_predictionid', 'domain2_predictionid'], axis=1)
#print valid_df[valid_df['essay_set'] == 2].head()
print valid_df.head()

   essay_id  essay_set                                              essay  \
0      1788          1  Dear @ORGANIZATION1, @CAPS1 more and more peop...   
1      1789          1  Dear @LOCATION1 Time @CAPS1 me tell you what I...   
2      1790          1  Dear Local newspaper, Have you been spending a...   
3      1791          1  Dear Readers, @CAPS1 you imagine how life woul...   
4      1792          1  Dear newspaper, I strongly believe that comput...   

   score  
0      7  
1      8  
2      9  
3      9  
4      9  


In [6]:
# Show nothing is empty in validation set
if valid_df.isnull().any().any():
    print 'Validation data is missing!'
else:
    print 'No missing validation data!'

No missing validation data!


In [7]:
# returned a copy of old_df, with essays cleaned for count vectorizer
# cleaning returns essay with only lowercase words separated by space
def vectorizer_clean(old_df):
    new_df = old_df.copy()
    for i in xrange(new_df.shape[0]):
        new_df.set_value(i, 'essay', " ".join(re.sub('[^a-zA-Z\d\s]', '', new_df['essay'].iloc[i]).lower().split())) 
    return new_df

In [8]:
# print essays cleaned for vectorizer (essay is now just lowercase words separated by space) 
vectorizer_train = vectorizer_clean(train_df)
print vectorizer_train.head()

   essay_id  essay_set                                              essay  \
0         1          1  dear local newspaper i think effects computers...   
1         2          1  dear caps1 caps2 i believe that using computer...   
2         3          1  dear caps1 caps2 caps3 more and more people us...   
3         4          1  dear local newspaper caps1 i have found that m...   
4         5          1  dear location1 i know having computers has a p...   

   score  std_score  
0      8  -0.343483  
1      9   0.306655  
2      7  -0.993622  
3     10   0.956794  
4      8  -0.343483  


In [9]:
# print essays cleaned for vectorizer (essay is now just lowercase words separated by space) 
vectorizer_valid = vectorizer_clean(valid_df)
print vectorizer_valid.head()

   essay_id  essay_set                                              essay  \
0      1788          1  dear organization1 caps1 more and more people ...   
1      1789          1  dear location1 time caps1 me tell you what i t...   
2      1790          1  dear local newspaper have you been spending a ...   
3      1791          1  dear readers caps1 you imagine how life would ...   
4      1792          1  dear newspaper i strongly believe that compute...   

   score  
0      7  
1      8  
2      9  
3      9  
4      9  


In [10]:
vectorizer = TfidfVectorizer(stop_words = 'english')

vectorizer2 = TfidfVectorizer(stop_words = 'english', ngram_range=(2,2))
vectorizer3 = TfidfVectorizer(stop_words = 'english', ngram_range=(3,3))
vectorizer4 = TfidfVectorizer(stop_words = 'english', ngram_range=(4,4))
vectorizer5 = TfidfVectorizer(stop_words = 'english', ngram_range=(5,5))


#Get all the text from data
train_essays = vectorizer_train['essay'].values

#Turn each text into an array of word counts
train_vectors = vectorizer.fit_transform(train_essays).toarray()

train_vectors2 = vectorizer2.fit_transform(train_essays).toarray()
train_vectors3 = vectorizer3.fit_transform(train_essays).toarray()
train_vectors4 = vectorizer4.fit_transform(train_essays).toarray()
train_vectors5 = vectorizer5.fit_transform(train_essays).toarray()


#normalizing for y
train_std_scores = np.asarray(vectorizer_train['std_score'], dtype="|S6")
print train_std_scores[:5]

['-0.343' '0.3066' '-0.993' '0.9567' '-0.343']


In [11]:
######################################
## TfidfVectorizer with ngram=(1,1) ##
######################################


###############
# Logistic L2 #
###############

# Logistic Model with L2 penalty
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
logistic_l2.fit(train_vectors, train_std_scores)

valid_vectors = vectorizer.transform(vectorizer_valid['essay'].values).toarray()

# My guess is we will want to denormalize these scores for quadratic weighted k
valid_pred_std_scores_l2 = logistic_l2.predict(valid_vectors)

# Appending predicted scores to validation data set
valid_df["Log_L2 predicted_scores"] = valid_pred_std_scores_l2

In [12]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L2 predicted_scores']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l2

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2'] = stand_pred_values_l2

In [13]:
###############
# Logistic L1 #
###############

# Logistic Model with L1 penalty
logistic_l1 = LogReg(penalty='l1', solver='liblinear', n_jobs=4)
logistic_l1.fit(train_vectors, train_std_scores)

valid_pred_std_scores_l1 = logistic_l1.predict(valid_vectors)


# Appending predicted scores to validation data set
valid_df['Log_L1 predicted_scores'] = valid_pred_std_scores_l1

In [14]:
#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l1 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L1 predicted_scores']
    for value in current_set:
        stand_pred_values_l1.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l1

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l1'] = stand_pred_values_l1

In [16]:
###############
#   Scoring   #
###############

#Scoring the predicted values with the actual values
log_l2_count = 0
log_l1_count = 0
for i in range(len(valid_df)):
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l2']:
        log_l2_count += 1
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l1']:
        log_l1_count += 1
        
print "LOGISTIC L2"
print "Number of correct predictions =", log_l2_count
print "Total number of observations =", len(valid_df)
print "Score =", float(log_l2_count) / len(valid_df)

print ""
print "LOGISTIC L1"
print "Number of correct predictions =", log_l1_count
print "Total number of observations =", len(valid_df)
print "Score =", float(log_l1_count) / len(valid_df)

LOGISTIC L2
Number of correct predictions = 1282
Total number of observations = 4218
Score = 0.303935514462

LOGISTIC L1
Number of correct predictions = 1320
Total number of observations = 4218
Score = 0.312944523471


In [17]:
#Spearman Correlation Coefficient
from scipy.stats import spearmanr as Spearman

print "Logistic L2:", Spearman(a = valid_df["score"], b = valid_df["newly_predicted_scores_log_l2"])
print "Logistic L1:", Spearman(a = valid_df["score"], b = valid_df["newly_predicted_scores_log_l1"])

Logistic L2: SpearmanrResult(correlation=0.91959718560647619, pvalue=0.0)
Logistic L1: SpearmanrResult(correlation=0.9186645533963379, pvalue=0.0)


Below, as we expand ngram length to 2, we see that the computation power required becomes such that the kernel dies.  Therefore, we will have to try limiting the number of words included in the vectorizer for our future models!

In [None]:
######################################
## TfidfVectorizer with ngram=(2,2) ##
######################################

###############
# Logistic L2 #
###############

# Logistic Model with L2 penalty
logistic_l2 = LogReg(penalty='l2', solver='liblinear', n_jobs=4)
logistic_l2.fit(train_vectors2, train_std_scores)

valid_vectors2 = vectorizer2.transform(vectorizer_valid['essay'].values).toarray()

# My guess is we will want to denormalize these scores for quadratic weighted k
valid_pred_std_scores_l2 = logistic_l2.predict(valid_vectors2)

# Appending predicted scores to validation data set
valid_df["Log_L2 predicted_scores_2"] = valid_pred_std_scores_l2

#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l2 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L2 predicted_scores_2']
    for value in current_set:
        stand_pred_values_l2.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l2

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l2_2'] = stand_pred_values_l2

In [None]:
###############
# Logistic L1 #
###############

# Logistic Model with L1 penalty
logistic_l1 = LogReg(penalty='l1', solver='liblinear', n_jobs=4)
logistic_l1.fit(train_vectors2, train_std_scores)

valid_pred_std_scores_l1 = logistic_l1.predict(valid_vectors2)


# Appending predicted scores to validation data set
valid_df['Log_L1 predicted_scores_2'] = valid_pred_std_scores_l1

#denormalizing the values and placing them into the stand_pred_values array
stand_pred_values_l1 = []
for i in range(max_essay_set):
    current_set = valid_df[valid_df['essay_set'] == i + 1]['Log_L1 predicted_scores_2']
    for value in current_set:
        stand_pred_values_l1.append(int(float(value) * float(regularization_data[i][2]) + (regularization_data[i][1])))
# print stand_pred_values_l1

#adding the denormalizede predicted values to the valid_df dataset
valid_df['newly_predicted_scores_log_l1_2'] = stand_pred_values_l1

In [None]:
###############
#   Scoring   #
###############

#Scoring the predicted values with the actual values
log_l2_count_2 = 0
log_l1_count_2 = 0
for i in range(len(valid_df)):
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l2_2']:
        log_l2_count += 1
    if valid_df.iloc[i]['score'] == valid_df.iloc[i]['newly_predicted_scores_log_l1_2']:
        log_l1_count += 1
        
print "LOGISTIC L2"
print "Number of correct predictions =", log_l2_count_2
print "Total number of observations =", len(valid_df)
print "Score =", float(log_l2_count_2) / len(valid_df)

print ""
print "LOGISTIC L1"
print "Number of correct predictions =", log_l1_count_2
print "Total number of observations =", len(valid_df)
print "Score =", float(log_l1_count_2) / len(valid_df)

#Spearman Correlation Coefficient
from scipy.stats import spearmanr as Spearman

print "Logistic L2:", Spearman(a = valid_df["score"], b = valid_df["newly_predicted_scores_log_l2_2"])
print "Logistic L1:", Spearman(a = valid_df["score"], b = valid_df["newly_predicted_scores_log_l1_2"])