In [1]:
import pandas as pd

In [2]:
# Load the training and test sets into DataFrames with pandas (http://pandas.pydata.org)
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [4]:
# Let's take a look at the data. Preview only the first rows rather than
# rendering the whole frame, which keeps the saved notebook small and readable.
train.head(10)

Unnamed: 0,id,tid,dept,date,forcredit,attendance,textbookuse,interest,grade,tags,comments,helpcount,nothelpcount,online,profgender,profhotness,helpfulness,clarity,easiness,quality
0,24228248,916674,Business,01/05/2015,Yes,,It's a must have,Really into it,,"[""Would take again"", ""Hilarious"", ""Tests are t...",Great Professor My wife took this class twice ...,0,10,,0,0,4,5,3,9
1,24218909,916674,Business,01/02/2015,Yes,Mandatory,It's a must have,Sorta interested,A,"[""Skip class? You won't pass."", ""Tests are tou...",Great Professor Study the notes from class and...,0,1,,0,0,4,4,2,8
2,24215795,916674,Business,01/02/2015,Yes,,Essential to passing,Really into it,,"[""Hilarious"", ""Would take again"", ""Skip class?...",Brother Brau is a great guy He gives great spi...,1,2,,0,0,4,4,3,8
3,24204179,916674,Business,12/30/2014,Yes,Not Mandatory,Essential to passing,Sorta interested,,"[""Tests are tough"", ""Get ready to read""]",People rave about Brau but I personally dont g...,18,6,,0,0,3,1,2,4
4,24198463,916674,Business,12/28/2014,Yes,Not Mandatory,You need it sometimes,Sorta interested,A,"[""Inspirational"", ""Hilarious"", ""Skip class? Yo...",This class doesnt have much homework which was...,1,0,,0,0,4,4,4,8
5,24183494,916674,Business,12/24/2014,Yes,Mandatory,Essential to passing,Really into it,,"[""Clear grading criteria"", ""Would take again"",...",Bro Brau definitely knows what he is doing I ...,1,1,,0,0,4,4,3,8
6,24175674,916674,Business,12/23/2014,Yes,Mandatory,It's a must have,Sorta interested,Not sure yet,"[""Skip class? You won't pass."", ""Participation...",Lectures are long but he does a good job of br...,0,0,,0,0,5,4,3,9
7,24175029,916674,Business,12/23/2014,Yes,Not Mandatory,You need it sometimes,Meh,A,"[""Get ready to read"", ""Participation matters"",...",Can be a good buddy but not a good professor T...,10,5,,0,0,1,1,5,2
8,24172983,916674,Business,12/23/2014,Yes,Not Mandatory,It's a must have,Sorta interested,C+,"[""Skip class? You won't pass."", ""Inspirational""]",I love Brother Brau for his spiritual thoughts...,10,1,,0,0,3,2,1,5
9,24153400,916674,Business,12/20/2014,Yes,Mandatory,It's a must have,Really into it,Not sure yet,"[""Skip class? You won't pass."", ""Hilarious"", ""...",Professor Brau really cares If he talks about...,0,0,,0,0,5,5,2,10


# Notes
In training the id doesn't matter, but it is the column to keep when writing predictions for test.csv. Aside from that, quality is the most important column, since it's what we are trying to predict.

For this tutorial/baseline let's use a simple unigram model of the comments with linear regression.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18; the
# old sklearn.grid_search module was deprecated then and removed in 0.20.
# Try the current location first and fall back so old installs keep working.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

In [22]:
# Baseline model: unigram (bag-of-words) counts of each comment, fed into an
# ordinary least-squares linear regression.
# http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
pipeline = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('regression', LinearRegression()),
])

# Inputs and target. Null comments are replaced with empty strings before
# they reach the vectorizer.
train_comments = train['comments'].fillna('')
train_quality = train['quality']

# The parameter grid is empty, so the "search" has a single candidate and
# GridSearchCV is used only as a cross-validation harness. scoring is left at
# its default, so best_score_ comes from the estimator's own scorer (higher is
# better) -- it is a score, not an error.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
cv = GridSearchCV(estimator=pipeline, param_grid={})
cv.fit(train_comments, train_quality)
cv

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(steps=[('cv', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=No...ne)), ('regression', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [23]:
# Output the best score.
# The parameter grid passed to GridSearchCV is empty, so there is only one
# candidate and this is simply its mean cross-validated score.
# NOTE(review): with scoring=None this is presumably R^2 (the default scorer
# for a regressor), i.e. higher is better -- confirm for the installed version.
cv.best_score_

0.20017425535942762

In [24]:
# Predict quality for the test set (not the training set). Null comments are
# filled with empty strings, matching what was done when fitting.
test_comments = test['comments'].fillna('')
predictions = cv.predict(test_comments)

In [25]:
# Let's take a quick look at the predictions to make sure they are sensible.
# NOTE(review): the first value in the saved output (~10.4) is above 10, the
# largest quality value visible in the sample rows -- presumably the model is
# unbounded; consider clipping to the valid range before submitting (confirm
# the range first).
predictions

array([ 10.39862571,   7.60000079,   8.27296863, ...,   8.35007346,
         8.50790916,   6.35229394])

In [26]:
# Finally, write one "id,prediction" line per test row (no header row).
with open('predictions.csv', 'w') as out_file:
    out_file.writelines(
        '{},{}\n'.format(row_id, prediction)
        for row_id, prediction in zip(test['id'], predictions)
    )