In [None]:
# Content
# https://github.com/jeffreybreen/twitter-sentiment-analysis-tutorial-201107
    

In [1]:
import urllib
import pandas as pd
import numpy as np 

In [2]:
# Remote locations of the UMICH SI650 sentiment data sets.
test_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/testdata.txt"
train_data_url = "https://dl.dropboxusercontent.com/u/8082731/datasets/UMICH-SI650/training.txt"

# Local file names the downloads are saved under.
test_data_file_name = 'test_data.csv'
train_data_file_name = 'train_data.csv'

# urlretrieve is urllib.urlretrieve on Python 2 but urllib.request.urlretrieve
# on Python 3; resolve whichever one exists so this cell runs on both.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# Download both files; each call returns a (local_filename, headers) tuple.
test_data_f = urlretrieve(test_data_url, test_data_file_name)
train_data_f = urlretrieve(train_data_url, train_data_file_name)

In [3]:
def _load_tsv(path, column_names):
    """Read a header-less, tab-separated file and label its columns.

    quoting=3 is csv.QUOTE_NONE, so quote characters in the text are kept
    as ordinary characters instead of being interpreted as field quoting.
    """
    frame = pd.read_csv(path, sep="\t", header=None, quoting=3)
    frame.columns = column_names
    return frame


# The test file has only the sentence; the training file has the label first.
test_data_df = _load_tsv(test_data_file_name, ["Text"])
train_data_df = _load_tsv(train_data_file_name, ["Sentiment", "Text"])


In [4]:
# Show the (rows, columns) dimensions of the labelled training frame.
train_data_df.shape

(7086, 2)

In [5]:
# Show the (rows, columns) dimensions of the unlabelled test frame.
test_data_df.shape

(33052, 1)

In [6]:
# Preview the first rows of the training data (Sentiment label and Text).
train_data_df.head()

Unnamed: 0,Sentiment,Text
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [7]:
# Preview the first rows of the test data (Text only, no label).
test_data_df.head()

Unnamed: 0,Text
0,""" I don't care what anyone says, I like Hillar..."
1,have an awesome time at purdue!..
2,"Yep, I'm still in London, which is pretty awes..."
3,"Have to say, I hate Paris Hilton's behavior bu..."
4,i will love the lakers.


In [8]:
# Class balance: number of training rows for each Sentiment value.
train_data_df["Sentiment"].value_counts()

1    3995
0    3091
Name: Sentiment, dtype: int64

In [9]:
# Average number of words per training sentence.
# Each sentence is split on single spaces and the per-sentence piece counts
# are averaged with NumPy.
def _word_count(sentence):
    """Return how many pieces `sentence` yields when split on single spaces."""
    return len(sentence.split(" "))


np.mean(list(map(_word_count, train_data_df["Text"])))

10.886819079875812

The class `sklearn.feature_extraction.text.CountVectorizer` in the wonderful scikit-learn Python library converts a collection of text documents to a matrix of token counts.
This is just what we need to implement our bag-of-words linear classifier later on.

First we need to initialise the vectorizer.
We need to remove punctuation, lowercase the text, remove stop words, and stem the words.
All of these steps can be performed directly by CountVectorizer if we pass the right parameter values.
We can do this as follows.
Notice that for the stemming step we need to provide a stemmer ourselves.
We will use a basic implementation of the Porter stemmer, a widely used stemmer named after its creator.

In [10]:
import re, nltk
from sklearn.feature_extraction.text import CountVectorizer        
from nltk.stem.porter import PorterStemmer

#######
# based on http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html
# Single PorterStemmer instance; tokenize() below hands it to stem_tokens().
stemmer = PorterStemmer()
def stem_tokens(tokens, stemmer):
    """Return a new list with ``stemmer.stem`` applied to every token, in order.

    Parameters
    ----------
    tokens : iterable
        Items to stem.
    stemmer : object
        Anything exposing a ``stem(item)`` method.

    Returns
    -------
    list
        One stemmed result per input token; empty when ``tokens`` is empty.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Turn raw text into a list of stemmed word tokens.

    Every character that is not an ASCII letter is replaced with a space,
    the result is split with ``nltk.word_tokenize``, and each token is then
    stemmed with the module-level ``stemmer`` via ``stem_tokens``.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)
######## 

# Bag-of-words vectorizer: word-level analysis with our stemming tokenizer,
# lowercased input, English stop-word removal, and a vocabulary capped at
# 85 features.
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    stop_words='english',
    max_features=85,
)

In [11]:
# Note: if the following step fails with an error like
# "'tokenizers/punkt/english.pickle' not found",
# download the punkt tokenizer data first and then run the step again:
# >>> nltk.download('punkt')
# [nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
# [nltk_data]   Unzipping tokenizers/punkt.zip.
# True

In [12]:
# Fit the vocabulary on the training and test sentences together, so both
# sets are encoded against the same feature columns (training rows first).
corpus_texts = train_data_df["Text"].tolist() + test_data_df["Text"].tolist()
corpus_data_features = vectorizer.fit_transform(corpus_texts)

In [None]:
# Convert the vectorizer output to a dense array and show its (rows, columns) shape.
corpus_data_features_nd = corpus_data_features.toarray()
corpus_data_features_nd.shape

In [None]:
# Vocabulary learned by the vectorizer, in feature-column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides and
# normalise the result to a plain list.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

# print(...) with a single argument behaves the same on Python 2 and 3.
print(vocab)

In [None]:
# Sum up the counts of each vocabulary word across the whole corpus
# (column-wise sum of the count matrix).
dist = np.sum(corpus_data_features_nd, axis=0)

# For each vocabulary word, print "<count> <word>".
# A single pre-formatted string passed to print(...) produces the same
# output on Python 2 and Python 3, unlike the Python-2-only print statement.
for tag, count in zip(vocab, dist):
    print("%s %s" % (count, tag))

In [None]:
# Build classifier: hold out part of the labelled data for evaluation.
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18,
# and sklearn.cross_validation was removed in 0.20, so prefer the new
# location and fall back only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# remember that corpus_data_features_nd contains all of our
# original train and test data (training rows first), so we need to
# exclude the unlabeled test entries by keeping only the first
# len(train_data_df) rows
X_train, X_test, y_train, y_test = train_test_split(
    corpus_data_features_nd[0:len(train_data_df)],
    train_data_df.Sentiment,
    train_size=0.85,
    random_state=1234)

In [None]:
# Fit a logistic-regression classifier (default settings) on the training split.
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and fitting can be chained.
log_model = LogisticRegression().fit(X=X_train, y=y_train)

In [None]:
# Label the held-out evaluation split with the fitted classifier.
# predict() returns class labels; predict_proba() would return class
# probabilities instead.
y_pred = log_model.predict(X_test)

In [None]:
# NOTE(review): this repeats the prediction from the previous cell verbatim;
# y_pred should end up with the same values, so this cell looks redundant.
y_pred = log_model.predict(X_test)

In [None]:
# Text summary of how well y_pred matches y_test on the held-out split.
from sklearn.metrics import classification_report

report = classification_report(y_test, y_pred)
print(report)

In [None]:
# train classifier on *all* labelled rows (the first len(train_data_df)
# rows of the corpus matrix)
n_labelled = len(train_data_df)
log_model = LogisticRegression()
log_model = log_model.fit(X=corpus_data_features_nd[0:n_labelled], y=train_data_df.Sentiment)

# get predictions for the remaining rows, i.e. the unlabelled test sentences
test_pred = log_model.predict(corpus_data_features_nd[n_labelled:])

# sample some of them
import random
# - range() replaces the Python-2-only xrange and works on both interpreters
# - a dedicated seeded generator makes the sample reproducible on re-run
#   without touching the global random state
# - the sample size is capped so a small test set does not raise ValueError
sample_size = min(15, len(test_pred))
spl = random.Random(1234).sample(range(len(test_pred)), sample_size)

# print text and labels
# .iloc picks rows by position, matching the positional indexing of test_pred
for text, sentiment in zip(test_data_df.Text.iloc[spl], test_pred[spl]):
    # one pre-formatted string prints identically on Python 2 and Python 3
    print("%s %s" % (sentiment, text))

In [None]:
# Display the predicted labels for the test sentences.
test_pred

In [None]:
# Display the fitted model object.
log_model
