In [3]:
# Importar los datos y ver sus dimensiones
import urllib
import pandas as pd
train_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.train"
test_data_url = "http://www.inf.utfsm.cl/~jnancu/stanford-subset/polarity.dev"
train_data_f = urllib.urlretrieve(train_data_url, "train_data.csv")
test_data_f = urllib.urlretrieve(test_data_url, "test_data.csv")
ftr = open("train_data.csv", "r")
fts = open("test_data.csv", "r")
rows = [line.split(" ",1) for line in ftr.readlines()]
train_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
train_df['Sentiment'] = pd.to_numeric(train_df['Sentiment'])
rows = [line.split(" ",1) for line in fts.readlines()]
test_df = pd.DataFrame(rows, columns=['Sentiment','Text'])
test_df['Sentiment'] = pd.to_numeric(test_df['Sentiment'])
print train_df.shape
print test_df.shape

# Como se puede observar, la dimensionalidad de la data de entrenamiento y la de prueba es de 3554 registros en total.

#Contar cantidad de cada clase
con_neg = 0
con_pos = 0

for val in train_df["Sentiment"]:
    if val > 0:
        con_pos+=1
    else:
        con_neg+=1
        
print "Cantidad clase negativa: ",con_neg
print "Cantidad clase positiva: ",con_pos

# Existen dos clases, textos positivos y negativos. El dataset contiene:

# 1784 textos negativos
# 1770 textos positivos

(3554, 2)
(3554, 2)
Cantidad clase negativa:  1784
Cantidad clase positiva:  1770


In [4]:
import re, time
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer, word_tokenize
from nltk.stem.porter import PorterStemmer

def word_extractor(text):
    stemmer = PorterStemmer()
    commonwords = stopwords.words('english')
    text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
    words = ""
    wordtokens = [ stemmer.stem(word.lower()) \
        for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
    for word in wordtokens:
        if word not in commonwords:
            words+=" "+word
    return words

def word_extractor_sin_stemming(text):
    commonwords = stopwords.words('english')
    text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
    words = ""
    wordtokens = [ word.lower() \
        for word in word_tokenize(text.decode('utf-8', 'ignore')) ]
    for word in wordtokens:
        if word not in commonwords:
            words+=" "+word
    return words

print "---------------------word_extractor---------------------"
print word_extractor("I love to eat cake")
print word_extractor("I love eating cake")
print word_extractor("I loved eating the cake")
print word_extractor("I do not love eating cake")
print word_extractor("I don't love eating cake")

# propias
print word_extractor("I love to play games")
print word_extractor("I love playing games")
print word_extractor("I loved playing the games")
print word_extractor("I do not love playing games")
print word_extractor("I don't love playing games")

print "---------------------word_extractor_sin_stemming---------------------"
print word_extractor_sin_stemming("I love to eat cake")
print word_extractor_sin_stemming("I love eating cake")
print word_extractor_sin_stemming("I loved eating the cake")
print word_extractor_sin_stemming("I do not love eating cake")
print word_extractor_sin_stemming("I don't love eating cake")

# propias
print word_extractor("I love to play games")
print word_extractor("I love playing games")
print word_extractor("I loved playing the games")
print word_extractor("I do not love playing games")
print word_extractor("I don't love playing games")


# Se puede observar que al aplicar el algoritmo word_extractor() captura el tronco lexico base de cada palabra
# en las distintas oraciones. En los 4 primeros ejemplos se obtiene el mismo tronco lexico para las oraciones,
# puesto que se trata solamente de palabras que se le agrega el "ing" o el "ed" al final.
# Tambien se observa que existe diferencia entre poner "do not" y "don't" obteniendose distinto tronco,
# porque en el primer caso se consideran palabras separadas como "do" y "not" por separado.
# Se observa el mismo resultado para las oraciones propias.
# Si no se aplica stemming, no todas las palabras quedan en su tronco lexico base, solo se extraen. 


---------------------word_extractor---------------------
 love eat cake
 love eat cake
 love eat cake
 love eat cake
 n't love eat cake
 love play game
 love play game
 love play game
 love play game
 n't love play game
---------------------word_extractor_sin_stemming---------------------
 love eat cake
 love eating cake
 loved eating cake
 love eating cake
 n't love eating cake
 love play game
 love play game
 love play game
 love play game
 n't love play game


In [6]:
# Funcion igual a la anterior, pero con lematizing en vez de stemming

def word_extractor2(text):
    wordlemmatizer = WordNetLemmatizer()
    commonwords = stopwords.words('english')
    text = re.sub(r'([a-z])\1+', r'\1\1',text)#substitute multiple letter by two
    words = ""
    wordtokens = [ wordlemmatizer.lemmatize(word.lower()) \
            for word in word_tokenize(text.decode('utf-8','ignore')) ]
    for word in wordtokens:
        if word not in commonwords:
            words+=" "+word
    return words


print word_extractor2("I love to eat cake")
print word_extractor2("I love eating cake")
print word_extractor2("I loved eating the cake")
print word_extractor2("I do not love eating cake")
print word_extractor2("I don't love eating cake")

#propias
print word_extractor2("I love to play games")
print word_extractor2("I love playing games")
print word_extractor2("I loved playing the games")
print word_extractor2("I do not love playing games")
print word_extractor2("I don't love playing games")

# Como se puede observar, solo se extraen las palabras presentes en la oracion, exactamente igual que en el caso
# anterior donde sacabamos la aplicacion del proceso de stemming.
# No existen muchas mas diferencias. Simplemente no se llega al tronco lexico que se busca al aplicar stemming.


 love eat cake
 love eating cake
 loved eating cake
 love eating cake
 n't love eating cake
 love play game
 love playing game
 loved playing game
 love playing game
 n't love playing game


In [7]:
# Representacion vectorial del texto de entrenamiento y el de pruebas
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
texts_train = [word_extractor2(text) for text in train_df.Text]
texts_test = [word_extractor2(text) for text in test_df.Text]
vectorizer = CountVectorizer(ngram_range=(1, 1), binary='False')
vectorizer.fit(np.asarray(texts_train))
features_train = vectorizer.transform(texts_train)
features_test = vectorizer.transform(texts_test)
labels_train = np.asarray((train_df.Sentiment.astype(float)+1)/2.0)
labels_test = np.asarray((test_df.Sentiment.astype(float)+1)/2.0)
vocab = vectorizer.get_feature_names()
dist=list(np.array(features_train.sum(axis=0)).reshape(-1,))
dist2=list(np.array(features_test.sum(axis=0)).reshape(-1,))

# Se ordenan las palabras por cantidad
lista_train = zip(vocab, dist)
lista_train.sort(key=lambda x: x[1])
lista_train.reverse()

# Se ordenan las palabras por cantidad
lista_test = zip(vocab, dist2)
lista_test.sort(key=lambda x: x[1])
lista_test.reverse()

#for tag, count in lista_train:
#    print count, tag

N = 10

pals_train = []
pals_test = []

print "Palabras mas frecuentes en el train: "    
for i in range(N):
    tag, count = lista_train[i]
    pals_train.append(tag)
    print count,tag
    
# Mas frecuentes: (566 film,481 movie,246 one,245 like,224 ha,183 make,176 story,163 character,145 comedy,143 time)
    
print "Palabras mas frecuentes en el test: "    
for i in range(N):
    tag, count = lista_test[i]
    pals_test.append(tag)
    print count,tag
    
# Mas frecuentes: (558 film,540 movie,250 one,238 ha,230 like,197 story,175 character,165 time,161 make,134 comedy)


Palabras mas frecuentes en el train: 
566 film
481 movie
246 one
245 like
224 ha
183 make
176 story
163 character
145 comedy
143 time
Palabras mas frecuentes en el test: 
558 film
540 movie
250 one
238 ha
230 like
197 story
175 character
165 time
161 make
134 comedy


In [8]:
from sklearn.metrics import classification_report

# Funcion que evalua el desempeño de un clasificador generico en el conjunto de entrenamiento y de pruebas
def score_the_model(model,x,y,xt,yt,text):
    acc_tr = model.score(x,y)
    acc_test = model.score(xt[:-1],yt[:-1])
    print "Training Accuracy %s: %f"%(text,acc_tr)
    print "Test Accuracy %s: %f"%(text,acc_test)
    print "Detailed Analysis Testing Results ..."
    print(classification_report(yt, model.predict(xt), target_names=['+','-']))
    
    
# Las metricas de classification_report son:
# yt: Corresponde a las y de prueba, es decir, las clasificaciones reales.
# model.predict(xt): Corresponde a la prediccion de los inputs "xt" de prueba, es decir, el y estimado.
# target_names: Corresponde a una lista de strings para mostrar nombres para las etiquetas. En este caso "+" y "-"

In [10]:
from sklearn.naive_bayes import BernoulliNB

def do_NAIVE_BAYES(x,y,xt,yt):
    model = BernoulliNB()
    model = model.fit(x, y)
    score_the_model(model,x,y,xt,yt,"BernoulliNB")
    return model

model=do_NAIVE_BAYES(features_train,labels_train,features_test,labels_test)
test_pred = model.predict_proba(features_test)
spl = random.sample(xrange(len(test_pred)), 15)
for text, sentiment in zip(test_df.Text[spl], test_pred[spl]):
    print sentiment, text
    
    
# Las StopWords son el nombre que se le da a todas aquellas palabras que no tienen ningún atributo
# de búsqueda, es decir, son palabras de significado vacío como los artículos, los pronombres o las preposiciones.
# La importancia de borrar estas palabras es para hacer mas eficiente el analisis de clasificacion, puesto que 
# asi no se pierde tiempo procesando y guardando estas palabras en el algoritmo.

Training Accuracy BernoulliNB: 0.958638
Test Accuracy BernoulliNB: 0.738531
Detailed Analysis Testing Results ...
             precision    recall  f1-score   support

          +       0.75      0.73      0.74      1803
          -       0.73      0.75      0.74      1751

avg / total       0.74      0.74      0.74      3554



NameError: name 'random' is not defined

In [None]:
from sklearn.naive_bayes import MultinomialNB,random

def do_MULTINOMIAL(x,y,xt,yt):
    model = MultinomialNB()
    model = model.fit(x, y)
    score_the_model(model,x,y,xt,yt,"MULTINOMIAL")
    return model
model=do_MULTINOMIAL(features_train,labels_train,features_test,labels_test)
test_pred = model.predict_proba(features_test)
spl = random.sample(xrange(len(test_pred)), 15)

for text, sentiment in zip(test_df.Text[spl], test_pred[spl]):
    print sentiment, text

In [None]:
from sklearn.linear_model import LogisticRegression

def do_LOGIT(x,y,xt,yt):
start_t = time.time()
Cs = [0.01,0.1,10,100,1000]
for C in Cs:
print "Usando C= %f"%C
model = LogisticRegression(penalty='l2',C=C)
model = model.fit(x, y)
score_the_model(model,x,y,xt,yt,"LOGISTIC")
do_LOGIT(features_train,labels_train,features_test,labels_test)

In [None]:
from sklearn.svm import LinearSVC

def do_SVM(x,y,xt,yt):
    Cs = [0.01,0.1,10,100,1000]
    for C in Cs:
        print "El valor de C que se esta probando: %f"%C
        model = LinearSVC(C=C)
        model = model.fit(x, y)
        score_the_model(model,x,y,xt,yt,"SVM")
do_SVM(features_train,labels_train,features_test,labels_test)

In [None]:
#Construir grafico comparativo. Probablemente de errores de entrenamiento y test