In [None]:
from nltk import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

import numpy as np
import pandas as pd

In [15]:
# Zipped, tab-separated IMDB training split (path is relative to this notebook).
train_file_path = '../Data/imdb_train.zip'

# `delimiter` is read_csv's alias for `sep`; the file is tab-separated.
data_train = pd.read_csv(train_file_path, delimiter="\t")

# Last expression of the cell: rendered as the (truncated) table below.
data_train

Unnamed: 0,text,label
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1
...,...,...
24995,"Towards the end of the movie, I felt it was to...",0
24996,This is the kind of movie that my enemies cont...,0
24997,I saw 'Descent' last night at the Stockholm Fi...,0
24998,Some films that you pick up for a pound turn o...,0


In [16]:
# Zipped, tab-separated IMDB test split (path is relative to this notebook).
test_file_path = '../Data/imdb_test.zip'

# `delimiter` is read_csv's alias for `sep`; read the same way as the training split.
data_test = pd.read_csv(test_file_path, delimiter="\t")


In [17]:
def clean_review(review_text, tokenizer, stemmer, stopwords):
    """Normalize one review into a space-separated string of stemmed tokens.

    The text is split with ``tokenizer``, every token is passed through
    ``stemmer``, stemmed tokens found in ``stopwords`` are discarded, and the
    remaining tokens are joined with single spaces.

    Args:
        review_text: Raw text of a single review.
        tokenizer: Object exposing ``tokenize(text)`` returning the tokens.
            Whether punctuation is dropped depends on this object (the
            notebook passes a ``RegexpTokenizer(r"\\w+")``).
        stemmer: Object exposing ``stem(token)`` returning the normalized token.
        stopwords: Collection of tokens to drop. It is compared against the
            *stemmed* tokens, so it should already be stemmed the same way.
            Note that inside this function the name shadows the
            ``nltk.corpus.stopwords`` import.

    Returns:
        str: The surviving stemmed tokens joined by one space; ``""`` when no
        token survives.
    """
    # The membership test runs once per token, so use a hash lookup instead of
    # scanning a list each time. Fall back to the collection as given if its
    # items turn out not to be hashable.
    try:
        stopword_lookup = frozenset(stopwords)
    except TypeError:
        stopword_lookup = stopwords

    # Tokenize, then stem lazily; stopwords are filtered after stemming.
    stemmed_tokens = (stemmer.stem(token) for token in tokenizer.tokenize(review_text))

    return " ".join(token for token in stemmed_tokens if token not in stopword_lookup)

In [18]:
# Try the cleaning pipeline on a single review (row label 1 of the training set).
review_text = data_train.loc[1, "text"]
print("antes: ", review_text)

# \w+ keeps only runs of word characters, so every punctuation mark is dropped.
tokenizer = RegexpTokenizer(r"\w+")
englishStemmer = SnowballStemmer("english")

# clean_review filters tokens *after* stemming them, so the stopword list
# has to go through the same stemmer before it can be compared.
stopwords_en = stopwords.words('english')
stopwords_en_stem = [englishStemmer.stem(word) for word in stopwords_en]

review_text_clean = clean_review(
    review_text,
    tokenizer,
    englishStemmer,
    stopwords_en_stem,
)

antes:  Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school, work, or vote for the matter. Most people think of the homeless as just a lost cause while worrying about things such as racism, the war on Iraq, pressuring kids to succeed, technology, the elections, inflation, or worrying if they'll be next to end up on the streets.<br /><br />But what if you were given a bet to live on the streets for a month without the luxuries you once had from a home, the entertainment sets, a bathroom, pictures on the wall, a computer, and everything you once treasure to see what it's like to be homeless? That is Goddard Bolt's lesson.<br /><br />Mel Brooks (who directs) who stars as Bolt plays a rich man who has everything in the world until deciding to make a bet with a sissy rival (Jeffery Tambor) to see if he can live in the streets for thirty days wi

In [19]:
# Clean every training review; the extra positional arguments are forwarded
# to clean_review after the review text itself.
clean_train = data_train.text.apply(
    clean_review,
    args=(tokenizer, englishStemmer, stopwords_en_stem),
).tolist()

In [20]:
# Clean every test review with the same tokenizer, stemmer and stemmed stopwords
# used for the training reviews.
clean_test = data_test.text.apply(
    clean_review,
    args=(tokenizer, englishStemmer, stopwords_en_stem),
).tolist()

In [21]:
# Bag-of-words counts over unigrams (CountVectorizer defaults).
count_vectorizer = CountVectorizer()

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass; calling fit() and then transform() on the same corpus would
# tokenize every training review twice.
X_train_sparse = count_vectorizer.fit_transform(clean_train)

# The test reviews are only transformed: the vocabulary comes from training data.
X_test_sparse = count_vectorizer.transform(clean_test)



In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
unigram_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)

# Dense document-term matrix with one column per vocabulary word.
# toarray() yields a plain ndarray rather than the np.matrix from todense().
X_train = pd.DataFrame(X_train_sparse.toarray(), columns=unigram_feature_names)
y_train = data_train.label

In [23]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
unigram_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)

# Dense document-term matrix for the test reviews, with the same columns
# (vocabulary learned on the training reviews).
# toarray() yields a plain ndarray rather than the np.matrix from todense().
X_test = pd.DataFrame(X_test_sparse.toarray(), columns=unigram_feature_names)
y_test = data_test.label

In [None]:
# Hold out 25% of the training reviews as a validation set; the fixed
# random_state makes the shuffle reproducible.
X_train_train, X_train_val, y_train_train, y_train_val = train_test_split(
    X_train,
    y_train,
    train_size = 0.75,
    shuffle = True,
    random_state = 147,
)

# Sweep the inverse regularization strength C (smaller C = stronger L2 penalty)
# and report the validation accuracy of each candidate.
for c in (0.005, 0.008, 0.01, 0.05, 0.25, 0.5, 1):
    lr = LogisticRegression(C=c, solver="newton-cg", penalty="l2")
    lr.fit(X_train_train, y_train_train)
    val_predictions = lr.predict(X_train_val)
    val_accuracy = accuracy_score(y_train_val, val_predictions)
    print ("Accuracy for C=%s: %s" % (c, val_accuracy))

In [None]:
# Refit on the full training set. C = 0.05 is presumably the value picked from
# the validation sweep — confirm against that cell's output.
final_model = LogisticRegression(C = 0.05, solver="newton-cg", penalty="l2")
final_model.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for both metrics.
y_test_pred = final_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
test_confusion = confusion_matrix(y_test, y_test_pred)

print ("Final Accuracy: %s" % test_accuracy)
print ("Final Confusion Matrix: \n %s" % test_confusion)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
unigram_feature_names = (
    count_vectorizer.get_feature_names_out()
    if hasattr(count_vectorizer, "get_feature_names_out")
    else count_vectorizer.get_feature_names()
)

# One row per vocabulary word, paired with its weight in the fitted model.
# Built in a single constructor call instead of assigning attributes on an
# empty frame.
feature_to_coef = pd.DataFrame({
    'word': unigram_feature_names,
    'coef': final_model.coef_[0],
})

# Largest coefficients first.
feature_to_coef_sort_desc = feature_to_coef.sort_values(by = 'coef', ascending = False)

# .iloc makes the "first three rows after sorting" intent explicit; the sorted
# Series still carries the original integer labels, so a bare [0:3] is ambiguous
# to read.
positive_words = feature_to_coef_sort_desc.word.iloc[0:3]
positive_words

In [None]:
# Smallest (most negative) coefficients first; sort_values sorts ascending by default.
feature_to_coef_sort_asc = feature_to_coef.sort_values('coef')

# First three rows of the sorted frame, by position.
negative_words = feature_to_coef_sort_asc['word'].iloc[:3]
negative_words

In [None]:
# Join the selected words into one array: positive_words first, then negative_words.
top_positive = positive_words.values
top_negative = negative_words.values
columns = np.concatenate((top_positive, top_negative))
columns

In [None]:
# Count features over unigrams and bigrams.
count_vectorizer_bigram = CountVectorizer(ngram_range = (1, 2))

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass; fit() followed by transform() would tokenize the corpus twice.
X_train_bigram_sparse = count_vectorizer_bigram.fit_transform(clean_train)
X_test_bigram_sparse = count_vectorizer_bigram.transform(clean_test)

# The sparse matrices are used directly (no dense DataFrame is built): per the
# original author's note, converting them to dense breaks with this many features.
X_train_bigram_train, X_train_bigram_val, y_train_train, y_train_val = \
    train_test_split(X_train_bigram_sparse, y_train, train_size = 0.75, shuffle = True, random_state = 147)

# Sweep the inverse regularization strength C and report validation accuracy.
for c in [0.01, 0.05, 0.1, 0.25, 0.5, 0.6, 0.7, 0.8, 0.9, 1]:
    lr = LogisticRegression(C=c, solver="newton-cg", penalty="l2")
    lr.fit(X_train_bigram_train, y_train_train)
    print ("Accuracy for C=%s: %s"
           % (c, accuracy_score(y_train_val, lr.predict(X_train_bigram_val))))

In [None]:
# Refit on the full unigram+bigram training matrix. C = 0.25 is presumably the
# value picked from the validation sweep — confirm against that cell's output.
final_model_bigram = LogisticRegression(C = 0.25, solver="newton-cg", penalty="l2")
final_model_bigram.fit(X_train_bigram_sparse, y_train)

# Predict the test set once and reuse the predictions for both metrics.
y_test_pred_bigram = final_model_bigram.predict(X_test_bigram_sparse)
test_accuracy_bigram = accuracy_score(y_test, y_test_pred_bigram)
test_confusion_bigram = confusion_matrix(y_test, y_test_pred_bigram)

print ("Final Accuracy: %s" % test_accuracy_bigram)
print ("Final Confusion Matrix: \n %s" % test_confusion_bigram)

TF-IDF es mejor que CountVectorizer porque no sólo se centra en la frecuencia de las palabras presentes en el corpus, sino que también tiene en cuenta su importancia.

Volvamos a entrenar una regresión logística usando como features la representación tf-idf de unigramas, bigramas y trigramas.

Veamos cuáles son los n-gramas más discriminantes.

In [None]:
# NOTE(review): this cell rebuilds the same unigram+bigram CountVectorizer
# features as the earlier bigram cell, while the markdown above announces
# TF-IDF over unigrams, bigrams and trigrams (TfidfVectorizer is already
# imported). Confirm whether a TF-IDF vectorizer was intended here.
count_vectorizer_bigram = CountVectorizer(ngram_range = (1, 2))

# fit_transform learns the vocabulary and builds the training matrix in a
# single pass; fit() followed by transform() would tokenize the corpus twice.
X_train_bigram_sparse = count_vectorizer_bigram.fit_transform(clean_train)
X_test_bigram_sparse = count_vectorizer_bigram.transform(clean_test)