### 1) loading data

In [3]:
train = []
test = []
path = 'movie_data/full_train.txt'
for line in open('movie_data/full_train.txt', 'r'):
    train.append(line.strip())
for line in open('movie_data/full_test.txt', 'r'):
    test.append(line.strip())

### 2) Data Prprocssing

In [4]:
import re

In [5]:
##Define rgular expression
REMOVE = re.compile("(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\()|(\))|(\[)|(\])") ## remove pontuation bracket
REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)") ##replace them by space
def preprocessing_text(text):
    text = [REMOVE.sub("", line.lower()) for line in text]
    text = [REPLACE_WITH_SPACE.sub(" ", line) for line in text]
    return text


In [6]:
train_preprocess = preprocessing_text(train)

In [7]:
test_preprocess = preprocessing_text(test)

### 3) Vectorization or one hot encoding

In [8]:
from sklearn.feature_extraction.text import CountVectorizer


In [9]:
cv = CountVectorizer(binary=True)
cv.fit(train_preprocess)
X_train =  cv.transform(train_preprocess) ## sparse matrix
X_test = cv.transform(test_preprocess) ## sparse matrix

# 4) Classifier

In [10]:
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import  accuracy_score
from sklearn.model_selection import  train_test_split

In [11]:
##creating target for training set
target = [1 if i < 12500 else 0 for i in range(25000)]

In [12]:
##splitting data 
x_train, x_val, y_train, y_val = train_test_split(X_train, target, test_size=0.25)

In [13]:
##tuning C hyperparameters for logistic regression
for c in [0.01, 0.02, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(x_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(x_val))))



Accuracy for C=0.01: 0.87024
Accuracy for C=0.02: 0.87632
Accuracy for C=0.25: 0.87664
Accuracy for C=0.5: 0.87568
Accuracy for C=1: 0.8736


### 5) Training the model

In [14]:
model = LogisticRegression(C=0.25)
model.fit(X_train, target)
print ("Accuracy for C=%s: %s" % (c, accuracy_score(target, lr.predict(X_test))))


Accuracy for C=1: 0.86676


In [15]:
feature_to_coef = {
    word: coef for word, coef in zip(cv.get_feature_names(), model.coef_[0])
}
print("="*40)
print("most discriminating positive word")
print("="*40)
for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print(best_positive)
print("="*40)
print("most discriminating negative word")
print("="*40)
for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print(best_negative)

most discriminating positive word
('excellent', 1.221845113337239)
('perfect', 1.0617036184558397)
('refreshing', 1.0252761986496355)
('superb', 0.9405736244442207)
('wonderfully', 0.9347513506886701)
most discriminating negative word
('worst', -1.8189662434932818)
('waste', -1.676084549454925)
('poorly', -1.413817408695419)
('disappointment', -1.4095633351098813)
('awful', -1.3875327059387244)


# Using NLTK

### 1) Removing stop words

In [16]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
def remove_stop_word(text):
    text_without_stop_word = []
    for t in text:
        text_without_stop_word.append(''.join([word for word in t.split() if word not in stop_words]))
    return text_without_stop_word

In [17]:
text_without_stop_word = remove_stop_word(train_preprocess)

### 2) Normalization

#### 2.1) Stemming

In [18]:
def stemming_text(text):
    from nltk.stem.porter import PorterStemmer
    stemmer = PorterStemmer()
    return [''.join(stemmer.stem(word) for word in review.split()) for review in text]
stemming_text = stemming_text(train_preprocess)

#### 2.2) Lemmatization

In [19]:
from nltk.stem import WordNetLemmatizer
def lemmatization_text(text):
    lemmatizer = WordNetLemmatizer()
    return [''.join(lemmatizer.lemmatize(word) for word in review.split()) for review in text]
lemma_text = lemmatization_text(train_preprocess)

### 3) N_gram

In [20]:
ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train_preprocess)
X_train = ngram_vectorizer.transform(train_preprocess)
X_test = ngram_vectorizer.transform(test_preprocess)

##splitting data 
x_train, x_val, y_train, y_val = train_test_split(X_train, target, test_size=0.25)


In [21]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(x_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, lr.predict(x_val))))

Accuracy for C=0.01: 0.884
Accuracy for C=0.05: 0.89136
Accuracy for C=0.25: 0.89152
Accuracy for C=0.5: 0.89136
Accuracy for C=1: 0.89088


In [23]:
final_ngram = LogisticRegression(C=0.25)
final_ngram.fit(X_train, target)
print ("Final Accuracy: %s" % accuracy_score(target, final_ngram.predict(X_test)))
#print ("Accuracy for C=%s: %s" % (c, accuracy_score(target, lr.predict(X_test))))

Final Accuracy: 0.89768


### 4) SVM

In [24]:
from sklearn.svm import LinearSVC

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1, 2))
ngram_vectorizer.fit(train_preprocess)
X_train = ngram_vectorizer.transform(train_preprocess)
X_test = ngram_vectorizer.transform(test_preprocess)

##splitting data 
x_train, x_val, y_train, y_val = train_test_split(X_train, target, test_size=0.25)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    svm = LinearSVC(C=c)
    svm.fit(x_train, y_train)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_val, svm.predict(x_val))))

Accuracy for C=0.01: 0.892
Accuracy for C=0.05: 0.89008




Accuracy for C=0.25: 0.88976
Accuracy for C=0.5: 0.8904
Accuracy for C=1: 0.89024


In [25]:
final_svm_ngram = LinearSVC(C=0.5)
final_svm_ngram.fit(X_train, target)
print ("Final Accuracy: %s" 
% accuracy_score(target, final_svm_ngram.predict(X_test)))

Final Accuracy: 0.89412


