## <span style="color:Orange">Sentiment Analysis of Movie Reviews (Logistic Regression)</span>

In [None]:
from sklearn.datasets import load_files

# NOTE(review): the variables are named *_train, yet the path points at the
# `test/` folder of the corpus -- confirm which split is intended here.
reviews_train = load_files("D:/Downloads/aclImdb/aclImdb/test/")
# The returned bunch exposes the documents as `.data` and their labels
# as `.target`.
text_train = reviews_train.data
y_train = reviews_train.target
print("type of text_train: {}".format(type(text_train)))

In [None]:
# Report how many documents were loaded into text_train.
print("length of text_train: {}".format(len(text_train)))

In [None]:
# Show the document at index 1 as a sample of the raw review text.
print("text_train[1]:\n{}".format(text_train[1]))

In [None]:
# Replace the HTML line breaks embedded in the (bytes) reviews with a space.
text_train = [
    review.replace(b"<br />", b" ")
    for review in text_train
]

#### The dataset was collected such that the positive and negative classes are balanced, so there are as many positive as negative strings:

In [None]:
import numpy as np
# np.bincount tallies how many entries of y_train carry each integer label.
print("Samples per class (training): {}".format(np.bincount(y_train)))

#### We load the test dataset in the same manner:

In [None]:
# NOTE(review): the variables are named *_test, yet the path points at the
# `train/` folder of the corpus -- confirm which split is intended here.
# NOTE(review): load_files turns every sub-folder into a class; presumably any
# extra folder (e.g. an unlabeled one) was removed beforehand -- verify.
reviews_test = load_files("D:/Downloads/aclImdb/aclImdb/train/")
text_test = reviews_test.data
y_test = reviews_test.target
print("Number of documents in test data: {}".format(len(text_test)))
print("Samples per class (test): {}".format(np.bincount(y_test)))
# Same <br /> clean-up as performed on text_train.
text_test = [
    review.replace(b"<br />", b" ")
    for review in text_test
]

### <span style="color:OrangeRed">Representing Text Data as a Bag of Words</span>

#### <span style="color:Magenta">Bag-of-Words for Movie Reviews</span>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training reviews, then encode those same
# reviews as a bag-of-words matrix.
vect = CountVectorizer()
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train:\n{}".format(repr(X_train)))

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version offers it and fall
# back to the legacy method otherwise, so the cell runs on either.
if hasattr(vect, "get_feature_names_out"):
    # The newer method returns an ndarray; convert it so the slices below
    # still print as plain Python lists.
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print("Number of features: {}".format(len(feature_names)))
print("First 20 features:\n{}".format(feature_names[:20]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 2000th feature:\n{}".format(feature_names[::2000]))

#### Let’s start by evaluating LogisticRegression using cross-validation:

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Five-fold cross-validation of a default LogisticRegression on the
# bag-of-words features.
scores = cross_val_score(
    estimator=LogisticRegression(),
    X=X_train,
    y=y_train,
    cv=5,
)
print("Mean cross-validation accuracy: {:.2f}".format(scores.mean()))

#### We obtain a mean cross-validation score of 88%, which indicates reasonable performance for a balanced binary classification task. We know that LogisticRegression has a regularization parameter, C, which we can tune via cross-validation:

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the regularization parameter C.
param_grid = dict(C=[0.001, 0.01, 0.1, 1, 10])
# Five-fold grid search over C; fit() returns the fitted search object.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

#### We obtain a cross-validation score of 89% using `C=0.1`. We can now assess the generalization performance of this parameter setting on the test set:

In [None]:
# Encode the test reviews with the vectorizer currently bound to `vect`.
X_test = vect.transform(text_test)
# Score the fitted grid search on the encoded test data, shown to two decimals.
print("{:.2f}".format(grid.score(X_test, y_test)))

In [None]:
# Rebuild the vocabulary with min_df=5 and re-encode the training reviews.
# Note that this rebinds both `vect` and `X_train`.
vect = CountVectorizer(min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with min_df: {}".format(repr(X_train)))

#### By requiring at least five appearances of each token, we can bring down the number of features to 27,271, as seen in the preceding output—only about a third of the original features. Let’s look at some tokens again:

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when the installed version offers it and fall
# back to the legacy method otherwise, so the cell runs on either.
if hasattr(vect, "get_feature_names_out"):
    # The newer method returns an ndarray; convert it so the slices below
    # still print as plain Python lists.
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()

print("First 50 features:\n{}".format(feature_names[:50]))
print("Features 20010 to 20030:\n{}".format(feature_names[20010:20030]))
print("Every 700th feature:\n{}".format(feature_names[::700]))

#### There are clearly many fewer numbers, and some of the more obscure words or misspellings seem to have vanished. Let’s see how well our model performs by doing a grid search again:

In [None]:
# Repeat the five-fold search over C on the min_df-filtered features.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

### <span style="color:OrangeRed">Stopwords</span>

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Size of the built-in English stop-word collection.
print("Number of stop words: {}".format(len(ENGLISH_STOP_WORDS)))
# Keep every tenth entry, in the collection's iteration order.
print("Every 10th stopword:\n{}".format(
    [word for position, word in enumerate(ENGLISH_STOP_WORDS) if position % 10 == 0]
))

#### Clearly, removing the stopwords in the list can only decrease the number of features by the length of the list—here, 318—but it might lead to an improvement in performance. Let’s give it a try:

In [None]:
# stop_words="english" selects the built-in stop-word list; a custom
# (for example, extended) list could be supplied instead.
vect = CountVectorizer(stop_words="english", min_df=5)
vect.fit(text_train)
X_train = vect.transform(text_train)
print("X_train with stop words:\n{}".format(repr(X_train)))

#### There are now 305 (27,271–26,966) fewer features in the dataset, which means that most, but not all, of the stopwords appeared. Let’s run the grid search again:

In [None]:
# Repeat the five-fold search over C on the stop-word-filtered features.
grid = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

#### The grid search performance decreased slightly using the stopwords—not enough to worry about, but given that excluding 305 features out of over 27,000 is unlikely to change performance or interpretability a lot, it doesn’t seem worth using this list.

#### Fixed lists are mostly helpful for small datasets, which might not contain enough information for the model to determine which words are stopwords from the data itself. 

#### As an exercise, you can try out the other approach, discarding frequently appearing words, by setting the max_df option of CountVectorizer and see how it influences the number of features and the performance.

### <span style="color:OrangeRed">Rescaling the Data with tf–idf</span>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline

# Vectorizer and classifier live in one pipeline, so the grid search is fed
# the raw texts and each cross-validation split runs the full pipeline.
pipe = make_pipeline(
    TfidfVectorizer(min_df=5, norm=None),
    LogisticRegression(),
)
# make_pipeline names each step after its lowercased class, hence the
# "logisticregression__" prefix. Note that this rebinds `param_grid`.
param_grid = dict(logisticregression__C=[0.001, 0.01, 0.1, 1, 10])
grid = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5).fit(text_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))

## Rescaling data with TF-IDF


In [None]:
# Word-level TF-IDF over 1- to 3-grams, English stop words removed, capped
# at 10000 features; fitted on the training texts only.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
    strip_accents='unicode',
    sublinear_tf=True,
    max_features=10000,
).fit(text_train)

# Encode both splits with the fitted vectorizer.
tfidf_train = word_vectorizer.transform(text_train)
tfidf_test = word_vectorizer.transform(text_test)

In [None]:
# Dimensions of the TF-IDF matrices for the two splits.
print('Shape of tfidf_train:',tfidf_train.shape)
print('Shape of tfidf_test:',tfidf_test.shape)

In [None]:
# Dimensions of the label arrays for the two splits.
print('Shape of y_train:',y_train.shape)
print('Shape of y_test:',y_test.shape)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_roc_curve
from sklearn import *


In [None]:
def metrics(model,x,y):
    """Print accuracy and a classification report, and plot the confusion matrix.

    Parameters
    ----------
    model : fitted classifier exposing ``predict``.
    x : feature matrix to evaluate on.
    y : true labels corresponding to the rows of ``x``.

    Returns
    -------
    None. Results are printed and plotted as side effects.

    NOTE(review): the imports cell runs ``from sklearn import *``, which
    presumably also binds a module named ``metrics``; re-running that cell
    after this definition would shadow this function -- confirm.
    """
    y_pred = model.predict(x)

    acc = accuracy_score(y, y_pred)
    print("\nAccuracy: ",round(acc,2))
    # The third argument is the ground truth. Passing y_pred here would
    # compare the model's predictions with themselves and always draw a
    # perfect diagonal matrix.
    plot_confusion_matrix(model,x,y)
    print(classification_report(y, y_pred))



## Logistic Regression with TF-IDF 


In [None]:
# Fit a default logistic regression on the TF-IDF training matrix.
classifier = LogisticRegression().fit(tfidf_train, y_train)
# Report on the TF-IDF test matrix, then draw the ROC curve for the same data.
metrics(classifier, tfidf_test, y_test)
plot_roc_curve(classifier, tfidf_test, y_test)


## MultinomialNB with TF-IDF


In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes trained on the TF-IDF training matrix and
# evaluated on the TF-IDF test matrix.
model = MultinomialNB().fit(tfidf_train, y_train)
metrics(model, tfidf_test, y_test)

## Random Forest with TF-IDF

In [None]:
from sklearn.ensemble import RandomForestClassifier as rfm

# Fix the seed: a random forest is stochastic, so without it the reported
# metrics change from one run of the notebook to the next.
modelr = rfm(random_state=0)

modelr.fit(tfidf_train, y_train)
metrics(modelr, tfidf_test, y_test)


In [None]:
# Majority ("hard") voting ensemble of the three classifiers defined above.
from sklearn.ensemble import VotingClassifier

eclf = VotingClassifier(
    estimators=[('lr', classifier), ('rf', modelr), ('mnb', model)],
    voting='hard',
)
# Train and evaluate on the TF-IDF matrices, like the individual models.
# X_train and X_test cannot be used: by this point they come from different
# CountVectorizer fits (X_test predates the min_df/stop-word refits), so
# their feature columns do not line up.
eclf.fit(tfidf_train, y_train)
# Use the metrics() helper and the eclf object defined in this notebook;
# the names metrics1 and eclf1 were never defined.
metrics(eclf, tfidf_test, y_test)