In [1]:
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
df = pd.read_csv("imdb_dataset.csv") 
print (len(df))

50000


In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
print (df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [5]:
reviews = list(df["review"])
sentiments = list(df["sentiment"])
print (set(sentiments))

{'positive', 'negative'}


In [7]:
balanced_reviews = reviews
balanced_sentiments = []
for x in sentiments:
    if x == 'positive':
        balanced_sentiments.append(1)
    else:
        balanced_sentiments.append(0)
print (len(balanced_reviews))
print (len(balanced_sentiments))
print (sum(balanced_sentiments))

50000
50000
25000


In [8]:
# positive_reviews = [r for r, s in zip(reviews, sentiments) if s == "positive"]
# negative_reviews = [r for r, s in zip(reviews, sentiments) if s == "negative"]

# pos_sample_size = int(0.60 * len(positive_reviews))
# neg_sample_size = int(0.60 * len(negative_reviews))

# sampled_positive = random.sample(positive_reviews, pos_sample_size)
# sampled_negative = random.sample(negative_reviews, neg_sample_size)

# balanced_reviews = sampled_positive + sampled_negative
# balanced_sentiments = ["positive"] * pos_sample_size + ["negative"] * neg_sample_size

# combined = list(zip(balanced_reviews, balanced_sentiments))
# random.shuffle(combined)
# balanced_reviews, balanced_sentiments = zip(*combined)

# balanced_reviews = list(balanced_reviews)
# balanced_sentiments = list(balanced_sentiments)

# print (len(balanced_reviews))
# print (len(balanced_sentiments))

In [9]:
X_train, X_temp, y_train, y_temp = train_test_split(balanced_reviews, balanced_sentiments, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)
print (len(X_train), len(y_train))
print (len(X_val), len(y_val))
print (len(X_test), len(y_test))

30000 30000
10000 10000
10000 10000


In [10]:
vectorizer = TfidfVectorizer(stop_words="english")
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

82225


In [11]:
vectorizer = TfidfVectorizer(stop_words="english", max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

10000


In [12]:
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.95, min_df=0.05)
X_train_tfidf = vectorizer.fit_transform(X_train)
print (len(vectorizer.vocabulary_))

252


In [13]:
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,3), max_df=0.95, min_df=0.025)
X_train_tfidf = vectorizer.fit_transform(X_train, )
print (len(vectorizer.vocabulary_))

636


In [14]:
# vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
# X_train_tfidf = vectorizer.fit_transform(X_train)
# print (len(vectorizer.vocabulary_))

In [15]:
#vectorizer.vocabulary_

In [16]:
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [17]:
X_val_tfidf

<10000x636 sparse matrix of type '<class 'numpy.float64'>'
	with 409472 stored elements in Compressed Sparse Row format>

In [18]:
#X_val_tfidf[9999].data

In [19]:
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

LogisticRegression()

In [20]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", val_accuracy)

Trainn Accuracy: 0.8542666666666666


In [21]:
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)

Validation Accuracy: 0.8505


In [22]:
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.8467


In [23]:
model = RandomForestClassifier(n_estimators=30)
model.fit(X_train_tfidf, y_train)

RandomForestClassifier(n_estimators=30)

In [24]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.9997
Validation Accuracy: 0.808
Test Accuracy: 0.8104


In [25]:
model = RandomForestClassifier(n_estimators=100, max_depth=20)
model.fit(X_train_tfidf, y_train)

RandomForestClassifier(max_depth=20)

In [26]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.9065333333333333
Validation Accuracy: 0.8113
Test Accuracy: 0.8067


In [27]:
model = KNeighborsClassifier(n_neighbors=7)
model.fit(X_train_tfidf, y_train)

KNeighborsClassifier(n_neighbors=7)

In [28]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.7945333333333333
Validation Accuracy: 0.7083
Test Accuracy: 0.7023


In [39]:
model = XGBClassifier(n_estimators=50, max_depth=5)
model.fit(X_train_tfidf, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=5, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=50, n_jobs=None,
              num_parallel_tree=None, random_state=None, ...)

In [40]:
y_train_pred = model.predict(X_train_tfidf)
val_accuracy = accuracy_score(y_train, y_train_pred)
print("Train Accuracy:", val_accuracy)
y_val_pred = model.predict(X_val_tfidf)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation Accuracy:", val_accuracy)
y_test_pred = model.predict(X_test_tfidf)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test Accuracy:", test_accuracy)

Trainn Accuracy: 0.8655333333333334
Validation Accuracy: 0.821
Test Accuracy: 0.8144


In [42]:
len(y_test_pred)

10000

In [43]:
def ensamble():
    model = LogisticRegression()
    model.fit(X_train_tfidf, y_train)
    y_pred1 = model.predict(X_test_tfidf)
    
    model = RandomForestClassifier(n_estimators=100, max_depth=20)
    model.fit(X_train_tfidf, y_train)
    y_pred2 = model.predict(X_test_tfidf)
    
    model = RandomForestClassifier(n_estimators=100, max_depth=20)
    model.fit(X_train_tfidf, y_train)
    y_pred3 = model.predict(X_test_tfidf)
    
    y_pred = []
    for i in range(len(y_pred1)):
        if y_pred1[i] + y_pred2[i] + y_pred3[i] >= 2:
            y_pred.append(1)
        else:
            y_pred.append(0)
            
    acc = accuracy_score(y_test, y_pred)
    return acc

In [44]:
acc = ensamble()
print ("Enemble acc:", acc)

Enemble acc: 0.818
