# Text Classification on Noisy IMDB Dataset using TPOT

|Specification|Value|
|----|----|
|AutoML Algorithm|TPOT|
|Task|Text Classification|
|Dataset|IMDB Moview Review|
|Dataset Clean|No|

## Load Packages

In [28]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
random_state = 42

In [29]:
from tpot import TPOTClassifier
from tpot.export_utils import set_param_recursive

In [30]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yogesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yogesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [62]:
imdb_df = pd.read_csv("../datasets/noisy/imdb_noisy.csv", index_col=0)

In [63]:
# Sample data for experimentation
imdb_df = imdb_df.sample(frac=0.3, replace=False, random_state=random_state)
imdb_df.reset_index(inplace=True)

In [64]:
imdb_df.rename(columns={'sentiment':'target'}, inplace=True)
imdb_df['target'] = imdb_df['target'].apply(lambda x: 1 if x =='positive' else 0)

In [65]:
def preprocess_text(text):
  tokens = nltk.word_tokenize(text)
  # Remove special characters
  tokens = [word for word in tokens if word.isalpha()]
  # Convert to lower case
  tokens = [word.lower() for word in tokens]
  # Remove stopwords
  stop_words = set(stopwords.words('english'))
  tokens = [word for word in tokens if word not in stop_words]
  # Lemmatize token
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(word) for word in tokens]
  
  return ''.join(tokens)

In [66]:
imdb_df['processed_review'] = imdb_df['review'].apply(preprocess_text)

In [67]:
X_train, X_test, y_train, y_test = train_test_split(imdb_df.loc[:, 'processed_review'], 
                                                    imdb_df.loc[:, 'target'], 
                                                    train_size=0.75, test_size=0.25, random_state=random_state)

In [77]:
# See https://github.com/EpistasisLab/tpot/issues/544
vectorizer = TfidfVectorizer(max_features=50)
X_train = vectorizer.fit_transform(X_train).toarray()
X_test = vectorizer.transform(X_test).toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

## Train the Model

In [72]:
tpot = TPOTClassifier(generations=5, population_size=50, verbosity=2, random_state=42)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: 0.5072

Generation 2 - Current best internal CV score: 0.5072

Generation 3 - Current best internal CV score: 0.5072

Generation 4 - Current best internal CV score: 0.5072

Generation 5 - Current best internal CV score: 0.5072

Best pipeline: BernoulliNB(input_matrix, alpha=1.0, fit_prior=True)
0.5037333333333334


In [73]:
tpot.export('best_model_pipeline.py')

In [79]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [80]:
# Average CV score on the training set was: 0.9826086956521738
# exported_pipeline = make_pipeline(
#     Normalizer(norm="l2"),
#     KNeighborsClassifier(n_neighbors=3, p=1, weights="distance")
# )
exported_pipeline = make_pipeline(
    Normalizer(norm="l2"),
    BernoulliNB(alpha=1.0, fit_prior=True)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)

In [82]:
exported_pipeline.score(X_test, y_test)

0.5037333333333334