# Text Classification on Clean IMDB Dataset using TPOT

|Specification|Value|
|----|----|
|AutoML Algorithm|TPOT|
|Task|Text Classification|
|Dataset|IMDB Movie Review|
|Dataset Clean|Yes|

## Load Packages

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed passed to train_test_split so the train/test partition is reproducible.
random_state = 42

In [11]:
from tpot import TPOTClassifier
from tpot.export_utils import set_param_recursive

In [12]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch every NLTK resource the notebook relies on so it runs on a fresh machine.
# nltk.word_tokenize (called in preprocess_text) needs the Punkt tokenizer data:
# older NLTK releases look for 'punkt', newer ones for 'punkt_tab'. Requesting a
# package the installed index does not know only logs an error and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word list and WordNet data used for stop-word removal and lemmatization.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yogesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yogesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Read the pre-cleaned IMDB review dataset from its location relative to the notebook.
imdb_csv_path = "../datasets/clean/imdb.csv"
imdb_df = pd.read_csv(imdb_csv_path)

In [14]:
# Rename the label column to 'target' and encode it: 'positive' -> 1, anything else -> 0.
# The guard on the raw 'sentiment' column makes the cell safe to re-run: without it,
# a second execution would compare the already-encoded 0/1 values against
# 'positive' and silently overwrite every label with 0.
if 'sentiment' in imdb_df.columns:
    imdb_df = imdb_df.rename(columns={'sentiment': 'target'})
    imdb_df['target'] = (imdb_df['target'] == 'positive').astype('int64')

In [15]:
# Hold out 25% of the rows for evaluation; the remaining 75% are used for training.
# Features are every column except 'target'; the seed keeps the partition reproducible.
is_feature_column = imdb_df.columns != 'target'
X_train, X_test, y_train, y_test = train_test_split(
    imdb_df.loc[:, is_feature_column],
    imdb_df.loc[:, 'target'],
    train_size=0.75,
    test_size=0.25,
    random_state=random_state,
)

## Train the Model

In [16]:
def preprocess_text(text):
    """Normalize a raw review into a space-separated string of lemmatized tokens.

    Steps, applied per token in this order:
      1. tokenize with ``nltk.word_tokenize``;
      2. drop tokens that are not purely alphabetic (punctuation, numbers, ...);
      3. lower-case;
      4. drop English stop words;
      5. lemmatize with ``WordNetLemmatizer``.

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces, so that a downstream
        whitespace/word-boundary tokenizer (e.g. ``TfidfVectorizer``) can
        recover the individual words. An empty string if no token survives.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    tokens = []
    for word in nltk.word_tokenize(text):
        # Remove special characters (checked on the original-case token).
        if not word.isalpha():
            continue
        # Convert to lower case before the stop-word lookup (the list is lower-case).
        word = word.lower()
        # Remove stopwords.
        if word in stop_words:
            continue
        # Lemmatize token.
        tokens.append(lemmatizer.lemmatize(word))

    # Join with a space: joining with '' would glue every token into one giant
    # "word", leaving the vectorizer with a single unique term per review.
    return ' '.join(tokens)

In [17]:
# Clean every raw review with preprocess_text and keep the result next to the original text.
raw_reviews = imdb_df['review']
imdb_df['processed_review'] = raw_reviews.apply(preprocess_text)

In [20]:
# Turn the processed reviews into TF-IDF features (500 most frequent terms).
#
# The vocabulary and IDF weights are learned from the TRAINING rows only and then
# re-used to transform the test rows. Fitting on the whole dataset (and training on
# all of it) would leak the held-out reviews into the model and make the test score
# meaningless; leaving X_test un-vectorized would make scoring fail outright.
#
# Rows are selected through the index of y_train / y_test (produced by
# train_test_split), which are never overwritten here, so the cell can be re-run.
vectorizer = TfidfVectorizer(max_features=500)
train_reviews = imdb_df.loc[y_train.index, 'processed_review']
test_reviews = imdb_df.loc[y_test.index, 'processed_review']
X_train = vectorizer.fit_transform(train_reviews).toarray()
X_test = vectorizer.transform(test_reviews).toarray()

In [21]:
# Let TPOT search for a pipeline (5 generations, 50 candidates per generation),
# then print the score of the best pipeline found on the held-out split.
# The seed comes from the notebook-level `random_state` constant so the whole
# notebook is controlled by a single value instead of a duplicated literal.
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    verbosity=2,
    random_state=random_state,
)
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))

Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]

In [None]:
# tpot.export('best_model_pipeline.py')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [None]:
# Pipeline reproduced from a TPOT export: L2-normalize each sample, then classify
# with a distance-weighted 3-nearest-neighbours model using the Manhattan metric (p=1).
# Average CV score on the training set was: 0.9826086956521738
exported_pipeline = make_pipeline(
    Normalizer(norm="l2"),
    KNeighborsClassifier(n_neighbors=3, p=1, weights="distance"),
)
# Fix random state for all the steps in exported pipeline, re-using the
# notebook-level seed rather than a second hard-coded literal.
set_param_recursive(exported_pipeline.steps, 'random_state', random_state)

# Refit on the training data and keep the predictions for the held-out split.
exported_pipeline.fit(X_train, y_train)
results = exported_pipeline.predict(X_test)

In [None]:
# Score the fitted exported pipeline on the held-out split; shown as the cell's output.
exported_pipeline.score(X_test, y_test)