In [55]:
import pandas as pd
import numpy as np
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.pipeline import Pipeline

In [56]:
df=pd.read_csv('./IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [57]:
#df=df.sample(10000)

In [58]:
df['sentiment']=df['sentiment'].replace({'positive':1,'negative':0})

In [59]:
X=df['review']
y=df['sentiment']

In [60]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [61]:
def clean_html(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def convert_lower(text):
    return text.lower()

def remove_special(text):
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in text.split() if word not in stop_words])

def stem_words(text):
    ps = PorterStemmer()
    return ' '.join([ps.stem(word) for word in text.split()])

In [62]:
preprocessing = Pipeline(steps=[
    ('html_clean', FunctionTransformer(lambda x: [clean_html(text) for text in x], validate=False)),
    ('lowercase', FunctionTransformer(lambda x: [convert_lower(text) for text in x], validate=False)),
    ('remove_special_chars', FunctionTransformer(lambda x: [remove_special(text) for text in x], validate=False)),
    ('remove_stopwords', FunctionTransformer(lambda x: [remove_stopwords(text) for text in x], validate=False)),
    ('stemming', FunctionTransformer(lambda x: [stem_words(text) for text in x], validate=False))
])

In [63]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)

In [25]:
pipeline=Pipeline([('preprocessing',preprocessing),
                   ('countVectorization',CountVectorizer(max_features=3000)),
                   ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
                  ('gaussian',MultinomialNB())])

In [26]:
pipeline.fit(X_train,y_train)

In [27]:
y_pred=pipeline.predict(X_test)

In [47]:
from sklearn.metrics import accuracy_score

In [29]:
print(accuracy_score(y_test,y_pred))

0.8417


In [None]:
pipeline=Pipeline([('preprocessing',preprocessing),
                   ('countVectorization',CountVectorizer(max_features=1000)),
                   ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
                  ('gaussian',RandomForestClassifier())])

In [None]:
pipeline.fit(X_train,y_train)


In [None]:
y_pred=pipeline.predict(X_test)


In [None]:
print(accuracy_score(y_test,y_pred))


In [64]:
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('count_vectorizer', CountVectorizer(max_features=3000)),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('classifier', RandomForestClassifier())
])

In [65]:
param_grid = {
    'classifier__n_estimators': [100, 200],           # Number of trees in the forest
    'classifier__max_depth': [10, 20, None],           # Maximum depth of the tree
    'classifier__min_samples_split': [2, 5, 10],           # Minimum number of samples required to split a node
    'classifier__min_samples_leaf': [1, 2, 4],             # Minimum number of samples required at each leaf node
    'classifier__bootstrap': [True, False],                # Whether to use bootstrap samples
}

In [None]:
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=1, verbose=2)

# Fit the GridSearchCV pipeline to the training data
grid_search.fit(X_train, y_train)

# Print out the best parameters found by GridSearchCV
print(f"Best parameters found: {grid_search.best_params_}")

# Evaluate the performance on the test set
best_model = grid_search.best_estimator_
test_predictions = best_model.predict(X_test)

# Print the accuracy
from sklearn.metrics import accuracy_score
print(f"Test Accuracy: {accuracy_score(y_test, test_predictions):.4f}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time= 2.7min
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time= 2.7min
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=100; total time= 2.8min
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time= 3.0min
[CV] END classifier__bootstrap=True, classifier__max_depth=10, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=200; total time= 3.0min
[CV] END classifier__bootstrap=True, classifier__max