In [1]:
import pandas as pd

In [2]:
# Load the reviews CSV into a DataFrame and print its first five rows
# as a quick sanity check of what was read.
df = pd.read_csv('reviews_dataset2.csv')
print(df.head(n=5))

                                                text  label
0  one best crichton novel sphere michael crichto...      1
1  medicine future z accomplished heart surgeon f...      1
2  beautiful gorgeous network comic book contains...      1
3  lover robicheaux book lover robicheaux demon s...      1
4  excellent broad survey development civilizatio...      1


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the
# split identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.3,
    random_state=42,
)

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import multiprocessing

from sklearn.base import clone

# Text featurizers to compare.
vectorizers = [
    TfidfVectorizer(max_features=7000),
    CountVectorizer(),
    # NOTE(review): alternate_sign=False presumably keeps hashed features
    # non-negative so MultinomialNB can consume them - confirm.
    HashingVectorizer(n_features=2**20, alternate_sign=False)
]

# Classifiers to compare; the tree ensembles are seeded for repeatability.
classifiers = [
    MultinomialNB(),
    LogisticRegression(max_iter=1000),
    SVC(kernel='linear'),
    RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoostingClassifier(n_estimators=100, random_state=42),
]

# (name, pipeline) pairs for every vectorizer x classifier combination.
models = []

for vectorizer in vectorizers:
    for classifier in classifiers:
        # Pipeline fits its steps in place, so each pipeline needs its own
        # unfitted copies. Sharing one vectorizer/classifier instance across
        # pipelines would let a later fit overwrite an earlier one whenever
        # the pipelines are trained inside a single process.
        model = Pipeline([
            ('vectorizer', clone(vectorizer)),
            ('classifier', clone(classifier))
        ])

        # The name is the two class names concatenated,
        # e.g. "TfidfVectorizerMultinomialNB".
        models.append((type(vectorizer).__name__ + type(classifier).__name__, model))

def train_model(name_model):
    """Fit one named pipeline and print its accuracy on the held-out split.

    Parameters
    ----------
    name_model : tuple
        A ``(name, model)`` pair; ``model`` must provide ``fit`` and
        ``predict``.

    Returns
    -------
    tuple
        ``(name, model)`` with ``model`` fitted on the training split.

    Notes
    -----
    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and
    ``y_test`` rather than taking the data as arguments.
    """
    label, pipeline = name_model
    pipeline.fit(x_train, y_train)
    predictions = pipeline.predict(x_test)
    print(f'For {label}:')
    accuracy = accuracy_score(y_test, predictions)
    print(f'Accuracy: {accuracy}')
    return (label, pipeline)

def train_models(models):
    """Train every ``(name, model)`` pair in one chunk, in order.

    Parameters
    ----------
    models : iterable of tuple
        The ``(name, model)`` pairs to pass to ``train_model`` one by one.

    Returns
    -------
    list of tuple
        The ``(name, model)`` pairs returned by ``train_model``, in input
        order.
    """
    results = []
    for entry in models:
        results.append(train_model(entry))
    print('Next chunk of models was finished')
    return results

# Number of pipelines handed to a worker per task; 1 means every pipeline
# is its own task.
chunk_size = 1
chunk_starts = range(0, len(models), chunk_size)
chunks = [models[start:start + chunk_size] for start in chunk_starts]

# Train the chunks in a process pool. pool.map returns one list of
# (name, model) pairs per chunk, in the same order as `chunks`.
with multiprocessing.Pool() as pool:
    result_models = pool.map(train_models, chunks)


For CountVectorizerMultinomialNB:
Accuracy: 0.8710440418184399
Next chunk of models was finished
For TfidfVectorizerMultinomialNB:
Accuracy: 0.8492833151993945
Next chunk of models was finished
For HashingVectorizerMultinomialNB:
Accuracy: 0.810255925067411
Next chunk of models was finished
For TfidfVectorizerLogisticRegression:
Accuracy: 0.9025970954160556
Next chunk of models was finished
For HashingVectorizerLogisticRegression:
Accuracy: 0.9003737168267184
Next chunk of models was finished
For CountVectorizerLogisticRegression:
Accuracy: 0.9003737168267184
Next chunk of models was finished
For CountVectorizerGradientBoostingClassifier:
Accuracy: 0.8560480628222716
Next chunk of models was finished
For TfidfVectorizerRandomForestClassifier:
Accuracy: 0.8744027626661621
Next chunk of models was finished
For TfidfVectorizerGradientBoostingClassifier:
Accuracy: 0.8594067836699939
Next chunk of models was finished
For CountVectorizerRandomForestClassifier:
Accuracy: 0.8519324471356261
Ne

In [33]:
# save best model to file
import pickle

# Name of the pipeline to persist; it also serves as the pickle file's
# base name so the two cannot drift apart.
best_model_name = "HashingVectorizerSVC"

# result_models holds one list of (name, model) pairs per trained chunk.
print(len(result_models))

# Visit every pair in every chunk (not just the first pair of each chunk),
# so the lookup still works if chunk_size is raised above 1. The outer loop
# variable is deliberately not called `models`, to avoid overwriting the
# list of pipelines built in the training cell.
for chunk_results in result_models:
    for name, model in chunk_results:
        print(name)
        if name == best_model_name:
            print("Found!")
            with open(f'{best_model_name}.pkl', 'wb') as file:
                pickle.dump(model, file)

15
TfidfVectorizerMultinomialNB
TfidfVectorizerLogisticRegression
TfidfVectorizerSVC
TfidfVectorizerRandomForestClassifier
TfidfVectorizerGradientBoostingClassifier
CountVectorizerMultinomialNB
CountVectorizerLogisticRegression
CountVectorizerSVC
CountVectorizerRandomForestClassifier
CountVectorizerGradientBoostingClassifier
HashingVectorizerMultinomialNB
HashingVectorizerLogisticRegression
HashingVectorizerSVC
Found!
HashingVectorizerRandomForestClassifier
HashingVectorizerGradientBoostingClassifier


In [None]:
# using GridSearchCV to try to find best parameters

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Build the pipeline to tune explicitly instead of relying on a `model`
# variable left over from earlier cells: the loops there leave `model` bound
# to whichever pipeline they visited last, and GridSearchCV rejects the grid
# below unless the 'vectorizer' step has `max_features` and the 'classifier'
# step has `C`.
grid_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000)),
])

# Keys use the "<step name>__<parameter>" convention to reach into the
# pipeline's steps.
param_grid = {
    'vectorizer__max_features': [100, 500, 1000, 2000],
    'classifier__C': [0.01, 0.1, 1, 10]
}

# 5-fold cross-validated search over all 16 combinations, scored by accuracy.
grid_search = GridSearchCV(grid_pipeline, param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

print(f"The best parameters: {grid_search.best_params_}")
# Evaluate the search's predictions on the held-out test split.
y_pred = grid_search.predict(x_test)
print(classification_report(y_test, y_pred))

In [4]:
# using StackingClassifier to mix most successful models in one
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import StackingClassifier

# Base learners whose outputs feed the stacking meta-classifier.
estimators = [
    ('lr', LogisticRegression(max_iter=1000)),
    # probability=True makes SVC fit an internal cross-validated probability
    # calibration that shuffles the data; pin random_state so the reported
    # accuracy is repeatable, matching the seed used elsewhere in the notebook.
    ('svm', SVC(kernel='linear', probability=True, random_state=42))
]

# TF-IDF features followed by the stacked ensemble (default final estimator).
model_stacking = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=7000)),
    ('classifier', StackingClassifier(estimators=estimators))
])

model_stacking.fit(x_train, y_train)
y_pred = model_stacking.predict(x_test)
print("Stacking Classifier")
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

NameError: name 'LogisticRegression' is not defined