In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, HashingVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

df = pd.read_csv('repeated_letters_words_v9.csv')

In [2]:
ngrams = [(2, i) for i in range(2, 15)]

df = df.sample(frac = 1)

pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    # ('hashing', HashingVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

parameters = {
    # 'vect__ngram_range': ngrams,
    'vect': [CountVectorizer(analyzer='char')],   # CountVectorizer(analyzer='char_wb')
    'tfidf__use_idf': [(True, True), (False, False), (True, False), (False, True)],
    'clf': [MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier(), SVC(), KNeighborsClassifier()],
}

X_train, X_test, y_train, y_test = train_test_split(df['emphasized'], df['actual'], test_size=0.2, random_state=42)

grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Model:", best_model)
print("Best Parameters:", best_params)

y_pred = best_model.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Best Model: Pipeline(steps=[('vect', CountVectorizer(analyzer='char')),
                ('tfidf', TfidfTransformer(use_idf=(False, False))),
                ('clf', RandomForestClassifier())])
Best Parameters: {'clf': RandomForestClassifier(), 'tfidf__use_idf': (False, False), 'vect': CountVectorizer(analyzer='char')}
Classification Report:
               precision    recall  f1-score   support

         all       1.00      1.00      1.00         9
     amazing       1.00      1.00      1.00        22
     awesome       1.00      1.00      1.00        16
         bad       1.00      1.00      1.00        10
   beautiful       1.00      1.00      1.00        19
      better       1.00      1.00      1.00        18
      boring       1.00      1.00      1.00        14
         but       1.00      1.00      1.00         4
        cool       1.00      1.00      1.00         8
        cute       1.00      1.00      1.00        11
   delicious       1.00      1.00      1.00        25
   exce

In [8]:
best_model.predict(["shttttttttt"])[0]

NameError: name 'best_model' is not defined

In [None]:
with open('model_v11.pkl', 'wb') as file:
    pickle.dump(best_model, file)

In [7]:
import pickle
with open('model_v10.pkl', 'rb') as file:
    model = pickle.load(file)

repeating_word = 'puuuuuuuuuuurfctttttt'
model.predict([repeating_word])[0]

'perfect'