In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential

# create a dictionary of words with repeating letters and their corresponding actual word
repeating_words = {'goooood': 'good', 'hooome': 'home', 'reeed': 'red', 'coool': 'cool'}

# create lists of repeating words and their corresponding actual words
repeating_word_list = list(repeating_words.keys())
actual_word_list = list(repeating_words.values())

# tokenize the repeating word list
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(repeating_word_list)
sequences = tokenizer.texts_to_sequences(repeating_word_list)

# pad the sequences to have the same length
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# create a sequential model
model = Sequential()

# add an embedding layer to learn the context of the words
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=max_length))

# add a LSTM layer to capture the sequence of the letters in the word
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))

# add another LSTM layer to further capture the sequence of the letters
model.add(LSTM(32))
model.add(Dropout(0.2))

# add a dense output layer with softmax activation to predict the actual word
model.add(Dense(len(actual_word_list), activation='softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(padded_sequences, tf.keras.utils.to_categorical(actual_word_list), epochs=10, batch_size=16)

# use the model to predict the actual word for a repeating word
test_word = 'goooood'
test_sequence = tokenizer.texts_to_sequences([test_word])
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
predicted_index = model.predict_classes(test_padded_sequence)
predicted_word = actual_word_list[predicted_index[0]]
print(f'Predicted actual word for {test_word}: {predicted_word}')

ValueError: invalid literal for int() with base 10: 'good'

In [3]:
repeating_words = {'good': ['goooood', 'gggood', 'gud'],
                   'home': ['hooome', 'hooom', 'hoome'],
                   'red': ['reeed', 'rrred', 'rd'],
                   'cool': ['coool', 'ccool', 'kool']}

repeating_word_list = []
actual_word_list = []

for actual_word, repeating_words in repeating_words.items():
    repeating_word_list += repeating_words
    actual_word_list += [actual_word] * len(repeating_words)

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(repeating_word_list)
sequences = tokenizer.texts_to_sequences(repeating_word_list)

max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=max_length))
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(len(set(actual_word_list)), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(padded_sequences, tf.keras.utils.to_categorical(actual_word_list), epochs=10, batch_size=16)

test_word = 'goooood'
test_sequence = tokenizer.texts_to_sequences([test_word])
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
predicted_index = model.predict_classes(test_padded_sequence)
predicted_word = list(set(actual_word_list))[predicted_index[0]]
print(f'Predicted actual word for {test_word}: {predicted_word}')


ValueError: invalid literal for int() with base 10: 'good'

In [29]:
import pandas as pd

# Load the dataset
df = pd.read_csv('repeated_letters_words_v5.csv')

# Group the DataFrame by the unique values in 'column_name_1' and aggregate the corresponding values in 'column_name_2' as a list
# result = df.set_index('actual')['emphasized'].to_dict()
# result = df[['emphasized', 'actual']].to_dict('records')
result = dict(zip(df['emphasized'], df['actual']))

# Print the result
print(result)
# Print the result
# print(result)
# Print the result
# print(result)
# result = df.groupby('actual')['emphasized'].agg(list)

# Print the result
# print(result.to_dict())

# repeating_words = result.to_dict()
#
# repeating_word_list = []
# actual_word_list = []
#
# for actual_word, repeating_words in repeating_words.items():
#     repeating_word_list += repeating_words
#     actual_word_list += [actual_word] * len(repeating_words)

{'likkkkkkkkkkkkke': 'like', 'llllllike': 'like', 'llllllllllike': 'like', 'liiiiiiiiiiiike': 'like', 'likeeeeeeeeee': 'like', 'likkkkkke': 'like', 'likkkkkkkkkke': 'like', 'llllllllllllike': 'like', 'lllike': 'like', 'liike': 'like', 'likkkkkkkkke': 'like', 'likkkkkkkke': 'like', 'likeeeeeeeee': 'like', 'likkkke': 'like', 'likeeeeeee': 'like', 'liiiiiiiiiiiiiiike': 'like', 'lllllllllllllllike': 'like', 'liiiiiiiiiike': 'like', 'lllllllllllike': 'like', 'likkkkkkkkkkkkkke': 'like', 'likkkkke': 'like', 'likeeeeee': 'like', 'liiiike': 'like', 'likeeeeeeeeeeeee': 'like', 'llllike': 'like', 'likkkkkkkkkkkke': 'like', 'likeeeeeeeeeee': 'like', 'liiiiiiiiike': 'like', 'liiiiiiiiiiiiike': 'like', 'liiiiiiiike': 'like', 'liiiiiiiiiiiiiike': 'like', 'lllllllllike': 'like', 'likeee': 'like', 'liiiiiiiiiiike': 'like', 'likkke': 'like', 'likeeeeeeeeeeee': 'like', 'likke': 'like', 'likeeeeeeeeeeeeee': 'like', 'liiiiiiike': 'like', 'liiiiike': 'like', 'likkkkkkkkkkkkkkke': 'like', 'lllllllllllllike'

In [35]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.models import Sequential
import numpy as np
# create a dictionary of words with repeating letters and their corresponding actual word
# repeating_words = {'goooood': 'good', 'hooome': 'home', 'reeed': 'red', 'coool': 'cool'}

# create lists of repeating words and their corresponding actual words
repeating_words = result
repeating_word_list = list(repeating_words.keys())
actual_word_list = list(repeating_words.values())

# create a dictionary to map actual words to integers
actual_word_to_int = {word: i for i, word in enumerate(set(actual_word_list))}

# convert the actual word list to integers
actual_word_list_int = [actual_word_to_int[word] for word in actual_word_list]

# tokenize the repeating word list
tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(repeating_word_list)
sequences = tokenizer.texts_to_sequences(repeating_word_list)

# pad the sequences to have the same length
max_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# create a sequential model
model = Sequential()

# add an embedding layer to learn the context of the words
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=max_length))

# add a LSTM layer to capture the sequence of the letters in the word
model.add(LSTM(32, return_sequences=True))
model.add(Dropout(0.5))

# add another LSTM layer to further capture the sequence of the letters
model.add(LSTM(32))
model.add(Dropout(0.5))

# add a dense output layer with softmax activation to predict the actual word
model.add(Dense(len(actual_word_to_int), activation='softmax'))

# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model.fit(padded_sequences, tf.keras.utils.to_categorical(actual_word_list_int), epochs=10, batch_size=16)

# use the model to predict the actual word for a repeating word
# test_word = 'goooood'
# test_sequence = tokenizer.texts_to_sequences([test_word])
# test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
# predicted_index = model.predict_classes(test_padded_sequence)
# predicted_word = [k for k, v in actual_word_to_int.items() if v == predicted_index[0]][0]
# print(f'Predicted actual word for {test_word}: {predicted_word}')

test_word = 'goooood'
test_sequence = tokenizer.texts_to_sequences([test_word])
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
predicted_probabilities = model.predict(test_padded_sequence)
predicted_index = np.argmax(predicted_probabilities)
predicted_word = actual_word_list[predicted_index]
print(f'Predicted actual word for {test_word}: {predicted_word}')


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Predicted actual word for goooood: like


In [34]:
test_word = 'like'
test_sequence = tokenizer.texts_to_sequences([test_word])
test_padded_sequence = pad_sequences(test_sequence, maxlen=max_length, padding='post')
predicted_probabilities = model.predict(test_padded_sequence)
predicted_index = np.argmax(predicted_probabilities)
predicted_word = actual_word_list[predicted_index]
print(f'Predicted actual word for {test_word}: {predicted_word}')

Predicted actual word for like: like


In [29]:
df = pd.read_csv('repeated_letters_words_v9.csv')

In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# create a dictionary of words with repeating letters and their corresponding actual word
# repeating_words = result
# result = dict(zip(df['emphasized'], df['actual']))


# create a pandas dataframe with the repeating words and their actual words
# df = pd.DataFrame(list(repeating_words.items()), columns=['emphasized', 'actual'])
df = df.sample(frac = 1)

# define the pipeline with CountVectorizer, TfidfTransformer, and a classifier
pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')), #, ngram_range=(2,20)
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

# ngrams = [(2, i) for i in range(2, 21)]
# define the parameters to search for in GridSearchCV
parameters = {
    # 'vect__ngram_range': ngrams,
    'tfidf__use_idf': (True, True),
    'clf': [MultinomialNB(), DecisionTreeClassifier(), RandomForestClassifier(), SVC(), KNeighborsClassifier()],
}

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['emphasized'], df['actual'], test_size=0.1, random_state=42)

# run GridSearchCV to find the best model and parameters
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# print the best model and its parameters
print("Best Model:", best_model)
print("Best Parameters:", best_params)

# use the best model to predict the actual words for the test data
y_pred = best_model.predict(X_test)

# print the classification report, confusion matrix, and accuracy score
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Best Model: Pipeline(steps=[('vect', CountVectorizer(analyzer='char')),
                ('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier())])
Best Parameters: {'clf': RandomForestClassifier(), 'tfidf__use_idf': True}
Classification Report:
               precision    recall  f1-score   support

         all       1.00      1.00      1.00         7
     amazing       1.00      1.00      1.00        12
     awesome       1.00      1.00      1.00        11
         bad       1.00      1.00      1.00         4
   beautiful       1.00      1.00      1.00        13
      better       1.00      1.00      1.00        10
      boring       1.00      1.00      1.00         7
         but       1.00      1.00      1.00         2
        cool       1.00      1.00      1.00         6
        cute       1.00      1.00      1.00         2
   delicious       1.00      1.00      1.00        10
   excellent       1.00      1.00      1.00        12
        fuck       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [37]:

repeating_word = 'waiist'
# predict the actual word for the repeating word using the best model
best_model.predict([repeating_word])[0]

import pickle
with open('model_v9.pkl', 'wb') as file:
    pickle.dump(best_model, file)


with open('model_v9.pkl', 'rb') as file:
    model = pickle.load(file)

model.predict([repeating_word])[0]



'waste'

In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# define the pipeline for each individual classifier
nb_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB())
])

dt_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', DecisionTreeClassifier())
])

rf_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier())
])

svc_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC())
])

knn_pipeline = Pipeline([
    ('vect', CountVectorizer(analyzer='char')),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier())
])

# define the ensemble classifier that combines the predictions of the individual classifiers
ensemble = VotingClassifier(estimators=[
    ('nb', nb_pipeline),
    ('dt', dt_pipeline),
    ('rf', rf_pipeline),
    ('svc', svc_pipeline),
    ('knn', knn_pipeline)
], voting='hard')

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['emphasized'], df['actual'], test_size=0.1, random_state=42)

# train the ensemble classifier on the training data
ensemble.fit(X_train, y_train)

# use the ensemble classifier to predict the actual words for the test data
y_pred = ensemble.predict(X_test)

# print the classification report, confusion matrix, and accuracy score
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

         all       1.00      1.00      1.00         7
     amazing       1.00      1.00      1.00        12
     awesome       1.00      1.00      1.00        11
         bad       1.00      1.00      1.00         4
   beautiful       1.00      1.00      1.00        13
      better       1.00      1.00      1.00        10
      boring       1.00      1.00      1.00         7
         but       1.00      1.00      1.00         2
        cool       1.00      1.00      1.00         6
        cute       1.00      1.00      1.00         2
   delicious       1.00      1.00      1.00        10
   excellent       1.00      1.00      1.00        12
        fuck       1.00      1.00      1.00         3
         fun       1.00      1.00      1.00         6
        good       1.00      1.00      1.00         8
       great       1.00      1.00      1.00         8
       happy       1.00      1.00      1.00         9
   

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
ensemble.predict(["waay"])[0]

import pickle
with open('model_ensemble_v1.pkl', 'wb') as file:
    pickle.dump(ensemble, file)

with open('model_ensemble_v1.pkl', 'rb') as file:
    model = pickle.load(file)



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


'waste'

In [49]:

with open('model_v9.pkl', 'rb') as file:
    model_normal = pickle.load(file)


In [52]:
# model.predict(["perfect"])[0]
model_normal.predict(["f****k"])[0]

'fuck'

In [92]:
import pickle
with open('model_v2.pkl', 'wb') as file:
    pickle.dump(best_model, file)