## Imports

In [None]:
# imports
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Dropout
from tensorflow.keras.layers import LSTM
import numpy as np
import pickle
import keras
from tensorflow.keras.constraints import unit_norm
from tensorflow.keras.callbacks import EarlyStopping
'''
from helper_functions import load_data
from helper_functions import my_split
from helper_functions import upsample_minority
from helper_functions import downsample_majority
from helper_functions import model_prep
from helper_functions import get_results
from helper_functions import get_f1
from helper_functions import clean_text
'''

'\nfrom helper_functions import load_data\nfrom helper_functions import my_split\nfrom helper_functions import upsample_minority\nfrom helper_functions import downsample_majority\nfrom helper_functions import model_prep\nfrom helper_functions import get_results\nfrom helper_functions import get_f1\nfrom helper_functions import clean_text\n'

In [None]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 18.6MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.7MB/s eta 0:00:01[K     |████████████▏                   | 30kB 2.2MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 2.0MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 2.2MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 2.4MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.1MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [None]:
# imports
import os
import pandas as pd
from sklearn.utils import resample
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
import keras.backend as K
import re


def load_data():
    """Read ``data/large_data.csv`` located under the current working directory.

    Returns:
        The DataFrame produced by ``pd.read_csv`` (no column is used as index).
    """
    csv_location = os.path.join(os.getcwd(), 'data', 'large_data.csv')
    frame = pd.read_csv(csv_location, index_col=None)
    return frame

def upsample_minority(df):
    """Balance ``final_status`` by sampling the rarer class with replacement.

    The two labels are read from ``value_counts()``, which orders labels from
    most to least frequent, so position 0 is the majority class and position 1
    the minority class.

    Args:
        df: DataFrame with a ``final_status`` column.

    Returns:
        All majority rows followed by minority rows resampled (with
        replacement, ``random_state=42``) up to the majority class size.
    """
    labels_by_frequency = df['final_status'].value_counts().index
    majority_label, minority_label = labels_by_frequency[0], labels_by_frequency[1]

    majority_rows = df[df['final_status'] == majority_label]
    minority_rows = df[df['final_status'] == minority_label]

    # Draw as many minority rows as there are majority rows.
    upsampled_minority = resample(
        minority_rows,
        replace=True,
        n_samples=len(majority_rows),
        random_state=42,
    )
    return pd.concat([majority_rows, upsampled_minority])

def downsample_majority(df):
    """Balance ``final_status`` by sampling the larger class without replacement.

    The two labels are read from ``value_counts()``, which orders labels from
    most to least frequent, so position 0 is the majority class and position 1
    the minority class.

    Args:
        df: DataFrame with a ``final_status`` column.

    Returns:
        All minority rows followed by a subset of the majority rows (drawn
        without replacement, ``random_state=42``) of the minority class size.
    """
    labels_by_frequency = df['final_status'].value_counts().index
    majority_label, minority_label = labels_by_frequency[0], labels_by_frequency[1]

    majority_rows = df[df['final_status'] == majority_label]
    minority_rows = df[df['final_status'] == minority_label]

    # Keep only as many majority rows as there are minority rows.
    downsampled_majority = resample(
        majority_rows,
        replace=False,
        n_samples=len(minority_rows),
        random_state=42,
    )
    return pd.concat([minority_rows, downsampled_majority])

def my_split(df, year):
    """Split ``df`` chronologically on its ``launch_year`` column.

    Args:
        df: DataFrame containing a ``launch_year`` column.
        year: Cut-off year. Rows with ``launch_year < year`` form the training
            set and rows with ``launch_year == year`` form the test set; rows
            with a later launch year end up in neither.

    Returns:
        ``(train, test)`` as independent DataFrames.
    """
    launch_year = df['launch_year']
    # Return explicit copies: callers assign new column values on the results,
    # and doing that on a frame derived by boolean indexing triggers pandas'
    # SettingWithCopyWarning.
    train = df[launch_year < year].copy()
    test = df[launch_year == year].copy()
    return train, test

def model_prep(train, test, features, target, onehot=True, scale=True):
    """Build model inputs and targets from a train/test pair of DataFrames.

    Args:
        train: DataFrame used to fit the optional encoder and scaler.
        test: DataFrame transformed with the objects fitted on ``train``.
        features: Column label(s) selected as model input.
        target: Column label selected as the prediction target.
        onehot: When true, one-hot encode the features (fit on train only).
        scale: When true, standardize the features (fit on train only).

    Returns:
        ``(X_train, y_train, X_test, y_test)``.
    """
    encoder = ce.one_hot.OneHotEncoder(use_cat_names=True)
    scaler = StandardScaler()

    X_train, X_test = train[features], test[features]
    y_train, y_test = train[target], test[target]

    # Each transformer is fitted on the training features and then only
    # applied to the test features.
    if onehot:
        X_train = encoder.fit_transform(X_train)
        X_test = encoder.transform(X_test)
    if scale:
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

    return X_train, y_train, X_test, y_test

def get_results(y_true, y_pred, y_score=None):
    """Print accuracy, ROC AUC and F1 for a set of binary predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Hard (already thresholded) predicted labels; used for accuracy
            and F1.
        y_score: Optional continuous scores such as predicted probabilities.
            When given they are used for ROC AUC, which is a ranking metric;
            when omitted the function falls back to ``y_pred`` exactly as
            before, in which case the ROC curve has a single operating point.

    Returns:
        None. The metrics are printed.
    """
    accuracy_metric = accuracy_score(y_true, y_pred)
    roc_auc_metric = roc_auc_score(y_true, y_pred if y_score is None else y_score)
    f1_metric = f1_score(y_true, y_pred)

    print('-------------------------------')
    print(f'Accuracy Score: {accuracy_metric}')
    print(f'ROC AUC Score: {roc_auc_metric}')
    print(f'F1 Score: {f1_metric}')

# code credit to https://medium.com/@aakashgoel12/how-to-add-user-defined-function-get-f1-score-in-keras-metrics-3013f979ce0d
def get_f1(y_true, y_pred): #taken from old keras source code
    """F1 score expressed with Keras backend ops so it can be used as a metric.

    Values are clipped to [0, 1] and rounded before being summed, so the
    predictions are effectively thresholded at 0.5. ``K.epsilon()`` is added to
    every denominator to avoid division by zero.

    Args:
        y_true: Tensor of ground-truth labels.
        y_pred: Tensor of predicted scores.

    Returns:
        Tensor holding ``2 * precision * recall / (precision + recall)``.
    """
    def _rounded_count(tensor):
        # Number of entries that round to 1 after clipping into [0, 1].
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    eps = K.epsilon()
    hits = _rounded_count(y_true * y_pred)
    actual_positives = _rounded_count(y_true)
    flagged_positives = _rounded_count(y_pred)

    precision = hits / (flagged_positives + eps)
    recall = hits / (actual_positives + eps)
    return 2*(precision*recall)/(precision+recall+eps)

def clean_text(text):
    """Tokenize ``text`` into lowercase alphanumeric words.

    Every character other than ASCII letters, digits and spaces is deleted
    (not replaced by a space), then the result is lowercased and split on
    whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        List of lowercase tokens; an empty list for an empty string.
    """
    stripped = re.sub('[^a-zA-Z 0-9]', '', text)
    return stripped.lower().split()


  import pandas.util.testing as tm


In [None]:
# loading the data
# The string literal below is a disabled alternative that would load the file
# through load_data(); it is a bare expression and is never executed as code.
'''
df = load_data()
df.head()
'''
# Active path: read the CSV from the Colab runtime's /content directory.
df = pd.read_csv('/content/large_data.csv')

In [None]:
# setting variables
# mini-batch size passed to model.fit
batch_size = 32
# vocabulary cap: used as Tokenizer(num_words=...) and as the Embedding input size
max_features = 10000
# column that holds the text fed to the model
features = 'name'
# column that holds the label to predict
target = 'final_status'
# length every token sequence is padded (or truncated) to by pad_sequences
maxlen= 10
# NOTE(review): this integer is later passed as Tokenizer(oov_token=...). Keras
# documents oov_token as a token that is added to word_index, not as an index,
# so out-of-vocabulary words will presumably not map to 10001 — confirm intent.
oov_token = max_features+1

In [None]:
# echo the value to check that it evaluates to max_features + 1
oov_token

10001

In [None]:
# cleaning the data: missing values become '' and every entry is then turned
# into a list of lowercase tokens by clean_text
df[features] = df[features].fillna('').apply(clean_text)

In [None]:
# train/test split: rows launched before `year` train the model, rows launched
# in `year` are held out
year = 2020
train, test = my_split(df, year)
# Take explicit copies before overwriting a column below; assigning into the
# frames that come straight out of the boolean-mask split is what raises
# pandas' SettingWithCopyWarning.
train, test = train.copy(), test.copy()

# transforming words to integer values (the vocabulary is learned from the
# training rows only and then applied to both splits)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train[features])

train[features] = tokenizer.texts_to_sequences(train[features])
test[features] = tokenizer.texts_to_sequences(test[features])

# processing data: only the feature/target selection is needed here, so
# one-hot encoding and scaling are switched off
X_train, y_train, X_test, y_test = model_prep(train, test, features, target, onehot=False, scale=False)

# padding sequences to all be the same length (`maxlen` tokens, zeros appended)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

# instantiating the model: embedding -> LSTM -> single sigmoid output
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# compiling the model (get_f1 is the custom backend-based F1 metric)
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy', get_f1])

# fitting the model
# NOTE(review): the held-out test split doubles as validation data here.
history = model.fit(X_train, y_train,
          batch_size=batch_size, 
          epochs=5, 
          validation_data=(X_test,y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Turn the sigmoid outputs for the test set into hard 0/1 labels by
# thresholding at 0.5, then print accuracy / ROC AUC / F1.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

-------------------------------
Accuracy Score: 0.6829078992756122
ROC AUC Score: 0.7036824448399581
F1 Score: 0.7415840888326658


In [None]:

# Persist the fitted tokenizer (its word-to-index mapping) to tokenizer.pickle.
# Pickle files can execute code when loaded, so only unpickle this file from a
# trusted source.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Save whichever model is currently bound to `model` to an .h5 file; in
# top-to-bottom execution that is the embedding -> LSTM -> sigmoid model.
model.save('basic_model.h5')

In [None]:
# train/test split: rows launched before `year` train the model, rows launched
# in `year` are held out
year = 2020
train, test = my_split(df, year)
# Take explicit copies before overwriting a column below; assigning into the
# frames that come straight out of the boolean-mask split is what raises
# pandas' SettingWithCopyWarning.
train, test = train.copy(), test.copy()

# transforming words to integer values (the vocabulary is learned from the
# training rows only and then applied to both splits)
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train[features])

train[features] = tokenizer.texts_to_sequences(train[features])
test[features] = tokenizer.texts_to_sequences(test[features])

# processing data: only the feature/target selection is needed here, so
# one-hot encoding and scaling are switched off
X_train, y_train, X_test, y_test = model_prep(train, test, features, target, onehot=False, scale=False)

# padding sequences to all be the same length (`maxlen` tokens, zeros appended)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

# instantiating the model: same embedding + LSTM as before, with an extra
# 10-unit ReLU layer and light dropout ahead of the sigmoid output
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(10, activation='relu'))
model.add(Dropout(.1))
model.add(Dense(1, activation='sigmoid'))

# compiling the model
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

# fitting the model
# NOTE(review): the held-out test split doubles as validation data here.
history = model.fit(X_train, y_train,
          batch_size=batch_size, 
          epochs=5, 
          validation_data=(X_test,y_test))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
# Save whichever model is currently bound to `model` to an .h5 file; in
# top-to-bottom execution that is the variant with the extra Dense(10) + Dropout.
model.save('basic_name_model.h5')

In [None]:
# Turn the sigmoid outputs for the test set into hard 0/1 labels by
# thresholding at 0.5, then print accuracy / ROC AUC / F1.
y_true = y_test
y_pred = (model.predict(X_test) > 0.5).astype("int32")

get_results(y_true, y_pred)

-------------------------------
Accuracy Score: 0.6866160745084512
ROC AUC Score: 0.7047922140294375
F1 Score: 0.746051712089448


In [None]:
# it appears that model was slightly better so training it with more epochs
# train/test split: rows launched before `year` train the model, rows launched
# in `year` are held out
year = 2020
train, test = my_split(df, year)
# Take explicit copies before overwriting a column below; assigning into the
# frames that come straight out of the boolean-mask split is what raises
# pandas' SettingWithCopyWarning.
train, test = train.copy(), test.copy()

# transforming words to integer values (the vocabulary is learned from the
# training rows only and then applied to both splits)
# NOTE(review): oov_token is an integer here; Keras documents it as a token
# added to word_index rather than an index — confirm this is intended.
tokenizer = Tokenizer(num_words=max_features, oov_token=oov_token)
tokenizer.fit_on_texts(train[features])

train[features] = tokenizer.texts_to_sequences(train[features])
test[features] = tokenizer.texts_to_sequences(test[features])

# processing data: only the feature/target selection is needed here, so
# one-hot encoding and scaling are switched off
X_train, y_train, X_test, y_test = model_prep(train, test, features, target, onehot=False, scale=False)

# padding sequences to all be the same length (`maxlen` tokens, zeros appended)
X_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post')

# instantiating the model: embedding -> LSTM with unit-norm constraints on the
# input and recurrent kernels -> single sigmoid output
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, 
               dropout=0.2, recurrent_dropout=0.3,
               kernel_constraint=unit_norm(), recurrent_constraint=unit_norm()))
model.add(Dense(1, activation='sigmoid'))

# compiling the model
model.compile(loss='binary_crossentropy',
              optimizer='adam', 
              metrics=['accuracy'])

# Stop once validation loss has not improved for 5 consecutive epochs.
# NOTE(review): validation data is the held-out test split, so early stopping
# is selecting on the same rows that are later used for evaluation.
es = EarlyStopping(monitor='val_loss', mode='min', patience=5)
# fitting the model
history = model.fit(X_train, y_train,
          batch_size=batch_size, 
          epochs=25, 
          validation_data=(X_test,y_test),
          callbacks=[es])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
1928/9896 [====>.........................] - ETA: 6:58 - loss: 0.4714 - accuracy: 0.7588

In [None]:
# Save whichever model is currently bound to `model` to an .h5 file.
# NOTE(review): the recorded training output above stops part-way through
# epoch 9 of 25, so these weights may come from an interrupted run — confirm.
model.save('name_model.h5')

In [None]:
# Colab-only: google.colab is available inside a Colab runtime, so this cell
# will fail elsewhere. Later cells reuse the `files` name imported here.
from google.colab import files

# Hand the saved model file to the browser as a download.
files.download('name_model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Persist the tokenizer currently bound to `tokenizer` (its word-to-index
# mapping) to new_tokenizer.pickle. Pickle files can execute code when loaded,
# so only unpickle this file from a trusted source.
with open('new_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Relies on `files` from the earlier `from google.colab import files` cell.
files.download('new_tokenizer.pickle')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>