In [1]:
# All of our imports
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional, Flatten, BatchNormalization
import tensorflow as tf
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# This function cleans comments by removing stopwords, lemmatizing words, removing links and emojis

def commentCleaner(comments):
    cleaned_comments = []
    for comment in comments:
        # Remove special symbols, emojis, reddit username mentions, and hyperlinks
        comment = re.sub(r"[^\w\s]|http\S+|www\S+|u/[A-Za-z0-9_-]+", "", comment)
        comment = comment.lower()
        # Tokenize the comment
        tokens = comment.split()
        # tokens = comment.split(' ')
        # Remove stop words
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]
        
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        
        # Join the tokens back into a single string
        cleaned_comment = " ".join(tokens)
        cleaned_comments.append(cleaned_comment)   
    return cleaned_comments

print(commentCleaner(["One of the other reviewers mentioned watching 1 oz episode"]))


['one reviewer mentioned watching 1 oz episode']


In [4]:
# Tokenizes and pads comments for feeding into our keras model

def tokenizeComments(comments, tokenizer):
    print("Comments recieved for tokenization: ")
    print(comments)
    print("Fitted tokenizer to sample texts")
    tokenized_comments = tokenizer.texts_to_sequences(comments)
    print("Converted to sequences")
    tokenized_comments = pad_sequences(tokenized_comments, 235)
    print("Padded succesfully")
    print(tokenized_comments)
    return tokenized_comments

In [69]:
# The model we are going to use

def classification_model():
    # Building our model
    model = keras.Sequential()
    model.add(Embedding(18364, 256, input_length = 235))
    model.add(SpatialDropout1D(0.5))
    
    model.add(Bidirectional(LSTM(units=128, dropout=0.6)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))

    model.add(Dense(2,activation='softmax'))


    model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return model

In [6]:
data1 = pd.read_csv('stock_data.csv')

def changeNegativetoZero(val):
    if val == -1:
        return 0
    return val

# changeNegativetoZero = {'-1' : '0', '1' : '1'}
data1['Sentiment'] = data1['Sentiment'].apply(changeNegativetoZero)
data1

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1
...,...,...
5786,Industry body CII said #discoms are likely to ...,0
5787,"#Gold prices slip below Rs 46,000 as #investor...",0
5788,Workers at Bajaj Auto have agreed to a 10% wag...,1
5789,"#Sharemarket LIVE: Sensex off day’s high, up 6...",1


In [7]:
data2 = pd.read_csv('sent_train.csv')
print(data2['label'].unique())
# We are going to drop all neutral rows
data2.rename(columns={'text': 'Text', 'label': 'Sentiment'}, inplace=True, errors='raise')
data2 = data2.drop(data2[data2['Sentiment'] == 2].index)
print(data2['Sentiment'].unique())
data2


[0 1 2]
[0 1]


Unnamed: 0,Text,Sentiment
0,$BYND - JPMorgan reels in expectations on Beyo...,0
1,$CCL $RCL - Nomura points to bookings weakness...,0
2,"$CX - Cemex cut at Credit Suisse, J.P. Morgan ...",0
3,$ESS: BTIG Research cuts to Neutral https://t....,0
4,$FNKO - Funko slides after Piper Jaffray PT cu...,0
...,...,...
9470,Why Slack Stock Just Surged 17%,1
9471,Why Tiffany Stock Just Popped 6%,1
9472,WiLAN signs wireless license with LG; shares +...,1
9473,"Yandex authorizes share repurchase program, sh...",1


In [8]:
data3 = pd.read_csv('sent_valid.csv')
print(data3['label'].unique())
# We are going to drop all neutral rows
data3.rename(columns={'text': 'Text', 'label': 'Sentiment'}, inplace=True, errors='raise')
data3 = data3.drop(data3[data3['Sentiment'] == 2].index)
print(data3['Sentiment'].unique())
data3

[0 1 2]
[0 1]


Unnamed: 0,Text,Sentiment
0,$ALLY - Ally Financial pulls outlook https://t...,0
1,"$DELL $HPE - Dell, HPE targets trimmed on comp...",0
2,$PRTY - Moody's turns negative on Party City h...,0
3,$SAN: Deutsche Bank cuts to Hold,0
4,$SITC: Compass Point cuts to Sell,0
...,...,...
2366,Tesla's stock ticks up after Deutsche Bank lif...,1
2367,Transocean up 4% on new $91M contract,1
2368,Trevena refiles U.S. application for IV olicer...,1
2369,Unisys's stock soars 16% premarket after sale ...,1


In [9]:
data4 = pd.read_csv('sentiment.csv')

# We are going to remove tweet url and stock ticker columns
mapping = {'Negative' : 0, 'Positive': 1}
data4['Sentiment'] = data4['Sentiment'].map(mapping)

data4 = data4.drop(columns=['Stock Ticker', 'Tweet URL'])
data4.rename(columns={'Tweet Text': 'Text'}, inplace=True, errors='raise')
data4

Unnamed: 0,Text,Sentiment
0,"Ruh-roh, *someone's* first ex wife is dancing ...",0
1,"Nice 9% pre market move for $para, pump my cal...",1
2,"I'm an investor in $LOW, but I have no problem...",1
3,Everyone knew you would back out of the deal w...,0
4,"SELL EVERYTHING, BUY $TSLA 🚀🌕",1
...,...,...
495,$NVDA beautifully bullish,1
496,"$IDRA +58% \r\n""According to topline data, SLN...",1
497,Replying to \r\n@NewsoftheMarket\r\ncheck out ...,1
498,I’ve noticed a clear trend the past year: FB n...,0


In [10]:
data5 = pd.read_csv('augmented_data.csv')
data5 = data5.drop(columns=['Unnamed: 0'])
data5 = data5.drop(data5[data5['Sentiment'] == 'neutral'].index)
# data5['Sentiment'].unique()
data5.rename(columns={'Sentence': 'Text', 'Sentiment': 'Sentiment'}, inplace=True, errors='raise')
data5

Unnamed: 0,Text,Sentiment
0,The GeoSolutions technology will leverage Bene...,positive
1,"$ESI on lows, down $1.50 to $2.50 BK a real po...",negative
2,"For the last quarter of 2010 , Componenta 's n...",positive
5,$SPY wouldn't be surprised to see a green close,positive
6,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
...,...,...
6617,"$SBUX down PM, from $DB downgrade.. PT shorten...",negative
6618,Finnish developer and manufacturer of mobile p...,negative
6619,Operating profit savage to EUR 38.1 mn from EU...,negative
6620,HSBC articulate Unit to Book $585 Million Char...,negative


In [11]:
print(data1.head())
print(data2.head())
print(data3.head())
print(data4.head())
print(data5.head())

                                                Text  Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2  user I'd be afraid to short AMZN - they are lo...          1
3                                  MNTA Over 12.00            1
4                                   OI  Over 21.37            1
                                                Text  Sentiment
0  $BYND - JPMorgan reels in expectations on Beyo...          0
1  $CCL $RCL - Nomura points to bookings weakness...          0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...          0
3  $ESS: BTIG Research cuts to Neutral https://t....          0
4  $FNKO - Funko slides after Piper Jaffray PT cu...          0
                                                Text  Sentiment
0  $ALLY - Ally Financial pulls outlook https://t...          0
1  $DELL $HPE - Dell, HPE targets trimmed on comp...          0
2  $PRTY - Moody's turns negative on Par

In [12]:
from textattack.augmentation import EasyDataAugmenter

import random

augmenter = EasyDataAugmenter()

def augment_text(sentence):
    augmented_sentences = augmenter.augment(sentence)
    if augmented_sentences:
        return random.choice(augmented_sentences)
    else:
        return sentence 


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package omw-1.4 to /Users/aadeesh/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [13]:
import random
def augmentDataFrame(df):
    augmented_data = []
    augmented_labels = []
    count = 0
    for sentence, sentiment in zip(df.Text, df.Sentiment):
        random_num = random.randint(1, 100)
        if (random_num >= 30 and sentiment == 0):
            augmented_sentence = augment_text(sentence)
            augmented_data.append(augmented_sentence)
            augmented_labels.append(sentiment)
            count += 1
        if (count > 2554):
            break
    
    new_df = {"Text" : augmented_data, "Sentiment" : augmented_labels}
    new_df = pd.DataFrame(new_df)

    df = pd.concat([df, new_df])
    # df = df.append(pd.DataFrame(new_df))
    return df

In [14]:
final_data = pd.concat([data1, data2, data3, data4, data5], ignore_index=True)
print(final_data.head())
print(final_data.info())
print(final_data['Sentiment'].unique())

def changeStringToNum(val):
    if val == 'negative':
        return 0
    elif val == 'positive':
        return 1
    return val

final_data['Sentiment'] = final_data['Sentiment'].apply(changeStringToNum)
print(final_data['Sentiment'].unique())
print(final_data.head())

final_data.to_csv('final_data.csv')

                                                Text Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...         1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...         1
2  user I'd be afraid to short AMZN - they are lo...         1
3                                  MNTA Over 12.00           1
4                                   OI  Over 21.37           1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13970 entries, 0 to 13969
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       13970 non-null  object
 1   Sentiment  13970 non-null  object
dtypes: object(2)
memory usage: 218.4+ KB
None
[1 0 'positive' 'negative']
[1 0]
                                                Text  Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2  user I'd be afraid to short AMZN - they are lo...          1
3                    

In [15]:

pos_count = final_data['Sentiment'].sum()
neg_count = (final_data['Sentiment'] == 0).sum()
print(pos_count)
print(neg_count)
print(neg_count / pos_count)

8262
5708
0.6908738804163641


In [16]:
checkpoint_path = "trial3/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
# Create a ModelCheckpoint callback to save the best model
checkpoint_callback = ModelCheckpoint(
    checkpoint_path,
    monitor='loss',
    save_weights_only=False,
    save_best_only=True,
    verbose=1
)

# Create an EarlyStopping callback to stop training if validation loss doesn't improve
early_stopping_callback = EarlyStopping(
    monitor='loss',
    patience=5,  # Number of epochs with no improvement after which training will stop
    verbose=1
)

In [17]:
class textTransformer(BaseEstimator, TransformerMixin):

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        print("Starting fitting")
        return self
    
    def transform(self, X, y=None):
        print("Starting transform")
        print(X)
        # tokenizerFinal = Tokenizer(num_words=1000, split=' ') 
        # print(cleaned_data['Sentence'].values)
        # tokenizerFinal.fit_on_texts(cleaned_data['Sentence'].values)
        X_cleaned = commentCleaner(X)
        print("Cleaned comments")
        print("Starting tokenization")
        X_tokenized = tokenizeComments(X_cleaned, self.tokenizer)
        print("Tokenized")
        print("Ending transform")

        return X_tokenized

In [18]:

string = ""
for i in final_data['Text'].values:
    string += i + ' '
print(len(set(string.split())))

36819


In [19]:
# Getting the vocabulary size, longest sentence, and average length
x = final_data['Text'].values
print(x)
x = commentCleaner(x)
print(x)
string2 = ""
maxlen = 0
totallen = 0
count = 0
for i in x:
    string += i
    string2 += i + ' '
    # print(string)
    if len(i) > maxlen:
        maxlen = len(i)
    totallen += len(i)
    count += 1
print(string2)
print(len(string), len(string2))
print(string == string2)
print(len(set(string2.split())))
print(maxlen)
print(totallen/count)


['Kickers on my watchlist XIDE TIT SOQ PNK CPW BPZ AJ  trade method 1 or method 2, see prev posts'
 'user: AAP MOVIE. 55% return for the FEA/GEED indicator just 15 trades for the year.  AWESOME.  '
 "user I'd be afraid to short AMZN - they are looking like a near-monopoly in eBooks and infrastructure-as-a-service"
 ... 'Operating profit savage to EUR 38.1 mn from EUR 55.3 mn in 2007 .'
 'HSBC articulate Unit to Book $585 Million Charge on Settlement'
 'RISING costs have forced packaging producer to axe 90 jobs at its Hampshire manufacturing plant .']
868875 882845
False
19253
235
62.1957766642806


In [88]:
class customModel(BaseEstimator, TransformerMixin):
    def __init__(self, batch_size):
        self.model_fn = classification_model()
        self.batch_size = batch_size
        self.model = self.model_fn
    
    def fit(self, X, y):
        
        with tf.device('/device:GPU:0'):
            self.model.fit(X, y, epochs = 7, batch_size=self.batch_size, callbacks = [checkpoint_callback, early_stopping_callback], verbose = 1)
        return self
    
    def predict(self, X):
        return self.model.predict(X)


In [21]:
print(final_data.shape)
print(final_data.head())
final_data = augmentDataFrame(final_data)
print(final_data.shape)

(13970, 2)
                                                Text  Sentiment
0  Kickers on my watchlist XIDE TIT SOQ PNK CPW B...          1
1  user: AAP MOVIE. 55% return for the FEA/GEED i...          1
2  user I'd be afraid to short AMZN - they are lo...          1
3                                  MNTA Over 12.00            1
4                                   OI  Over 21.37            1
(16525, 2)


In [22]:
pos_count = final_data['Sentiment'].sum()
neg_count = (final_data['Sentiment'] == 0).sum()
print(pos_count)
print(neg_count)
print(neg_count / pos_count)

8262
8263
1.0001210360687485


In [None]:
model = customModel(8)
X = final_data['Text']
y = pd.get_dummies(final_data['Sentiment'])

tokenizer = Tokenizer(num_words=18364, split = ' ')

tokenizer.fit_on_texts(x)

pipeline = Pipeline(steps=[('textTransform', textTransformer(tokenizer = tokenizer)), ('model', model)])

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state=20, stratify=y)

pos_count = y_train.sum()
print('Length: ', len(y_train))
print('count: ', pos_count)

model.model_fn.summary()

In [None]:
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Assuming you have the loaded model and testing data
# model = your_loaded_model
# X_test = testing_text_inputs
# y_test = true_sentiment_labels
# print(y_train)
# pipeline.named_steps['model'].model.load_weights('final1/weights-improvement-05-0.1616.hdf5')
y_pred = pipeline.predict(X_test)

# print(conf_matrix)

In [93]:
# Convert the predictions to binary values based on a threshold (e.g., 0.5)
print(y_pred)

y_pred_binary = (y_pred[:, 1] > 0.5).astype(int)
y_test_binary = np.argmax(y_test, axis=1)
print(y_pred_binary)
# Calculate evaluation metrics
accuracy = accuracy_score(y_test_binary, y_pred_binary)
precision = precision_score(y_test_binary, y_pred_binary)
recall = recall_score(y_test_binary, y_pred_binary)
f1 = f1_score(y_test_binary, y_pred_binary)
conf_matrix = confusion_matrix(y_test_binary, y_pred_binary)

# Print the evaluation metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

[[9.4601979e-05 9.9990535e-01]
 [5.1780853e-06 9.9999487e-01]
 [1.7897461e-03 9.9821031e-01]
 ...
 [7.2354879e-07 9.9999928e-01]
 [9.9999702e-01 3.0022361e-06]
 [9.6729982e-01 3.2700174e-02]]
[1 1 1 ... 1 0 0]
Accuracy: 0.8444937474788221
Precision: 0.8505747126436781
Recall: 0.835820895522388
F1 Score: 0.8431332655137335
Confusion Matrix:
[[2115  364]
 [ 407 2072]]


In [119]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8444937474788221
Precision: 0.8505747126436781
Recall: 0.835820895522388
F1 Score: 0.8431332655137335
Confusion Matrix:
[[2115  364]
 [ 407 2072]]


In [115]:
import pickle
import os

def save_pipeline_keras(model,folder_name="model"):
    os.makedirs(folder_name, exist_ok=True)
    # print(model.named_steps['transformText'])
    # model.named_steps['model'].model = None
    # dump(model, 'pipeline.pkl')
    pickle.dump(model.named_steps['textTransform'], open(folder_name+'/'+'textTransformer.pkl','wb'))
    pickle.dump(tokenizer, open(folder_name + '/' + 'tokenizer.pkl', 'wb'))
    model.named_steps['model'].model.save(folder_name+'/model.h5')
    # pickle.dump(model.named_steps['model'].model, open(folder_name + '/' + 'model.h5', 'wb'))

In [116]:
save_pipeline_keras(pipeline, 'finalPipeline')

In [3]:
import dill as pickle

def load_pipeline_keras(cleaner, model, tokenizer, folder_name="model"):
    cleaner = pickle.load(open(folder_name+'/'+cleaner,'rb'))
    tokenizerFinal = pickle.load(open(folder_name+'/'+tokenizer,'rb'))
    model = keras.models.load_model(folder_name+'/'+model)
    cleaner.tokenizer = tokenizerFinal
    # classifier = KerasClassifier(model=build_model, epochs=1, batch_size=10, verbose=1)
    # classifier.classes_ = pickle.load(open(folder_name+'/'+classes,'rb'))
    # classifier.model = build_model
    # build_model.compile(loss = 'categorical_crossentropy', optimizer='adam', metrics = ['accuracy'])
    return Pipeline([
        ('textTransformer', cleaner),
        ('model', model)
    ])

In [None]:
loaded_pipeline = load_pipeline_keras('textTransformer.pkl', 'model.h5', 'tokenizer.pkl', 'server/model/classifier')

loaded_pipeline.predict(['It was terrible'])