In [24]:
import pandas as pd

# Read the scraped Reddit CSV into a DataFrame; the cell's last expression
# displays how many rows were loaded.
comments = pd.read_csv("/Users/aadeesh/redditSentiment/server/Data/redditData/Posts/post.csv")
len(comments)

66457

In [25]:
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D, Bidirectional, Flatten, BatchNormalization
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import re
import numpy as np

def classification_model(vocab_size=18364, embedding_dim=256, max_len=235):
    """Build and compile the bidirectional-LSTM comment classifier.

    The defaults reproduce the architecture that used to be hard-coded, so
    calling ``classification_model()`` with no arguments is unchanged.

    Args:
        vocab_size: Number of rows in the embedding table (first argument of
            the ``Embedding`` layer).
        embedding_dim: Width of each embedding vector.
        max_len: Length of the padded input sequences fed to the model.

    Returns:
        A compiled ``keras.Sequential`` model ending in a 2-unit softmax.
    """
    model = keras.Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_len))
    model.add(SpatialDropout1D(0.5))

    model.add(Bidirectional(LSTM(units=128, dropout=0.6)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))

    # Two softmax outputs trained with binary cross-entropy.
    # NOTE(review): this presumably expects one-hot encoded labels of shape
    # (n, 2) - confirm against the training code.
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# File-name template for saved checkpoints; the placeholders carry the epoch
# number and the training loss.
checkpoint_path = "final1/weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"

# Save the full model (not only the weights) whenever the training loss
# reaches a new best value.
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=False,
)

# Stop training once the *training* loss has failed to improve for `patience`
# epochs in a row. No validation metric is monitored here.
early_stopping_callback = EarlyStopping(
    monitor='loss',
    verbose=1,
    patience=5,
)


class customModel(BaseEstimator, TransformerMixin):
    """scikit-learn style wrapper around the model from ``classification_model``.

    Args:
        batch_size: Mini-batch size forwarded to ``model.fit``.
        epochs: Number of training epochs. Defaults to 7, the value that was
            previously hard-coded inside ``fit``.
    """

    def __init__(self, batch_size, epochs=7):
        # ``model_fn`` holds a built model instance (not a factory); the
        # attribute is kept so existing code reading it keeps working.
        self.model_fn = classification_model()
        self.batch_size = batch_size
        self.epochs = epochs
        self.model = self.model_fn

    def fit(self, X, y):
        """Train the wrapped model on ``X``/``y`` and return ``self``."""
        # Training is pinned to the first GPU and uses the module-level
        # checkpoint / early-stopping callbacks.
        with tf.device('/device:GPU:0'):
            self.model.fit(
                X,
                y,
                epochs=self.epochs,
                batch_size=self.batch_size,
                callbacks=[checkpoint_callback, early_stopping_callback],
                verbose=1,
            )
        return self

    def predict(self, X):
        """Return the wrapped model's raw ``predict`` output for ``X``."""
        return self.model.predict(X)

def commentCleaner(comments):
    """Normalise raw comment strings before tokenisation.

    For every comment this strips non-word/non-space symbols, ``http...`` /
    ``www...`` links and ``u/name`` mentions, lower-cases the text, drops
    English stop words and lemmatises the remaining tokens.

    Args:
        comments: Iterable of comment strings.

    Returns:
        A list of cleaned strings, one per input comment, in the same order.
    """
    # Loop-invariant setup: loading the stop-word list and constructing the
    # lemmatizer once instead of once per comment. The output is unchanged.
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    # NOTE(review): the `u/...` alternative is not anchored to a word start,
    # so it also fires inside text such as "you/me". Left as-is so cleaning
    # stays identical to what the fitted tokenizer presumably saw.
    noise_pattern = re.compile(r"[^\w\s]|http\S+|www\S+|u/[A-Za-z0-9_-]+")

    cleaned_comments = []
    for comment in comments:
        tokens = noise_pattern.sub("", comment).lower().split()
        tokens = [
            lemmatizer.lemmatize(token)
            for token in tokens
            if token not in stop_words
        ]
        cleaned_comments.append(" ".join(tokens))
    return cleaned_comments


    
def tokenizeComments(comments, tokenizer, max_len=235):
    """Convert cleaned comments to fixed-length integer sequences.

    Args:
        comments: Iterable of (already cleaned) comment strings.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Length every sequence is padded/truncated to. Defaults to
            235, the value that was previously hard-coded.

    Returns:
        The result of ``pad_sequences`` applied to the tokenised comments.
    """
    sequences = tokenizer.texts_to_sequences(comments)
    return pad_sequences(sequences, maxlen=max_len)

class textTransformer(BaseEstimator, TransformerMixin):
    """Pipeline step that turns raw comments into padded token sequences."""

    def __init__(self, tokenizer):
        # Tokenizer used by `transform`; stored as given.
        self.tokenizer = tokenizer

    def fit(self, X, y=None):
        """Nothing is learned by this step; returns ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Clean ``X`` with ``commentCleaner``, then encode it with ``tokenizeComments``."""
        return tokenizeComments(commentCleaner(X), self.tokenizer)
    


In [26]:
import dill as pickle

def load_pipeline_keras(cleaner, model, tokenizer, folder_name="model"):
    """Rebuild the inference pipeline from its saved pieces.

    Args:
        cleaner: Path to the pickled text-transformer step.
        model: Path to the saved Keras model.
        tokenizer: Path to the pickled tokenizer.
        folder_name: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        A ``Pipeline`` with the steps ``'textTransformer'`` (the unpickled
        transformer, with its ``tokenizer`` attribute set to the unpickled
        tokenizer) and ``'model'`` (the loaded Keras model).
    """
    # SECURITY: unpickling can execute arbitrary code. Only load files that
    # come from a trusted source.
    # `with` guarantees the file handles are closed, which the previous bare
    # `open(...)` calls did not.
    with open(cleaner, 'rb') as cleaner_file:
        text_transformer = pickle.load(cleaner_file)
    with open(tokenizer, 'rb') as tokenizer_file:
        fitted_tokenizer = pickle.load(tokenizer_file)

    keras_model = keras.models.load_model(model)
    text_transformer.tokenizer = fitted_tokenizer

    return Pipeline([
        ('textTransformer', text_transformer),
        ('model', keras_model)
    ])


def init_model(model_dir='/Users/aadeesh/redditSentiment/server/model/classifier'):
    """Load the saved classifier pipeline from ``model_dir``.

    Args:
        model_dir: Directory containing ``textTransformer.pkl``, ``model.h5``
            and ``tokenizer.pkl``. The default is the location that was
            previously hard-coded, so ``init_model()`` behaves as before.

    Returns:
        Whatever ``load_pipeline_keras`` returns for those three files.
    """
    classifier = load_pipeline_keras(
        f'{model_dir}/textTransformer.pkl',
        f'{model_dir}/model.h5',
        f'{model_dir}/tokenizer.pkl',
        model_dir,
    )
    return classifier

# Build the inference pipeline once here; later cells pass `classifier` on.
classifier = init_model()

In [27]:
def dataframeProcessor(df, classifier):
    """Keep comments that mention a tracked company, tag them and score sentiment.

    Args:
        df: DataFrame with at least a ``Comment`` (text) and a ``Date`` column.
        classifier: Object whose ``predict`` accepts the filtered ``Comment``
            Series and returns one row of class scores per comment.

    Returns:
        A new DataFrame containing only the rows whose comment mentions at
        least one keyword (case-insensitive substring match), sorted by
        ascending ``Date``, where:

        * ``Date`` is reduced to a calendar date,
        * ``Keyword`` lists the matched companies, each followed by a space
          (e.g. ``"Tesla Amazon "``), or ``"None"`` if nothing matched,
        * ``Sentiment`` is the index of the highest class score per row.
    """
    keywords = {"Tesla" : ["$tsla", "tsla", "tesla", "elon musk", "musk"],
            "Apple" : ["$aapl", "aapl", "apple", "mac", "iphone", "airpods", "macbook"], 
            "Nvidia" : ["$nvda", "nvda", "nvidia", "rtx", "geforce", "jensen", "huang"], 
            "Google" : ["$googl", "googl", "google", "alphabet", "bard", "android", "pixel", "sundar pichai", "sundar", "pichai"],
            "Amazon" : ["$amzn", "amzn", "amazon", "aws", "prime", "alexa", "fire tv", "amazon prime"],
            "Microsoft" : ["$msft", "msft", "microsoft", "windows", "azure", "xbox"],
            "Meta" : ["$meta", "meta", "instagram", "facebook", "threads"]
        }
    # Derive the flat keyword list from the mapping instead of maintaining a
    # second hand-written list. The old list was missing commas, so adjacent
    # literals were silently fused (e.g. "xbox" "$meta" -> "xbox$meta") and
    # keywords such as "huang", "pichai" and "xbox" never matched on their own.
    all_keywords = [term for terms in keywords.values() for term in terms]

    # Escape each keyword so characters like "$" are matched literally rather
    # than as regex anchors; na=False makes missing comments count as non-matches.
    # NOTE(review): matching is by substring, so short keywords such as "mac"
    # or "aws" also hit inside longer words - unchanged from before.
    pattern = '|'.join(re.escape(term) for term in all_keywords)
    mentions_company = df['Comment'].str.contains(pattern, case=False, na=False)
    filtered_df = df[mentions_company]

    def keyWordBuilder(comment):
        """Return the matched company names, each followed by a space."""
        lowered = comment.lower()
        matched = [
            company
            for company, terms in keywords.items()
            if any(term in lowered for term in terms)
        ]
        if not matched:
            return "None"
        # Trailing space kept on purpose: it is the existing column format.
        return ''.join(company + ' ' for company in matched)

    filtered_df = filtered_df.assign(Keyword=filtered_df['Comment'].apply(keyWordBuilder))

    # Drop the time-of-day component, then order chronologically.
    filtered_df = filtered_df.assign(Date=pd.to_datetime(filtered_df['Date']).dt.date)
    filtered_df = filtered_df.sort_values(by='Date', ascending=True)

    if len(filtered_df) == 0:
        # Nothing to score; skip the model call and return an empty result.
        sentiments = np.empty(0, dtype=np.int64)
    else:
        preds = classifier.predict(filtered_df.Comment)
        sentiments = np.argmax(preds, axis=1)

    filtered_df = filtered_df.assign(Sentiment=sentiments)

    return filtered_df
    
    
    

In [28]:
def jsonBuilder(filtered_df):
    """Count non-zero-sentiment mentions per company per day.

    Args:
        filtered_df: DataFrame with the columns ``Date`` (objects supporting
            ``strftime``), ``Keyword`` (whitespace-separated company names)
            and ``Sentiment``.

    Returns:
        A list of seven dicts in the fixed order Tesla, Apple, Nvidia, Google,
        Amazon, Microsoft, Meta. Each dict maps an ``'MM-DD'`` string to the
        number of rows on that day that mention the company and whose
        ``Sentiment`` is not 0. Every date seen in the frame is present in
        every dict (with 0 if there were no such rows), in order of first
        appearance. The year is not part of the key.
    """
    company_order = ("Tesla", "Apple", "Nvidia", "Google", "Amazon", "Microsoft", "Meta")
    counts = {company: {} for company in company_order}

    rows = zip(filtered_df.Date, filtered_df.Keyword, filtered_df.Sentiment)
    for date, keyword_field, sentiment in rows:
        date_string = date.strftime('%m-%d')

        # Make sure the day exists for every company. The dict lookup replaces
        # the old ever-growing list that was scanned linearly on each row.
        for per_day in counts.values():
            per_day.setdefault(date_string, 0)

        # Only rows with a non-zero sentiment label contribute to the count.
        val = 0 if sentiment == 0 else 1
        for keyword in keyword_field.split():
            if keyword in counts:
                counts[keyword][date_string] += val

    return [counts[company] for company in company_order]

# l = jsonBuilder(filtered_df=filtered_df)
# for i in l:
#     print(i)


In [29]:
# Run the keyword filter and classifier over the loaded comments, then
# preview the first rows of the result.
processed_df = dataframeProcessor(comments, classifier=classifier)
processed_df.head()



Unnamed: 0,Post ID,Title,Date,Comment,Length,Keyword,Sentiment
19923,14hw652,LOST 11k in four days and blew my account.,2023-06-24,All ya had to do was put it in Tesla and Amazo...,62,Tesla Amazon,1
40781,14hs93o,Jensen Huang finally sold some NVDA stock.,2023-06-24,Even if semis and the market continue rallying...,186,Nvidia,1
40782,14hs93o,Jensen Huang finally sold some NVDA stock.,2023-06-24,Better link to show insider trading\n\n[http:/...,521,Nvidia,1
40785,14hs93o,Jensen Huang finally sold some NVDA stock.,2023-06-24,We will see how much MM would push NVDA higher.,47,Nvidia,1
37800,14hnddl,That was fast. Indian PM gets wish granted in ...,2023-06-24,Not necessary. Amazon is basically Indian. M...,182,Amazon,1


In [30]:
# Aggregate the per-company daily counts from the scored comments.
jsonList = jsonBuilder(processed_df)

# Persist the scored comments, then print one dict per company.
processed_df.to_csv('/Users/aadeesh/redditSentiment/server/Data/redditData/Posts/processed_df.csv')
for company_counts in jsonList:
    print(company_counts)


{'06-24': 9, '06-25': 18, '06-26': 37, '06-27': 32, '06-28': 61, '06-29': 23, '06-30': 41, '07-01': 26, '07-02': 75, '07-03': 134, '07-04': 35, '07-05': 56, '07-06': 82, '07-07': 43, '07-08': 8, '07-09': 36, '07-10': 79, '07-11': 47, '07-12': 34, '07-13': 32, '07-14': 98, '07-15': 39, '07-16': 80, '07-17': 133, '07-18': 70, '07-19': 209, '07-20': 161, '07-21': 90, '07-22': 14, '07-23': 27, '07-24': 0}
{'06-24': 0, '06-25': 9, '06-26': 18, '06-27': 18, '06-28': 34, '06-29': 32, '06-30': 110, '07-01': 33, '07-02': 27, '07-03': 40, '07-04': 21, '07-05': 27, '07-06': 26, '07-07': 12, '07-08': 10, '07-09': 7, '07-10': 24, '07-11': 6, '07-12': 10, '07-13': 20, '07-14': 18, '07-15': 12, '07-16': 11, '07-17': 19, '07-18': 8, '07-19': 49, '07-20': 12, '07-21': 12, '07-22': 10, '07-23': 23, '07-24': 4}
{'06-24': 8, '06-25': 10, '06-26': 36, '06-27': 75, '06-28': 66, '06-29': 16, '06-30': 27, '07-01': 23, '07-02': 40, '07-03': 17, '07-04': 26, '07-05': 17, '07-06': 6, '07-07': 19, '07-08': 4, '07