In [27]:
from tensorflow.keras.models import load_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import os

In [2]:
election_data_file = os.listdir("./elec_data")
print(election_data_file)

['hashtag_donaldtrump.csv', 'hashtag_joebiden.csv']


In [4]:
Biden_filename = election_data_file[1]
Biden_path = os.path.join("./elec_data",Biden_filename)
print("open file:", Biden_path)
df_Biden = pd.read_csv(Biden_path, encoding ="ISO-8859-1", lineterminator='\n')
print("data size:", len(df_Biden))

open file: ./elec_data/hashtag_joebiden.csv
data size: 776886


In [5]:
# split a sample dataset, using smaller data to predict
drop_item=df_Biden.sample(frac=0.99,random_state=200) #random state is a seed value
Biden_sample=df_Biden.drop(drop_item.index)
print("sample biden data size:", len(Biden_sample))

sample biden data size: 7769


In [6]:
# store the sample biden data
Biden_sample.to_csv("./elec_data/hashtag_joebiden_sample.csv", index=False)

### clean the text in tweet to get pure text

In [3]:
import re
TEXT_CLEANING_RE = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
'''
def preprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        if token not in stop_words:
            if stem:
                tokens.append(stemmer.stem(token))
            else:
                tokens.append(token)
    return " ".join(tokens)
'''
def textprocess(text, stem=False):
    # Remove link,user and special characters
    text = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = []
    for token in text.split():
        tokens.append(token)
    return " ".join(tokens)

In [11]:
Biden_sample["refine_tweet"] = Biden_sample.tweet.apply(lambda x: textprocess(x))
print(Biden_sample.refine_tweet.head(10))

59     trumpisalaughingstock realdonaldtrump at his i...
278    regulatebigtech regulatesiliconvalley google t...
285                  censroship joebiden donaldtrump2020
352    lyinbidrn family involved in cash for influenc...
425          don t be a chump vote trump biden is a liar
428    not looking good for biden with hunter embroil...
446    get rid of the children biden bidencrimefamily...
491    no don t do it that s the problem 2 many peopl...
568               joebiden hunterbiden hunterbidenemails
618    biden bidenharris2020 bidenharris biden2020 am...
Name: refine_tweet, dtype: object


In [12]:
# store the cleaned tweet in csv
Biden_sample.to_csv("./elec_data/hashtag_joebiden_sample.csv", index=False)

In [4]:
# open the csv
Biden_sample = pd.read_csv("./elec_data/hashtag_joebiden_sample.csv", encoding ="ISO-8859-1", lineterminator='\n')
print("open file....")

open file....


In [34]:
def predict(text):
    
    # Tokenize text
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=300)
    # Predict
    score = model.predict([x_test])[0]
    # Decode sentiment
    #label = -1 if score < 0.5 else 1
    out_score = round(float(score),4)

    return out_score

In [8]:
model = load_model("model_epoc3.h5")

In [10]:
import pickle
# loading tokenizer
with open('tokenizer.pkl', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [21]:
print(predict("I love u so much"))

(1, 0.9252)


In [22]:
print(predict("trump biden let s be real we all know it won t matter in a few months because 2020 can only end in one way aliens"))

(-1, 0.4293)


In [23]:
predict("I'm going under and this time I fear there's no one to save me")

(-1, 0.3586)

## Try to evaluate model with total dataset
### but the dataset is large, it takes 40 min or much longer. temporarily I just not skip it.

In [24]:
ALL_df = pd.read_csv("twitt_all_wo_Nu.csv", encoding ="ISO-8859-1" ,
                 lineterminator='\n')

In [25]:
decode_map = {-1: "NEGATIVE", 1: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]

In [26]:
%%time
ALL_df.VALUE = ALL_df.VALUE.apply(lambda x: decode_sentiment(x))

CPU times: user 560 ms, sys: 12.3 ms, total: 572 ms
Wall time: 607 ms


In [28]:
with open('encoder.pkl', 'rb') as f:
    encoder = pickle.load(f)

In [29]:
ALL_y_test = encoder.transform(ALL_df.VALUE.tolist())
ALL_y_test = ALL_y_test.reshape(-1,1)
print("ALL y test shape:", ALL_y_test.shape)

ALL y test shape: (2050014, 1)


In [30]:
ALL_x_test = pad_sequences(tokenizer.texts_to_sequences(ALL_df.TEXT), maxlen=300)
print("ALL x test shape", ALL_x_test.shape)

ALL x test shape (2050014, 300)


In [32]:
%%time
# evaluate
score = model.evaluate(ALL_x_test, ALL_y_test, batch_size=1024)
print()
print("ACCURACY:",score[1])
print("LOSS:",score[0])

  69/2002 [>.............................] - ETA: 42:41 - loss: 0.5648 - accuracy: 0.7276

KeyboardInterrupt: 

## predict the refine tweet, store in new column "predict_score"
Note that: score >= 0.5  is positive, else negative

In [36]:
%%time
Biden_sample["predict_score"] = Biden_sample.refine_tweet.apply(lambda x: predict(x))

CPU times: user 13min 1s, sys: 2min 5s, total: 15min 6s
Wall time: 8min 30s


In [35]:
predict(Biden_sample.refine_tweet[5])

0.7937

In [37]:
# store result csv
Biden_sample.to_csv("./elec_data/hashtag_joebiden_sample.csv", index=False)