In [1]:
import pandas as pd
# Load the labelled tweets and keep only the noun-only text and its label.
# To run the experiment on another text column (e.g. 'clean'), select it here
# and in the feature cells further down.
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
df = pd.read_pickle("preprocessed_labeled.pkl")[['clean_nouns', 'sentiment']]

# 2(News): the tweet links to factual news about climate change
# 1(Pro): the tweet supports the belief of man-made climate change
# 0(Neutral): the tweet neither supports nor refutes the belief of man-made climate change
# -1(Anti): the tweet does not believe in man-made climate change, 3990

In [2]:
# Collapse Pro (1) and News (2) into the Neutral label (0), so that only
# Anti (-1) stays distinct.
df['sentiment'] = df['sentiment'].replace({1: 0, 2: 0})

In [3]:
# Binarise the labels: the merged 0 class becomes 1 (POSITIVE) and
# Anti (-1) becomes 0 (NEGATIVE).
df['sentiment'] = df['sentiment'].replace({0: 1, -1: 0})

In [4]:
# Include all negatives: the NEGATIVE rows (sentiment == 0) whose index label
# lies in 40000..49999 are inserted a second time, between the first 40000
# rows and the remainder of the frame.
in_window = df.index.isin(range(40000, 50000))
negatives = df[in_window & (df['sentiment'] == 0)]
df = pd.concat([df[:40000], negatives, df[40000:]])

In [5]:
# Renumber the rows after the concat; the previous index is kept as an
# 'index' column, which the next cell drops.
df = df.reset_index()

In [6]:
# Discard the 'index' column that reset_index() added.
df = df.drop(columns=['index'])

In [7]:
# Add VADER columns (compound, negative and positive score of each text)
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Score every text exactly once and read all three values from that single
# result; the analyzer used to be re-run for each of the three columns.
vader_scores = [sia.polarity_scores(text) for text in df['clean_nouns']]
df['compound'] = [s['compound'] for s in vader_scores]
df['neg'] = [s['neg'] for s in vader_scores]
df['pos'] = [s['pos'] for s in vader_scores]

In [8]:
# Add polarity and subjectivity
from textblob import TextBlob

# Analyse every text exactly once; the resulting sentiment object carries both
# values, so there is no need to build a second TextBlob per text.
blob_sentiments = [TextBlob(text).sentiment for text in df['clean_nouns']]
df['polarity'] = [s.polarity for s in blob_sentiments]
df['subjectivity'] = [s.subjectivity for s in blob_sentiments]

In [9]:
# Number of NEGATIVE (sentiment == 0) rows
int((df['sentiment'] == 0).sum())  # 4438

4438

In [10]:
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Documents to embed
doc = df["clean_nouns"]
# Lower-case and tokenize every document
tokenized_doc = [word_tokenize(text.lower()) for text in doc]

# Add unlabeled documents (disabled)
# unlabeled = pd.read_pickle("preprocessed.pkl")
# doc = unlabeled["base"]
# # Tokenization of each unlabeled document
# for d in doc:
#     tokenized_doc.append(word_tokenize(d.lower()))

# Tag every token list with its position in the corpus
tagged_data = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(tokenized_doc)
]

## Train doc2vec model
d2v_model = Doc2Vec(
    tagged_data,
    vector_size=100,
    window=2,
    min_count=1,
    epochs=100,
)

In [14]:
# # Save the model
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# d2v_model.save("d2v_model.mod")

In [10]:
# Load the model
# Rebinds d2v_model, replacing the model trained in the Doc2Vec cell above with
# the one stored in "d2v_model.mod". The cell that would write that file is
# commented out, so the file has to exist already.
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
d2v_model = Doc2Vec.load("d2v_model.mod")

In [11]:
# Add Doc2Vec in df_features
import numpy as np
from nltk.tokenize import word_tokenize

# Infer one Doc2Vec vector per text and store it in the 'vectors' column.
# NOTE(review): the training cell lower-cases each document before tokenizing,
# whereas the text is tokenized as-is here - confirm that 'clean_nouns' is
# already lower-case.
df['vectors'] = df['clean_nouns'].map(
    lambda text: d2v_model.infer_vector(word_tokenize(text))
)

In [12]:
# Flatten every row into a single feature list: the text and label columns are
# skipped, scalar columns are appended in column order and the Doc2Vec vector
# is expanded into its individual components.
skipped_columns = {'clean', 'clean_nouns', 'stemmed', 'sentiment'}
feature_columns = [name for name in df.columns if name not in skipped_columns]

features = []
for _, sample in df.iterrows():
    flat = []
    for name in feature_columns:
        value = sample[name]
        if name == 'vectors':
            flat.extend(list(value))
        else:
            flat.append(value)
    features.append(flat)

In [13]:
# Create the pandas DataFrame
# One row per entry of `features`, one column per flattened feature value.
df_features = pd.DataFrame(features)

In [14]:
# Solve imbalanced data with SMOTE
from imblearn.over_sampling import SMOTE
from random import shuffle

X = features
y = df['sentiment']
print(len(X), len(y))
# Seed the oversampler so the synthetic minority samples (and every metric
# computed from them) are reproducible; 4 matches the seed used for the
# train/validation/test split.
# NOTE(review): oversampling runs on the full data set before it is split, so
# synthetic samples can land in the validation and test sets - consider
# resampling the training portion only.
oversample = SMOTE(random_state=4)
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

44391 44391
79906 79906


In [15]:
# Two-stage split: 60 % of the data for training, then the remaining 40 % is
# halved into a validation and a test set.
from sklearn.model_selection import train_test_split

X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.6, random_state=4
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=4
)

In [16]:
from random import shuffle

# Convert every split to a NumPy array before handing it to Keras
train_x, train_y = np.array(X_train), np.array(y_train)
test_x, test_y = np.array(X_test), np.array(y_test)
valid_x, valid_y = np.array(X_valid), np.array(y_valid)

In [21]:
# # Solve imbalanced data with SLOVE
# from imblearn.over_sampling import SMOTE
# X = features[0:40500]
# y = df[0:40500]['sentiment']
# print(len(X), len(y))
# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)
# print(len(X), len(y))

40500 40500
73002 73002


In [22]:
# from random import shuffle

# train_x = np.array(X)
# train_y = np.array(y)
# test_x = np.array(features[40500:])
# test_y = np.array(df['sentiment'][40500:])

In [27]:
# Sequential
from keras import models
from keras import layers

# Feed-forward binary classifier over 105-value feature vectors:
# Dense(50, relu) -> Dropout(0.2) -> Dense(50, relu) -> Dense(1, sigmoid)
model = models.Sequential([
    layers.Dense(50, activation="relu", input_shape=(105,)),  # input layer
    layers.Dropout(0.2),                                       # hidden layers
    layers.Dense(50, activation="relu"),
    layers.Dense(1, activation="sigmoid"),                     # output layer
])
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 50)                5300      
                                                                 
 dropout_4 (Dropout)         (None, 50)                0         
                                                                 
 dense_10 (Dense)            (None, 50)                2550      
                                                                 
 dense_11 (Dense)            (None, 1)                 51        
                                                                 
Total params: 7,901
Trainable params: 7,901
Non-trainable params: 0
_________________________________________________________________


In [21]:
# Fit the model
import tensorflow as tf
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
import keras

# Optional callbacks - they are NOT passed to fit() below; uncomment the
# `callbacks=` argument to enable them.
# EarlyStopping watches 'val_accuracy': the metric is compiled as "accuracy",
# so the validation value is logged under that key ('val_acc' is never
# reported and early stopping would never trigger on it).
callbacks = [
    ReduceLROnPlateau(monitor='val_loss', patience=3),
    EarlyStopping(monitor='val_accuracy', min_delta=1e-3, patience=3),
]
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-3),
    loss="binary_crossentropy",
    metrics=[
        "accuracy",
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(),
    ],
)
results = model.fit(
    train_x,
    train_y,
    epochs=10,
    batch_size=64,
    validation_data=(valid_x, valid_y),
    # callbacks=callbacks,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
# Evaluate the model CLEAN NOUNS
from sklearn import metrics
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

# Keras metrics on the test split, in order:
# loss, accuracy, precision, recall, auc_roc
scores = model.evaluate(test_x, test_y, verbose=0)
print(scores)

# Predicted probabilities, thresholded at 0.5 (>= 0.5 counts as class 1)
scores = model.predict(test_x)
y_test = test_y
y_pred = [1 if score >= 0.5 else 0 for score in scores]

print(metrics.classification_report(y_test, y_pred))
print(matthews_corrcoef(y_test, y_pred))
print(f1_score(y_test, y_pred, average='macro'))

[0.2732289731502533, 0.8917531967163086, 0.9038262963294983, 0.8764101266860962, 0.953108012676239]
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      8004
           1       0.90      0.88      0.89      7978

    accuracy                           0.89     15982
   macro avg       0.89      0.89      0.89     15982
weighted avg       0.89      0.89      0.89     15982

0.783854771348984
0.8917227753791328


In [33]:
# Save the model
# Writes the Keras classifier fitted above to "d2v_keras.h5" (path relative to
# the working directory).
model.save("d2v_keras.h5")

In [31]:
# Classify unlabeled data
# NOTE(review): read_pickle can execute arbitrary code - only load trusted files.
unlabaled = pd.read_pickle("preprocessed.pkl")

# Predict
def decode_sentiment(score):
    """Map a predicted probability to a class label.

    The 0.5 boundary belongs to the positive class, which matches the
    thresholds used elsewhere in this notebook (``score >= 0.5`` counts as 1
    during evaluation, ``score < 0.5`` selects the negatives).

    Args:
        score: Output of the classifier's sigmoid unit.

    Returns:
        1 (POSITIVE) when ``score >= 0.5``, otherwise 0 (NEGATIVE). A NaN
        score compares false and therefore yields 0 as well.
    """
    return 1 if score >= 0.5 else 0

def get_features(text):
    """Build the feature vector of a single text.

    The layout is: VADER compound, negative and positive scores, TextBlob
    polarity and subjectivity, followed by the components of the Doc2Vec
    vector inferred for the text.

    Args:
        text: The text to featurise.

    Returns:
        A flat list holding the five sentiment scores and then the Doc2Vec
        components.
    """
    # Run each analyser once and reuse its result for all derived values.
    vader = sia.polarity_scores(text)
    blob_sentiment = TextBlob(text).sentiment
    features = [
        vader['compound'],
        vader['neg'],
        vader['pos'],
        blob_sentiment.polarity,
        blob_sentiment.subjectivity,
    ]
    # NOTE(review): the Doc2Vec training cell lower-cases documents before
    # tokenizing; the text is tokenized as-is here - confirm inputs are
    # already lower-case.
    features.extend(d2v_model.infer_vector(word_tokenize(text)))
    return features

def predict(features):
    """Score a batch of feature vectors with the Keras classifier.

    Args:
        features: Sequence of feature vectors, one per sample.

    Returns:
        Whatever ``model.predict`` returns for the batch converted to a
        NumPy array.
    """
    return model.predict(np.array(features))

negatives = []
# Build one feature vector per unlabeled text, then score the whole batch.
# NOTE(review): this rebinds `features`, the name the labelled-data cells use
# for the training matrix.
features = [get_features(text) for text in unlabaled["clean_nouns"]]
scores = predict(features)



In [32]:
# Positions of the unlabeled texts whose score falls below the 0.5 threshold
negatives = [position for position, score in enumerate(scores) if score < 0.5]

In [33]:
len(negatives) # NOTE(review): this comment used to read "62069 negatives out of 313985", which does not match the output shown below (51751) - presumably left over from a different run; confirm

51751

In [34]:
# Persist the positions as the textual form of the Python list
serialized = str(negatives)
with open('doc2vec_keras_negatives.txt', 'w') as tfile:
    tfile.write(serialized)