In [1]:
import pandas as pd
# Load the labelled tweets, keeping only the cleaned text and its label.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
#
# Raw 'sentiment' codes:
#    2 (News):    the tweet links to factual news about climate change
#    1 (Pro):     the tweet supports the belief of man-made climate change
#    0 (Neutral): the tweet neither supports nor refutes the belief of man-made climate change
#   -1 (Anti):    the tweet does not believe in man-made climate change
#                 (the original note records "3990" here - presumably the
#                 number of Anti tweets; confirm)
df = pd.read_pickle("preprocessed_labeled.pkl")[['clean', 'sentiment']]

In [2]:
# Collapse Pro (1) and News (2) into the Neutral code (0), so that only the
# codes 0 and -1 remain in the column.
df['sentiment'] = df['sentiment'].replace([1, 2], 0)

In [3]:
# Recode to the final binary target:
#    0 -> 1  (1 is POSITIVE)
#   -1 -> 0  (0 is NEGATIVE)
# NOTE: the column is overwritten in place, so running this cell a second
# time would turn the fresh 0s into 1s as well.
df['sentiment'] = df['sentiment'].replace({0: 1, -1: 0})

In [4]:
# Include all negatives
# Take every negative (label 0) row whose index label lies in [40000, 50000)
# and splice a copy of those rows in right after the first 40000 rows.
# NOTE(review): df_2 still holds the original rows, so each selected negative
# now appears twice. Later cells train on rows [0:40500] and validate on
# rows [40500:], which can put the copy in the training data and its original
# in the validation data - confirm this overlap is intended.
in_tail_window = df.index.isin(range(40000, 50000))
negatives = df[(df['sentiment'] == 0) & in_tail_window]
df_1 = df.iloc[:40000]
df_2 = df.iloc[40000:]
frames = [df_1, negatives, df_2]
df = pd.concat(frames)

In [5]:
# Renumber the rows after the concat; the previous labels are kept for now as
# a regular column (drop=False is the default, spelled out for clarity).
df = df.reset_index(drop=False)

In [7]:
# Discard the 'index' column so it cannot leak into the feature vectors that
# are later built from every column except 'clean' and 'sentiment'.
df = df.drop(columns=['index'])

In [8]:
# Add VADER columns: the compound, negative and positive scores of each text.
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Score every text once and reuse the returned dict for all three columns,
# instead of re-running the analyser three times per text.
compound, neg, pos = [], [], []
for text in df['clean']:
    vader_scores = sia.polarity_scores(text)
    compound.append(vader_scores['compound'])
    neg.append(vader_scores['neg'])
    pos.append(vader_scores['pos'])
df['compound'] = compound
df['neg'] = neg
df['pos'] = pos

In [9]:
# Add TextBlob polarity and subjectivity columns.
from textblob import TextBlob

# Build one TextBlob per text and read both values from the same sentiment
# result, instead of constructing and analysing the blob twice.
polarity, subjectivity = [], []
for text in df['clean']:
    blob_sentiment = TextBlob(text).sentiment
    polarity.append(blob_sentiment.polarity)
    subjectivity.append(blob_sentiment.subjectivity)
df['polarity'] = polarity
df['subjectivity'] = subjectivity

In [10]:
# Number of NEGATIVE (label 0) rows; the recorded run shows 4438.
int((df['sentiment'] == 0).sum()) # 4438

4438

In [10]:
# Doc2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

# Lower-cased token lists for every labelled document ...
doc = df["clean"]
tokenized_doc = [word_tokenize(text.lower()) for text in doc]

# ... followed by every unlabelled document, so the embedding is trained on
# both sets.
# NOTE(review): the labelled texts come from column "clean" but the unlabelled
# ones from column "base", while vectors are later inferred from "clean" -
# confirm this mix is intentional.
unlabeled = pd.read_pickle("preprocessed.pkl")
doc = unlabeled["base"]
tokenized_doc.extend(word_tokenize(text.lower()) for text in doc)

# Tag each document with its position in the combined list.
tagged_data = [TaggedDocument(words, [tag]) for tag, words in enumerate(tokenized_doc)]

## Train doc2vec model
d2v_model = Doc2Vec(
    tagged_data,
    vector_size=100,
    window=2,
    min_count=1,
    epochs=100,
)

In [14]:
# # Save the model
# from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# d2v_model.save("d2v_model.mod")

In [13]:
# Load the Doc2Vec model from disk when a saved copy exists; otherwise persist
# the model trained earlier in this session so that later runs can load it.
# The only save call visible in the notebook is commented out, so without
# this fallback a fresh run fails here whenever d2v_model.mod is missing.
import os
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

if os.path.exists("d2v_model.mod"):
    d2v_model = Doc2Vec.load("d2v_model.mod")
else:
    # Requires d2v_model to have been trained by the training cell above.
    d2v_model.save("d2v_model.mod")

In [16]:
# Add a Doc2Vec embedding for every labelled text
import numpy as np
from nltk.tokenize import word_tokenize

# One inferred vector per row, stored as an array in the 'vectors' column.
# NOTE(review): training lower-cased the text before tokenising, this call
# does not - harmless only if 'clean' is already lower-case; confirm. No seed
# is set here, so inferred vectors may differ between runs.
df['vectors'] = df['clean'].map(
    lambda text: d2v_model.infer_vector(word_tokenize(text))
)

In [17]:
# Flatten every feature column into one vector per row. All columns except
# the raw text and the label contribute one value each, in column order; the
# 'vectors' column is unpacked element by element.
feature_columns = [name for name in df.columns if name not in ('clean', 'sentiment')]
features = []
for values in zip(*(df[name] for name in feature_columns)):
    flat_row = []
    for name, value in zip(feature_columns, values):
        if name == 'vectors':
            flat_row.extend(value)
        else:
            flat_row.append(value)
    features.append(flat_row)

In [18]:
# Tabular view of the flattened feature vectors (one row per text).
df_features = pd.DataFrame(data=features)

In [19]:
# Solve imbalanced data with SMOTE
from imblearn.over_sampling import SMOTE

# The first 40500 rows form the training portion; only these are oversampled.
X = features[0:40500]
y = df[0:40500]['sentiment']
print(len(X), len(y))
# Fix the seed so the synthetic minority samples are the same on every run.
oversample = SMOTE(random_state=42)
X, y = oversample.fit_resample(X, y)
print(len(X), len(y))

40500 40500
73002 73002


In [20]:
from random import shuffle

# Training data: the oversampled rows. Held-out data: everything from row
# 40500 onwards, left untouched by the oversampling.
train_x, train_y = np.array(X), np.array(y)
test_x = np.array(features[40500:])
test_y = np.array(df['sentiment'].iloc[40500:])

In [21]:
# Display the training feature matrix as a quick sanity check.
train_x

array([[ 4.21500000e-01,  0.00000000e+00,  3.22000000e-01, ...,
         2.66967773e-01,  7.75308162e-02,  4.17755879e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -2.73391753e-02,  9.31520164e-02, -1.45533934e-01],
       [ 8.02000000e-01,  0.00000000e+00,  5.45000000e-01, ...,
         6.28240228e-01,  3.39290112e-01,  5.51127255e-01],
       ...,
       [ 0.00000000e+00,  1.82861234e-01,  1.82861234e-01, ...,
         1.08758602e-01,  1.91381307e-01,  5.92593369e-01],
       [-5.02577589e-03,  1.61710071e-01,  6.88201330e-02, ...,
         3.68425299e-02,  1.45305219e-01, -2.13320181e-01],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         8.01758926e-02,  4.43761046e-04,  6.12313937e-02]])

In [22]:
# Sequential
from keras import models
from keras import layers

# Feed-forward binary classifier: three 50-unit ReLU layers, dropout after the
# first two, and a single sigmoid output unit.
# The input width is read from the training matrix instead of being
# hard-coded (it was 105 in the recorded run), so the network keeps matching
# the feature vectors if the feature set changes.
n_features = train_x.shape[1]

model = models.Sequential()
# Input - Layer
model.add(layers.Dense(50, activation="relu", input_shape=(n_features,)))
# Hidden - Layers
model.add(layers.Dropout(0.3))
model.add(layers.Dense(50, activation="relu"))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(50, activation="relu"))
# Output - Layer
model.add(layers.Dense(1, activation="sigmoid"))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                5300      
                                                                 
 dropout (Dropout)           (None, 50)                0         
                                                                 
 dense_1 (Dense)             (None, 50)                2550      
                                                                 
 dropout_1 (Dropout)         (None, 50)                0         
                                                                 
 dense_2 (Dense)             (None, 50)                2550      
                                                                 
 dense_3 (Dense)             (None, 1)                 51        
                                                                 
Total params: 10,451
Trainable params: 10,451
Non-traina

In [23]:
# Fit the model
# NOTE(review): the held-out rows are used as validation data here and are
# scored again by the evaluation cell.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
results = model.fit(
    train_x,
    train_y,
    epochs=10,
    batch_size=32,
    validation_data=(test_x, test_y),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [24]:
# Evaluate the model on the held-out rows; the result is indexed as
# [loss, accuracy], matching the loss and metric passed to compile().
scores = model.evaluate(test_x, test_y, verbose=0)
loss, accuracy = scores[0], scores[1]
print("Accuracy: %.2f%%" % (accuracy * 100))
print("Loss:", loss)

Accuracy: 79.29%


In [44]:
# Save the trained Keras classifier to the file d2v_keras.h5
model.save("d2v_keras.h5")

In [45]:
# Classify unlabeled data
# Load the unlabeled tweets that will be scored below.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
unlabaled = pd.read_pickle("preprocessed.pkl")

# Predict
def decode_sentiment(score):
    """Map a classifier score to a class label.

    Returns 1 when ``score`` is above 0.5 and 0 when it is at or below 0.5.
    When neither comparison holds (for example for NaN) the result is None.
    """
    if score <= 0.5:
        return 0
    if score > 0.5:
        return 1
    return None

def get_features(text):
    """Build the flat feature vector for a single text.

    Layout: VADER compound, neg and pos scores, then TextBlob polarity and
    subjectivity, then every element of the Doc2Vec vector inferred for the
    text.

    Uses the notebook-level ``sia``, ``TextBlob``, ``d2v_model`` and
    ``word_tokenize`` objects.

    Returns:
        list: the feature values in the order described above.
    """
    # Run each analyser once and reuse its result rather than recomputing it
    # for every value that is read from it.
    vader_scores = sia.polarity_scores(text)
    blob_sentiment = TextBlob(text).sentiment
    row = [
        vader_scores['compound'],
        vader_scores['neg'],
        vader_scores['pos'],
        blob_sentiment.polarity,
        blob_sentiment.subjectivity,
    ]
    row.extend(d2v_model.infer_vector(word_tokenize(text)))
    return row

def predict(features):
    """Score a batch of feature vectors with the notebook-level ``model``.

    ``features`` is converted to a NumPy array before being handed to
    ``model.predict``; the model's output is returned unchanged.
    """
    batch = np.array(features)
    return model.predict(batch)

negatives = []
# Build one feature vector per unlabeled text (from its 'clean' column), then
# score the whole set in a single predict call.
features = [get_features(text) for text in unlabaled["clean"]]
scores = predict(features)



In [49]:
# Positions, in scoring order, of the texts whose score is below 0.5.
negatives = [position for position, score in enumerate(scores) if score < 0.5]

In [51]:
len(negatives) # number of texts scored below 0.5; recorded run: 62069 negatives out of 313985

62069

In [57]:
# Persist the negative positions as the text form of the Python list.
with open('doc2vec_keras_negatives.txt', mode='w') as out_file:
    out_file.write(repr(negatives))