In [6]:
import pandas as pd

# Load data
# Read the raw CSV, discard every row that has a missing value in ANY column
# (this happens before the column pruning, so gaps in soon-to-be-dropped
# columns also remove rows), then prune the columns this notebook does not use.
CSV_PATH = "/Users/sumanur/Desktop/ml/octeight.csv"
UNUSED_COLUMNS = [
    'SentimentTitle', 'SentimentHeadline', 'IDLink',
    'LinkedIn',
    'Facebook',
    'Topics_lda',
    'Topics_lda_title',
]

news_final = (
    pd.read_csv(CSV_PATH)
    .dropna()
    .drop(columns=UNUSED_COLUMNS)
)

In [7]:
# Sentiment Analysis
# Score every headline and title with NLTK's VADER analyzer and keep only the
# 'compound' entry of each score dict as a new feature column.
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
headlines = list(news_final['Headline'])
titles = list(news_final['Title'])

# Adds headline sentiment to the dataframe
headline_sentiment = [sia.polarity_scores(text) for text in headlines]
compound_headline_sentiments = pd.Series(
    [scores['compound'] for scores in headline_sentiment]
)
# `.values` makes the assignment positional: the fresh Series carries a 0..n-1
# index, which would not line up with news_final's own index if rows were
# removed by the earlier dropna().
news_final['HeadlineSentiment'] = compound_headline_sentiments.values

# Adds title sentiment to the dataframe
title_sentiment = [sia.polarity_scores(text) for text in titles]
compound_title_sentiments = pd.Series(
    [scores['compound'] for scores in title_sentiment]
)
news_final['TitleSentiment'] = compound_title_sentiments.values

In [8]:
# Remove time stamp and add 'Hour of Day' column and 'Day of Week' column
import datetime

# Parse every timestamp exactly once, vectorised, instead of running
# strptime twice per row in two separate Python loops. The format is strict
# ('YYYY-MM-DD HH:MM:SS'); a value that does not match raises, as before.
news_dates = news_final['PublishDate']
publish_times = pd.to_datetime(news_dates, format='%Y-%m-%d %H:%M:%S')

# Hours are bucketed into 3-hour bins, so HourOfDay takes integer values 0..7.
news_final['HourOfDay'] = publish_times.dt.hour // 3

# Monday = 0 ... Sunday = 6 (same convention as datetime.weekday()).
news_final['DayOfWeek'] = publish_times.dt.weekday

# Discard the raw timestamp and the text/source columns; only the derived
# numeric features are kept from here on.
news_final = news_final.drop(columns=['PublishDate', 'Title', 'Headline', 'Source'])

In [9]:
# 0 to 3 topic labels
TOPIC_TO_LABEL = {'economy': 0, 'obama': 1, 'palestine': 2, 'microsoft': 3}

# Fail loudly on any topic outside the mapping. Previously such rows were
# silently skipped, which left the label list shorter than the dataframe and
# surfaced only as an opaque length-mismatch error on assignment.
unknown_topics = sorted(str(t) for t in set(news_final['Topic']) - set(TOPIC_TO_LABEL))
if unknown_topics:
    raise ValueError("Unexpected values in 'Topic' column: %s" % unknown_topics)

topic_labels = [TOPIC_TO_LABEL[topic] for topic in news_final['Topic']]

news_final['TopicLabels'] = topic_labels
news_final = news_final.drop(columns=['Topic'])

# Scaling
# Each divisor is the largest value the feature can take, mapping it to [0, 1]:
#   HourOfDay   -> 3-hour bins 0..7
#   DayOfWeek   -> weekday index 0..6
#   TopicLabels -> label ids 0..3
# NOTE(review): TopicLabels is a nominal category encoded as an ordered number;
# confirm that an implied ordering between topics is acceptable for the models.
news_final['HourOfDay'] = news_final['HourOfDay']/7
news_final['DayOfWeek'] = news_final['DayOfWeek']/6
news_final['TopicLabels'] = news_final['TopicLabels']/3

In [10]:
from sklearn.cluster import KMeans
import numpy as np

# Add K means column
# Cluster the articles into `kk` groups using only their GooglePlus score; the
# cluster id becomes the classification target for the models below.
# NOTE: the column is called 'FBPopMean' although it is derived from
# GooglePlus; the name is kept because later cells reference it.
X = np.array(news_final['GooglePlus'])
X = X.reshape(-1,1)
kk = 2
# NOTE(review): algorithm="full" was renamed to "lloyd" in scikit-learn 1.1 and
# is rejected by newer releases; change the value when upgrading scikit-learn.
kmeans = KMeans(n_clusters=kk, random_state=0, max_iter=100, algorithm="full").fit(X)
news_final['FBPopMean'] = kmeans.predict(X)

# Cluster based oversampling
m0 = news_final[news_final.FBPopMean == 0]
m1 = news_final[news_final.FBPopMean == 1]

n0, n1 = len(m0), len(m1)
if n0 == 0 or n1 == 0:
    # An empty cluster cannot be oversampled (the previous while-loop would
    # never terminate when cluster 1 was empty).
    raise ValueError(
        "K-means produced an empty cluster (sizes: %d, %d); cannot balance classes"
        % (n0, n1)
    )

# K-means label ids are arbitrary, so either cluster may be the smaller one.
# Repeat each cluster ceil(other_size / own_size) times: the larger cluster is
# kept once, the smaller one is duplicated whole until it is at least as large.
# `-(-a // b)` is integer ceil-division. Building the result with a single
# concat avoids both the removed DataFrame.append API and quadratic copying.
m0_final = pd.concat([m0] * max(1, -(-n1 // n0)))
m1_final = pd.concat([m1] * max(1, -(-n0 // n1)))

news_final_mega = pd.concat([m0_final, m1_final])
news_final = news_final_mega

In [11]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from sklearn.preprocessing import OneHotEncoder

# Prepare input for ANN
# Targets: one-hot encoding of the cluster id. OneHotEncoder returns a scipy
# sparse matrix by default; convert it to a dense array so the network is fed
# a plain ndarray rather than relying on sparse-target handling in Keras.
enc = OneHotEncoder(handle_unknown='ignore')
Y = enc.fit_transform(np.array(news_final['FBPopMean']).reshape(-1,1)).toarray()

# Features: every remaining column except the clustering input (GooglePlus)
# and the target derived from it.
X = np.array(news_final.drop(columns=['GooglePlus','FBPopMean']))

# ANN model specifications
# Two sigmoid hidden layers (8 and 6 units) followed by a softmax over the
# `kk` clusters. The input width is taken from X instead of being hard-coded,
# so the model stays consistent if the feature set changes.
model = Sequential([
    Dense(8, input_shape=(X.shape[1],)), Activation('sigmoid'),
    Dense(6), Activation('sigmoid'),
    Dense(kk), Activation('softmax'),
])
model.compile(optimizer='nadam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [12]:
# Training the net
# Fit on the complete feature matrix X against the one-hot targets Y for 20
# epochs; no validation data is passed. The call is left as the cell's last
# expression so the returned History object is displayed.
model.fit(x=X, y=Y, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1104d9400>

In [13]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Same feature matrix as the ANN; the target here is the raw cluster id
# rather than its one-hot encoding.
xx = X
yy = np.array(news_final['FBPopMean'])

# Hold out 25% of the rows for evaluation (fixed seed for a repeatable split).
# NOTE(review): news_final was balanced by duplicating rows before this split,
# so copies of the same row can land in both the train and the test part.
xx_train, xx_test, yy_train, yy_test = train_test_split(
    xx, yy, test_size=0.25, random_state=0
)

logisticRegr = LogisticRegression(max_iter=1000).fit(xx_train, yy_train)

print("Logistic Regression Accuracy = ")
logisticRegr.score(xx_test, yy_test)

Logistic Regression Accuracy = 


0.5858105772554442

In [14]:
# Correlation scores
# Print the correlation (Series.corr, Pearson by default) between the cluster
# label and each input feature, one value per line, in this fixed order.
print("Correlation Scores")
for feature in ['TopicLabels', 'HeadlineSentiment', 'TitleSentiment',
                'HourOfDay', 'DayOfWeek']:
    print(news_final['FBPopMean'].corr(news_final[feature]))

Correlation Scores
0.11476229466915897
-0.12238836200963803
-0.14821643706152787
0.07493108993647242
0.019834891146101388
