In [62]:
import pandas as pd
import gensim
from nltk.tokenize.casual import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [49]:
df = pd.read_csv('msft_tweets - msft_tweets.csv')
df.head()

Unnamed: 0,date,message,sentiment
0,2014-01-01,"""@BenedictEvans shocking that $MSFT is missing...",2.0
1,2014-01-01,"""RT @ACInvestorBlog: Stocks to Watch for Janua...",1.0
2,2014-01-01,"""Stocks to Watch for January 2, 2014 http:\/\/...",1.0
3,2014-01-01,"""Dow #Stocks Trend $AXP $UTX $CSCO $KO $HD $DI...",0.0
4,2014-01-02,"""Microsoft Corporation : Microsoft Assigned Pa...",1.0


In [60]:
tokenizer  = TweetTokenizer()
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
porter = PorterStemmer()
stop_words = set(stopwords.words('english')) 

def clean1(message):
    message = message.lower()
    message = re.sub(r"[^A-Za-z0-9]", " ", message)
    message = message.split()
    message = [stemmer.stem(word) for word in message]
    return message

def clean2(message):
    #replace urls by URL
    message = re.sub('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', ' URL ', message)
    #replace @user by USER
    message = re.sub('(@[A-Za-z0-9]+)',' USER ', message)
    #remove hashtags
    message = message.replace('#','')
    message = message.replace('$','')
    message = re.sub(r"[^A-Za-z0-9]", " ", message)
    message = message.split()
    message = [word for word in message if word not in stop_words]
    message = [porter.stem(word) for word in message]

In [61]:
message_list = df['message'].tolist()
cleaned_message_list = [clean2(message) for message in message_list]

# build vocabulary and train model
model = gensim.models.Word2Vec(
    cleaned_message_list,
    size=300,
    window=10,
    min_count=1,
    workers=10)
model.train(cleaned_message_list, total_examples=len(cleaned_message_list), epochs=30)

model.save('stockTweetsEmbedding')

TypeError: 'NoneType' object is not iterable

In [5]:
embeddedlist = [sum([model.wv.word_vec(word) for word in message]) for message in cleaned_message_list]

In [35]:
labelled_data = embeddedlist[:1250]
sentiment_list = df['sentiment'].tolist()
labels = [int(sentiment) for sentiment in (sentiment_list[:1250])]

In [46]:
X_train, X_test, y_train, y_test = train_test_split(labelled_data, labels, test_size=0.1)
parameters = {'max_depth':[2,20], 'min_samples_leaf':[1,5],'criterion':('gini','entropy')}#,, {'n_estimators':[10,100],'criterion':('gini','entropy')}#, 'max_depth':[2,20]}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, parameters, cv=5)

for i in range(10):
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    print(accuracy_score(y_test, predictions))
    

    #n_estimators=’warn’, criterion=’gini’, max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=’auto’, max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None

#{'kernel':('linear', 'rbf'), 'C':[1, 10]}

0.504
0.52
0.528
0.44
0.552
0.552
0.504
0.568
0.496
0.536


In [56]:
byday_df = pd.DataFrame(columns={'date','sentiment'})
byday_df['date'] = df['date']
byday_df['sentiment'] = clf.predict(embeddedlist)
byday_df.head()

Unnamed: 0,sentiment,date
0,2,2014-01-01
1,1,2014-01-01
2,1,2014-01-01
3,0,2014-01-01
4,1,2014-01-02


In [57]:
svm_input = pd.read_csv('prediction_targets.csv')
svm_input.head()

Unnamed: 0,Date,target
0,2014-01-02,0
1,2014-01-03,0
2,2014-01-06,0
3,2014-01-07,1
4,2014-01-08,0


In [58]:
import datetime

dates = svm_input['Date'].tolist()
positive_sent = []
neutral_sent = []
negative_sent = []

for date in dates:
    date = datetime.datetime.strptime(date, '%Y-%m-%d')
    oneday_before = str(date-datetime.timedelta(1)).split(' ')[0]
    twodays_before = str(date-datetime.timedelta(2)).split(' ')[0]
    threedays_before = str(date-datetime.timedelta(3)).split(' ')[0]
    days_before = [oneday_before, twodays_before, threedays_before]
    sentiments = byday_df.loc[byday_df['date'].isin(days_before)]['sentiment'].tolist()
    positive_sent.append(sentiments.count(1))
    neutral_sent.append(sentiments.count(0))
    negative_sent.append(sentiments.count(2))

svm_input['positive'] = positive_sent
svm_input['neutral'] = neutral_sent
svm_input['negative'] = negative_sent
svm_input.head()

Unnamed: 0,Date,target,positive,neutral,negative
0,2014-01-02,0,2,1,1
1,2014-01-03,0,3,3,4
2,2014-01-06,0,3,2,5
3,2014-01-07,1,2,0,6
4,2014-01-08,0,2,3,6


In [59]:
svm_input.to_csv('svm_input.csv', encoding='utf-8', index=False)