In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/stock-prediction-using-sentiment-analysis/json_data_final.csv
/kaggle/input/stock-prediction-using-sentiment-analysis/train_data-1573118738755.json
/kaggle/input/stock-prediction-using-sentiment-analysis/test_data.json
/kaggle/input/stock-prediction-using-sentiment-analysis/train_factors-1573207730757.csv
/kaggle/input/stock-prediction-using-sentiment-analysis/jsonWsenti2.csv
/kaggle/input/stock-prediction-using-sentiment-analysis/test_factors.csv
/kaggle/input/test-json-files/test_json_day.csv
/kaggle/input/test-json-files/test_json.csv


## Synopsis
* Load the "train_json" and "test_json" files
* Performed Lemmitization and Tokenization
* Perfomed TF-IDF
* Built models using Logistic Regression and XGB. Chose Logistic regression as the final model
* Predicted on the "test_json" file tweets to get a sentiment score
* Aggregated the tweets per "ticker" on a day level. Predicted on the aggregated tweets on both "train_json" and "test_json" files using the eariler trained model to get overall sentiment on a day
* Created five new features "senti_0 to senti_4" which calculate the no.of tweets of each sentiment score per "ticker" on a day level
* Write the test and train files into csv format and export them to be used for final model building in naother kernel

In [2]:

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix

#!pip install imblearn
#if the above command does not work to install imblearn package run the following command in your terminal
# conda install -c glemaitre imbalanced-learn
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score

Using TensorFlow backend.


In [3]:
url1 = '/kaggle/input/stock-prediction-using-sentiment-analysis/json_data_final.csv'
url2 = '/kaggle/input/stock-prediction-using-sentiment-analysis/train_factors-1573207730757.csv'
ulr3 = '/kaggle/input/stock-prediction-using-sentiment-analysis/test_data.json'
url4 = '/kaggle/input/stock-prediction-using-sentiment-analysis/test_factors.csv'
url5 = '/kaggle/input/test-json-files/test_json.csv'
url6 = '/kaggle/input/test-json-files/test_json_day.csv'

## 1.0 Reading the train_json csv file

In [4]:
train_json = pd.read_csv(url1,index_col=[0],na_values=[' '])

  mask |= (ar1 == a)


In [5]:
train_json.isna().sum()

stocktwit_tweet        12623
sentiment_score            0
timestamp                  0
ticker                     0
tweeted_day_of_week        0
tweet_month                0
tweet_hour                 0
dtype: int64

In [6]:
train_json.dropna(subset=['stocktwit_tweet'], inplace=True)
train_json.reset_index(drop=True, inplace=True)
train_json.isna().sum()

stocktwit_tweet        0
sentiment_score        0
timestamp              0
ticker                 0
tweeted_day_of_week    0
tweet_month            0
tweet_hour             0
dtype: int64

In [7]:
train_json['timestamp'] = pd.to_datetime(train_json['timestamp']).dt.date

train_json.head(2)

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker,tweeted_day_of_week,tweet_month,tweet_hour
0,going up but hesitating however chart is very...,3,2018-09-19,$AMD,Wednesday,September,18
1,despite china trade war held very well thumbs up,3,2018-10-09,$CAT,Tuesday,October,3


In [8]:
print(train_json["timestamp"].min())
print(train_json["timestamp"].max())

2018-07-01
2018-10-31


In [9]:
#Dropping extra columns
train_json = train_json.drop(['tweeted_day_of_week','tweet_month','tweet_hour'],axis=1)
train_json.head()

Unnamed: 0,stocktwit_tweet,sentiment_score,timestamp,ticker
0,going up but hesitating however chart is very...,3,2018-09-19,$AMD
1,despite china trade war held very well thumbs up,3,2018-10-09,$CAT
2,wtf,2,2018-07-12,$AVGO
3,new insider filing on muller klaus peter tran...,2,2018-07-19,$PH
4,if it bounces tommorrow do the right thing an...,3,2018-08-23,$FB


### 1.1 Train_Json Lemmitizing and tokenizing

In [10]:
import spacy #load spacy
nlp = spacy.load("en", disable=['parser', 'tagger', 'ner'])
stops = spacy.lang.en.stop_words.STOP_WORDS



def normalize(comment, lowercase, remove_stopwords):
    if lowercase:
        comment = comment.lower()
    comment = nlp(comment)
    lemmatized = list()
    for word in comment:
        lemma = word.lemma_.strip()
        if lemma:
            if not remove_stopwords or (remove_stopwords and lemma not in stops):
                lemmatized.append(lemma)
    return " ".join(lemmatized)


train_json['stocktwit_tweet'] = train_json['stocktwit_tweet'].apply(normalize, lowercase=False, remove_stopwords=True)

### 1.2 Train_Json : TF IDF

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
tfidf = TfidfVectorizer(max_features=1500, stop_words='english', lowercase = False,ngram_range=(1,3))
train_json_tfidf = tfidf.fit_transform(train_json['stocktwit_tweet'])

print(train_json_tfidf.shape)

(1010474, 1500)


In [13]:
y = train_json['sentiment_score']

In [14]:
X_train, X_valid, y_train, y_valid = train_test_split(train_json_tfidf, y, train_size = 0.7, random_state = 20)
print('X Train Shape', X_train.shape)
print('y Train Shape', y_train.shape)
print('X Valid Shape', X_valid.shape)
print('y Train Shape', y_valid.shape)

X Train Shape (707331, 1500)
y Train Shape (707331,)
X Valid Shape (303143, 1500)
y Train Shape (303143,)


## 2.0 Train_json model building

### 2.1 Building Logistic Regression

In [15]:
%%time
clf_logreg = LogisticRegression()
clf_logreg.fit(X_train, y_train)



CPU times: user 1min 16s, sys: 88 ms, total: 1min 16s
Wall time: 1min 16s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [16]:
train_pred_logreg_ = clf_logreg.predict(X_train)
val_pred_logreg_sm = clf_logreg.predict(X_valid)

f1_logreg_train = f1_score(y_true=y_train, y_pred=train_pred_logreg_,average='weighted')
f1_logreg_val = f1_score(y_true=y_valid, y_pred=val_pred_logreg_sm,average='weighted')

print("F1 score for logreg Train Data is ",f1_logreg_train)
print("F1 score for logreg Test Data is ",f1_logreg_val)
print(accuracy_score(y_true=y_train, y_pred=train_pred_logreg_))
print(accuracy_score(y_true=y_valid, y_pred=val_pred_logreg_sm))

F1 score for logreg Train Data is  0.6098100120789917
F1 score for logreg Test Data is  0.6089133676812234
0.6329172622152853
0.6316358946107943


### 2.2 Building XGBoost

In [17]:
X_train.shape

(707331, 1500)

In [18]:

%%time

clf_XGB = XGBClassifier(random_state=123, n_jobs=-1)

param_grid_XGB = {'classifier__colsample_bytree':[0.8],
                  'classifier__n_estimators':[800],
                  'classifier__max_depth':[15],
                  'classifier__learning_rate':[0.1],
                  'classifier__subsample':[0.9]
                 } 

grid_XGB = GridSearchCV(clf_XGB, param_grid=param_grid_XGB, cv=5)

grid_XGB.fit(X_train, y_train)


CPU times: user 44min 55s, sys: 4.02 s, total: 44min 59s
Wall time: 11min 41s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=-1,
                                     nthread=None, objective='binary:logistic',
                                     random_state=123, reg_alpha=0,
                                     re...1, scale_pos_weight=1,
                                     seed=None, silent=None, subsample=1,
                                     verbosity=1),
             iid='warn', n_jobs=None,
             param_grid={'classifier__colsample_bytree': [0.8],
                         'classifier__learning_rate': [0.1],
        

In [19]:
train_pred_XGB = grid_XGB.predict(X_train)
val_pred_XGB = grid_XGB.predict(X_valid)

f1_XGB_train = f1_score(y_true=y_train, y_pred=train_pred_XGB,average='weighted')
f1_XGB_val = f1_score(y_true=y_valid, y_pred=val_pred_XGB,average='weighted')

print("F1 score for logreg Train Data is ",f1_XGB_train)
print("F1 score for logreg Test Data is ",f1_XGB_val)
print(accuracy_score(y_true=y_train, y_pred=train_pred_XGB))
print(accuracy_score(y_true=y_valid, y_pred=val_pred_XGB))

F1 score for logreg Train Data is  0.5365891464942166
F1 score for logreg Test Data is  0.5373480081991717
0.5847630034594836
0.5852320521997869


## 3.0 Train_Json :Aggregating the tweets at a day level 

In this section, tweets are aggregated on a day level per ticker and five new features are created that the capture the no.of tweets per sentiment

In [20]:
train_json["_dummy"]=1
df2=train_json.pivot_table(index=["timestamp", "ticker"], columns="sentiment_score", values="_dummy", aggfunc="sum").fillna(0)
df2.head()

Unnamed: 0_level_0,sentiment_score,0,1,2,3,4
timestamp,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0
2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0
2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0
2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0
2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0


In [21]:
df3=train_json.pivot_table(index=["timestamp", "ticker"], values="stocktwit_tweet", aggfunc="sum")
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stocktwit_tweet
timestamp,ticker,Unnamed: 2_level_1
2018-07-01,$AABA,option volume x normal friday contract volume ...
2018-07-01,$AAL,s undervalue lowweek target zone markmonthly c...
2018-07-01,$AAP,short sale volume short interest shortvolumes
2018-07-01,$AAPL,warren diggered exitright angle apple lol craz...
2018-07-01,$ABBV,bullish bat bullish bat bullish divergencewond...


In [22]:
res=pd.concat([df2, df3], axis=1)
res.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,stocktwit_tweet
timestamp,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...
2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...
2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes
2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...
2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...


In [23]:
train_json_day = pd.DataFrame(res.to_records())
train_json_day.head()

Unnamed: 0,timestamp,ticker,0,1,2,3,4,stocktwit_tweet
0,2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...
1,2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...
2,2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes
3,2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...
4,2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...


In [24]:
train_json_day.columns = ['date','ticker','senti_0','senti_1','senti_2','senti_3','senti_4','tweet']
train_json_day.head()

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,tweet
0,2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...
1,2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...
2,2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes
3,2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...
4,2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...


In [25]:
train_json_day['word_count'] = [len(text.split(' ')) for text in train_json_day['tweet']]
train_json_day.head()

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,tweet,word_count
0,2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...,8
1,2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...,28
2,2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes,6
3,2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...,253
4,2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...,16


In [26]:
train_json_day.loc[1,'tweet' ]

's undervalue lowweek target zone markmonthly chart support hope rebindb enterprise value vs b market cap difference actual asset vs market value big spread way undervaluelook way oversell'

In [27]:
train_json_day.shape

(64482, 9)

### 3.1 Train_Json aggregated file : Pre-Processing : Lemitization and tokenizing

In [28]:
train_json_day['tweet'] = train_json_day['tweet'].apply(normalize, lowercase=False, remove_stopwords=True)

In [29]:
train_json_day.head()

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,tweet,word_count
0,2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...,8
1,2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...,28
2,2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes,6
3,2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...,253
4,2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...,16


### 3.2 Train_Json_Agg Prediction

In [30]:
train_json_agg = tfidf.transform(train_json_day['tweet'])

In [31]:
train_json_day["senti_train"] = clf_logreg.predict(train_json_agg)

In [32]:
train_json_day['senti_train'].value_counts()

2    40334
1     8196
4     6818
3     4952
0     4182
Name: senti_train, dtype: int64

#### Sentiment analysis using text blob

In [33]:
from textblob import *

In [34]:
def get_tweet_sentiment(tweet): 
    
    # create TextBlob object of passed tweet text 
    analysis = TextBlob(tweet) 
    # set sentiment 
    if analysis.sentiment.polarity > 0: 
        return('positive')
    elif analysis.sentiment.polarity == 0: 
        return ('neutral')
    else: 
        return ('negative')

In [35]:
train_json_day['Senti_blob'] = train_json_day['tweet'].apply(get_tweet_sentiment)

In [36]:
train_json_day.head()

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,tweet,word_count,senti_train,Senti_blob
0,2018-07-01,$AABA,0.0,0.0,1.0,0.0,0.0,option volume x normal friday contract volume ...,8,2,positive
1,2018-07-01,$AAL,0.0,1.0,1.0,2.0,1.0,s undervalue lowweek target zone markmonthly c...,28,3,neutral
2,2018-07-01,$AAP,0.0,1.0,0.0,0.0,0.0,short sale volume short interest shortvolumes,6,1,neutral
3,2018-07-01,$AAPL,1.0,2.0,27.0,5.0,2.0,warren diggered exitright angle apple lol craz...,253,4,positive
4,2018-07-01,$ABBV,2.0,0.0,0.0,0.0,1.0,bullish bat bullish bat bullish divergencewond...,16,0,neutral


In [37]:
train_json_day.to_csv("json_train_senti25.csv")

## 4.0 Reading and preprocessing the  test_json data

In [38]:
test_json_act = pd.read_csv(url5,index_col=[0],na_values=[' '])

In [39]:
test_json_act.head()

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25,$CELG
1,yall exhaust your buyer on first green candle...,2018-07-13,$AMD
2,day traders day,2018-09-25,$AMD
4,weak price action so far today do not be afra...,2018-07-31,$MU
5,continues to grow specifically in key areas l...,2018-08-04,$AMZN


In [40]:
test_json_act.isna().sum()

stocktwit_tweet    3990
timestamp             0
ticker                0
dtype: int64

In [41]:
test_json_act.dropna(subset=['stocktwit_tweet'], inplace=True)
test_json_act.reset_index(drop=True, inplace=True)
test_json_act.isna().sum()

stocktwit_tweet    0
timestamp          0
ticker             0
dtype: int64

In [42]:
test_json_act['timestamp'] = pd.to_datetime(test_json_act['timestamp']).dt.date

test_json_act.head(2)

Unnamed: 0,stocktwit_tweet,timestamp,ticker
0,nothing to be exited about,2018-10-25,$CELG
1,yall exhaust your buyer on first green candle...,2018-07-13,$AMD


In [43]:
print(test_json_act["timestamp"].min())
print(test_json_act["timestamp"].max())

2018-07-01
2018-10-31


In [44]:
test_json_act['stocktwit_tweet'] = test_json_act['stocktwit_tweet'].apply(normalize, lowercase=False, remove_stopwords=True)

In [45]:
test_json_act_tfidf = tfidf.transform(test_json_act['stocktwit_tweet'])

### 4.1 Predicting on the test_json individual tweets

In [46]:
test_json_act["senti_score"] = clf_logreg.predict(test_json_act_tfidf)

In [47]:
test_json_act.head()

Unnamed: 0,stocktwit_tweet,timestamp,ticker,senti_score
0,exit,2018-10-25,$CELG,2
1,-PRON- exhaust buyer ﻿1 green candle byeeeeee,2018-07-13,$AMD,4
2,day trader day,2018-09-25,$AMD,2
3,weak price action far today afraid short break...,2018-07-31,$MU,1
4,continue grow specifically key area like cloud...,2018-08-04,$AMZN,2


### 4.2 Predicting on test_json Day level aggregated tweets

In [48]:
test_json_act["_dummy"]=1
df2=test_json_act.pivot_table(index=["timestamp", "ticker"], columns="senti_score", values="_dummy", aggfunc="sum").fillna(0)

In [49]:
df3=test_json_act.pivot_table(index=["timestamp", "ticker"], values="stocktwit_tweet", aggfunc="sum")
df3.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,stocktwit_tweet
timestamp,ticker,Unnamed: 2_level_1
2018-07-01,$AAL,short ratio short float sunshineaveshort volum...
2018-07-01,$AAPL,bush f donald fduckgreat callout morning gjtes...
2018-07-01,$AEP,americanelectricpower little volatile market b...
2018-07-01,$AES,bull reason pay attention
2018-07-01,$ALXN,short interest ratio short float sunshineave


In [50]:
res=pd.concat([df2, df3], axis=1)
res.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,stocktwit_tweet
timestamp,ticker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2018-07-01,$AAL,3.0,0.0,0.0,0.0,0.0,short ratio short float sunshineaveshort volum...
2018-07-01,$AAPL,1.0,0.0,7.0,1.0,1.0,bush f donald fduckgreat callout morning gjtes...
2018-07-01,$AEP,0.0,0.0,1.0,0.0,0.0,americanelectricpower little volatile market b...
2018-07-01,$AES,0.0,0.0,0.0,1.0,0.0,bull reason pay attention
2018-07-01,$ALXN,1.0,0.0,0.0,0.0,0.0,short interest ratio short float sunshineave


In [51]:
test_json_day = pd.DataFrame(res.to_records())
test_json_day.head()

Unnamed: 0,timestamp,ticker,0,1,2,3,4,stocktwit_tweet
0,2018-07-01,$AAL,3.0,0.0,0.0,0.0,0.0,short ratio short float sunshineaveshort volum...
1,2018-07-01,$AAPL,1.0,0.0,7.0,1.0,1.0,bush f donald fduckgreat callout morning gjtes...
2,2018-07-01,$AEP,0.0,0.0,1.0,0.0,0.0,americanelectricpower little volatile market b...
3,2018-07-01,$AES,0.0,0.0,0.0,1.0,0.0,bull reason pay attention
4,2018-07-01,$ALXN,1.0,0.0,0.0,0.0,0.0,short interest ratio short float sunshineave


In [52]:
test_json_day['word_count'] = [len(text.split(' ')) for text in test_json_day['stocktwit_tweet']]
test_json_day.head()

Unnamed: 0,timestamp,ticker,0,1,2,3,4,stocktwit_tweet,word_count
0,2018-07-01,$AAL,3.0,0.0,0.0,0.0,0.0,short ratio short float sunshineaveshort volum...,16
1,2018-07-01,$AAPL,1.0,0.0,7.0,1.0,1.0,bush f donald fduckgreat callout morning gjtes...,64
2,2018-07-01,$AEP,0.0,0.0,1.0,0.0,0.0,americanelectricpower little volatile market b...,6
3,2018-07-01,$AES,0.0,0.0,0.0,1.0,0.0,bull reason pay attention,4
4,2018-07-01,$ALXN,1.0,0.0,0.0,0.0,0.0,short interest ratio short float sunshineave,6


In [53]:
test_json_day.columns = ['date','ticker','senti_0','senti_1','senti_2','senti_3','senti_4','tweet','word_count']
test_json_day.head(2)

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,tweet,word_count
0,2018-07-01,$AAL,3.0,0.0,0.0,0.0,0.0,short ratio short float sunshineaveshort volum...,16
1,2018-07-01,$AAPL,1.0,0.0,7.0,1.0,1.0,bush f donald fduckgreat callout morning gjtes...,64


In [54]:
test_json_day['Senti_blob'] = test_json_day['tweet'].apply(get_tweet_sentiment)

In [55]:
tfidf_temp = tfidf.transform(test_json_day['tweet'])

In [56]:
test_json_day['senti_train'] = clf_logreg.predict(tfidf_temp)

In [57]:
test_json_day = test_json_day.drop(["tweet"], axis = 1)

In [58]:
test_json_day.head(2)

Unnamed: 0,date,ticker,senti_0,senti_1,senti_2,senti_3,senti_4,word_count,Senti_blob,senti_train
0,2018-07-01,$AAL,3.0,0.0,0.0,0.0,0.0,16,positive,0
1,2018-07-01,$AAPL,1.0,0.0,7.0,1.0,1.0,64,negative,4


In [59]:
test_json_day.shape

(38373, 10)

## 5.0 Loading Test_factors CSV file

In [60]:
test_fact_act = pd.read_csv(url4,na_values=[' '])
test_fact_act.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7
0,270007,21/07/18,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449
1,270008,05/10/18,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353
2,270009,01/10/18,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811
3,270010,24/10/18,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803
4,270011,27/07/18,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111


In [61]:
#test_fact_act.isna().sum()

In [62]:
test_fact_act['date'] = pd.to_datetime(test_fact_act['date']).dt.date

test_fact_act.head(2)

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7
0,270007,2018-07-21,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449
1,270008,2018-05-10,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353


In [63]:
test_fact_act.shape

(11575, 10)

In [64]:
test_fact_act.columns

Index(['Id', 'date', 'ticker', 'SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6',
       'SF7'],
      dtype='object')

In [65]:
print(test_fact_act["date"].min())
print(test_fact_act["date"].max())

2018-01-07
2018-12-10


In [66]:
#Merging the test_json and test_factors
test_merged = pd.merge(test_fact_act, test_json_day,  how='left', left_on=['date','ticker'], right_on = ['date','ticker'])

In [67]:
test_merged.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,senti_0,senti_1,senti_2,senti_3,senti_4,word_count,Senti_blob,senti_train
0,270007,2018-07-21,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449,0.0,0.0,1.0,0.0,1.0,5.0,neutral,2.0
1,270008,2018-05-10,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353,,,,,,,,
2,270009,2018-01-10,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811,,,,,,,,
3,270010,2018-10-24,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803,0.0,0.0,1.0,1.0,0.0,15.0,neutral,2.0
4,270011,2018-07-27,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111,0.0,0.0,1.0,0.0,2.0,8.0,neutral,2.0


In [68]:
test_merged.isna().sum()

Id                0
date              0
ticker            0
SF1               0
SF2               0
SF3               0
SF4               0
SF5               0
SF6               0
SF7               0
senti_0        3623
senti_1        3623
senti_2        3623
senti_3        3623
senti_4        3623
word_count     3623
Senti_blob     3623
senti_train    3623
dtype: int64

In [69]:
test_merged['date'] = pd.to_datetime(test_merged['date'])

In [70]:
test_merged['tweeted_day_of_week'] = test_merged['date'].dt.weekday_name
test_merged['tweet_month'] = test_merged['date'].dt.month_name()
test_merged.head()

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,senti_0,senti_1,senti_2,senti_3,senti_4,word_count,Senti_blob,senti_train,tweeted_day_of_week,tweet_month
0,270007,2018-07-21,$INTC,-3.062194,1.223466,1.741714,2.279266,-1.323573,-0.274912,-4.504449,0.0,0.0,1.0,0.0,1.0,5.0,neutral,2.0,Saturday,July
1,270008,2018-05-10,$CTSH,0.816263,-2.184408,0.157975,-0.264743,-0.836282,0.046276,0.826353,,,,,,,,,Thursday,May
2,270009,2018-01-10,$CB,0.401281,0.091604,0.083411,-1.147041,-0.485223,-0.60106,1.012811,,,,,,,,,Wednesday,January
3,270010,2018-10-24,$CTAS,-0.783521,1.192929,0.813831,-0.368166,-1.113656,-0.553581,-0.683803,0.0,0.0,1.0,1.0,0.0,15.0,neutral,2.0,Wednesday,October
4,270011,2018-07-27,$intc,0.796507,0.455341,0.679032,0.354336,-1.799055,0.126153,0.297111,0.0,0.0,1.0,0.0,2.0,8.0,neutral,2.0,Friday,July


In [71]:
#writing the file to be used for final prediction in another kernel
test_merged.to_csv("test_data_merged_25.csv")