In [1]:
## This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/train-merged-data/train_merged_ANN.csv
/kaggle/input/data-added-on-dec25/json_train_senti25.csv
/kaggle/input/data-added-on-dec25/test_data_merged_25.csv


## Synopsis
* Loading the final merged file from test and train data
* Building an ANN

In [2]:

from sklearn import preprocessing
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import confusion_matrix

from sklearn.metrics import accuracy_score, recall_score, precision_score,f1_score

In [3]:
# Read the merged training set, using the CSV's first column as the row index,
# then preview the first two rows.
train_csv_path = '/kaggle/input/train-merged-data/train_merged_ANN.csv'
train_final = pd.read_csv(train_csv_path, index_col=[0])
train_final.head(n=2)

Unnamed: 0,Id,date,ticker,SF1,SF2,SF3,SF4,SF5,SF6,SF7,...,tweeted_day_of_week,tweet_month,senti_0,senti_1,senti_2,senti_3,senti_4,word_count,senti_train,Senti_blob
0,1,2018-08-21,$NTAP,-0.628652,0.988891,-0.055714,0.774379,0.551089,-1.329229,-0.995539,...,Tuesday,August,0.0,3.0,7.0,2.0,1.0,100.0,2.0,negative
1,2,2018-10-11,$WYNN,1.315786,1.438754,0.187327,0.608933,-1.15303,1.859441,0.730995,...,Thursday,October,1.0,2.0,8.0,3.0,1.0,100.0,4.0,positive


In [4]:
# Parse the 'date' strings into pandas datetimes (column position is unchanged).
train_final = train_final.assign(date=pd.to_datetime(train_final['date']))

## PCA

In [5]:
from sklearn.decomposition import PCA

# Fit a 5-component PCA on the seven SF* columns of the training frame and
# keep both the fitted estimator (`pca`) and the projected values.
sf_columns = ['SF%d' % i for i in range(1, 8)]
pca = PCA(n_components=5)
principalComponents = pca.fit_transform(train_final[sf_columns])

In [6]:
# Wrap the PCA output in a DataFrame that carries train_final's own row index.
# The pd.concat(..., axis=1) that joins this frame back onto train_final aligns
# rows by index label, so a default 0..n-1 index here would silently yield
# misaligned, NaN-padded rows whenever the CSV's index column is not 0..n-1.
principalDf = pd.DataFrame(data=principalComponents,
                           index=train_final.index,
                           columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5'])

In [7]:
# Remove the raw SF* columns; the PCA projection was computed from them above.
train_final = train_final.drop(columns=['SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7'])


In [8]:
# Put the principal-component columns to the left of the remaining columns
# (column-wise concat, rows matched by index), then preview two rows.
train_final = pd.concat([principalDf, train_final], axis='columns')
train_final.head(n=2)

Unnamed: 0,pc1,pc2,pc3,pc4,pc5,Id,date,ticker,alpha,tweeted_day_of_week,tweet_month,senti_0,senti_1,senti_2,senti_3,senti_4,word_count,senti_train,Senti_blob
0,1.403016,-0.523298,1.650719,-0.131778,-0.405763,1,2018-08-21,$NTAP,2,Tuesday,August,0.0,3.0,7.0,2.0,1.0,100.0,2.0,negative
1,-0.655437,1.677647,-0.518593,2.367071,-0.70686,2,2018-10-11,$WYNN,3,Thursday,October,1.0,2.0,8.0,3.0,1.0,100.0,4.0,positive


In [9]:
# Cast the label-like columns to pandas 'category' dtype in a single astype call.
categorical_cols = [
    'date', 'ticker', 'tweeted_day_of_week', 'tweet_month', 'senti_train',
    'Senti_blob', 'senti_0', 'alpha', 'senti_1', 'senti_2', 'senti_3',
    'senti_4', 'word_count',
]
train_final = train_final.astype({name: 'category' for name in categorical_cols})

In [10]:
# Make sure the five principal-component columns are stored as floats.
train_final = train_final.astype(
    dict.fromkeys(['pc1', 'pc2', 'pc3', 'pc4', 'pc5'], 'float')
)

In [11]:
# Discard the columns that are not fed to the model.
unused_cols = ['Id', 'Senti_blob', 'date', 'ticker', 'tweet_month', 'word_count']
train_final = train_final.drop(columns=unused_cols)

In [12]:
# Separate the target column ('alpha') from the predictor columns.
Y = train_final["alpha"]
X = train_final.drop(columns=["alpha"])

In [13]:
# Partition train_final's columns by dtype: 'category' columns vs. everything else.
cat_attr = train_final.select_dtypes("category").columns.tolist()
num_attr = train_final.columns.difference(cat_attr).tolist()

# Both lists are printed before 'alpha' is removed, so the printed categorical
# list still contains it.
print(cat_attr, num_attr, sep='\n')

# 'alpha' is the target (Y), so it must not be listed as a categorical feature.
cat_attr.remove('alpha')

['alpha', 'tweeted_day_of_week', 'senti_0', 'senti_1', 'senti_2', 'senti_3', 'senti_4', 'senti_train']
['pc1', 'pc2', 'pc3', 'pc4', 'pc5']


In [14]:
# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical columns: fill missing values with the most frequent level, then
# one-hot encode; levels not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its transformer.
column_routes = [
    ('num', numeric_transformer, num_attr),
    ('cat', categorical_transformer, cat_attr),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [15]:
# Fit the preprocessing pipeline on the training predictors and materialise the
# encoded result as a dense DataFrame.
clf_nm = Pipeline([('preprocessor', preprocessor)])
encoded_train = clf_nm.fit_transform(X)
X_train = pd.DataFrame(encoded_train.todense())


In [16]:
# Display (rows, columns) of the encoded training matrix.
X_train.shape

(27006, 77)

In [17]:
# Preview the first rows of the encoded training matrix.
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,67,68,69,70,71,72,73,74,75,76
0,1.403016,-0.523298,1.650719,-0.131778,-0.405763,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.655437,1.677647,-0.518593,2.367071,-0.70686,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,2.036669,-0.288522,-0.564479,-1.396427,0.364319,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.585419,-0.359531,-0.529225,-1.470591,0.34363,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.413565,-0.458076,-0.401093,1.457988,-0.125572,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


## Building ANN

In [18]:
from keras.models import Sequential
from keras.layers import Dense,Dropout

Using TensorFlow backend.


In [19]:
from keras.utils import to_categorical

In [20]:
# Display the length of the target vector.
Y.shape

(27006,)

In [21]:
# One-hot encode the target into a fixed number of class columns.
n_classes = 5
y_train = to_categorical(Y, num_classes=n_classes)


In [22]:
# Create an empty Sequential model; layers are appended to it with model.add().
model = Sequential()

In [23]:
# Hidden stack: 64-unit ReLU layer -> dropout (rate 0.2) -> 32-unit ReLU layer.
# The first layer's input width is the number of encoded feature columns.
n_features = X_train.shape[1]
hidden_layers = [
    Dense(64, activation='relu', kernel_initializer='glorot_normal', input_dim=n_features),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_initializer='glorot_normal'),
]
for layer in hidden_layers:
    model.add(layer)

In [24]:
# Output layer: five softmax units.
output_layer = Dense(5, activation='softmax', kernel_initializer='glorot_normal')
model.add(output_layer)

In [25]:
# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Train for 40 epochs in mini-batches of 32, holding out 20% of the rows for validation.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=40,
    validation_split=0.2,
)

Train on 21604 samples, validate on 5402 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x7f072c335e10>

## Predicting on test data

In [27]:
# Read the merged test set: a single-space cell counts as missing, and the
# CSV's first column becomes the row index.
test_csv_path = '/kaggle/input/data-added-on-dec25/test_data_merged_25.csv'
test_final = pd.read_csv(test_csv_path, na_values=[' '], index_col=[0])

In [28]:
# Project the test SF* features with the PCA that was fitted on the training
# data. Calling fit_transform here would re-fit the PCA on the test set and
# learn different component axes, so pc1..pc5 would no longer correspond to
# the features the network was trained on.
principalComponents1 = pca.transform(test_final[['SF1','SF2','SF3','SF4','SF5','SF6','SF7']])

In [29]:
# Wrap the test-set PCA output in a DataFrame that carries test_final's own row
# index. The pd.concat(..., axis=1) that joins it back onto test_final aligns
# rows by index label, so a default 0..n-1 index here would silently yield
# misaligned, NaN-padded rows whenever the CSV's index column is not 0..n-1.
principalDf2 = pd.DataFrame(data=principalComponents1,
                            index=test_final.index,
                            columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5'])

In [30]:
# Remove the raw SF* columns from the test frame.
test_final = test_final.drop(columns=['SF1', 'SF2', 'SF3', 'SF4', 'SF5', 'SF6', 'SF7'])


In [31]:
# Put the principal-component columns to the left of the remaining test columns.
test_final = pd.concat([principalDf2, test_final], axis='columns')


In [32]:
# Encode the test rows. Only transform() is called, so this relies on the
# `preprocessor` object having been fitted earlier in the notebook.
# Note: test_final is overwritten with the encoded matrix.
clf_nm = Pipeline([('preprocessor', preprocessor)])
encoded_test = clf_nm.transform(test_final)
test_final = pd.DataFrame(encoded_test.todense())

In [33]:
# Predicted class = index of the largest softmax probability in each row.
# `Sequential.predict_classes` was removed from Keras/TensorFlow (2.6+); taking
# the argmax of `predict` gives the same labels and works on old and new
# versions alike. The probabilities are computed before the 'alpha' column is
# added, so the model only ever sees the encoded feature columns.
class_probabilities = model.predict(test_final)
test_final['alpha'] = np.argmax(class_probabilities, axis=-1)


In [34]:
# Show which class labels actually occur among the predictions.
test_final['alpha'].unique()

array([1, 4, 3, 2])