In [1]:
import pandas as pd
import numpy as np  

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier




%matplotlib inline

ModuleNotFoundError: No module named 'vaderSentiment'

In [None]:
# Load the first tweet export (read from the working directory) and preview it.
tweets = pd.read_csv("trumptweet.csv")
tweets.head()

In [None]:
# List the column labels of the tweet frame, one per line.
for col in tweets.columns: 
    print(col)

In [None]:
# Drop every row that has a missing value so each remaining tweet has text.
tweets.dropna(inplace=True)
# Character count of each tweet as a NumPy array. Iterating over the frame
# itself yields column labels, so the lengths must come from the `text` column.
tweets["text"].str.len().to_numpy()

In [None]:
# Store the character count of every tweet in a new `Text_Length` column.
tweets["Text_Length"]= tweets["text"].str.len() 
tweets.head()


Above, we have added code to figure out the length of each tweet in tweets

There are outliers present, so we have to remove them.

In [None]:
from scipy import stats

# Keep only tweets whose length lies within three standard deviations of the
# mean length (3-sigma rule).
length_mean = tweets.Text_Length.mean()
length_std = tweets.Text_Length.std()
within_three_sigma = (tweets.Text_Length - length_mean).abs() <= 3 * length_std

# drop=True renumbers the rows 0..n-1 without adding the old labels as an
# extra `index` column, so the cell can be re-run safely.
tweets = tweets[within_three_sigma].reset_index(drop=True)
tweets.tail()

In [None]:
# Tweet length as a time series. `created_at` is parsed to datetimes here so
# the index is a real date axis rather than raw timestamp strings.
Tweet_lengths = pd.Series(
    data=tweets['Text_Length'].values,
    index=pd.to_datetime(tweets['created_at']),
)
Tweet_lengths.describe()

In [None]:
# Draw the tweet-length series in red on a wide figure and label the axes.
fig, ax = plt.subplots(figsize=(16, 4))
Tweet_lengths.plot(ax=ax, color='r')
ax.set_xlabel('Date of Tweet')
ax.set_ylabel('Length of Tweet')
ax.set_title("Length of Tweets by Date")
plt.show()

In [None]:
# Distinct tweet sources in order of first appearance.
sources = list(dict.fromkeys(tweets['source']))

print("Content sources:")
for source in sources:
    print(f"* {source}")

In [None]:
# Number of tweets per source, in the same order as `sources`.
source_counts = tweets['source'].value_counts().reindex(sources, fill_value=0)

# Convert counts to true percentages of all tweets (count / total * 100).
# Dividing the raw counts by a constant 100 does not give percentages.
total_tweets = source_counts.sum()
percent = source_counts.to_numpy(dtype=float)
if total_tweets:
    percent = percent / total_tweets * 100

# Pie chart:
pie_chart = pd.Series(percent, index=sources, name='Sources')
pie_chart.plot.pie(fontsize=11, autopct='%.4f', figsize=(6, 6));

In [None]:
# Parse the `created_at` strings into datetimes, then show the column dtypes.
tweets['created_at']= pd.to_datetime(tweets['created_at']) 
tweets.info()

In [None]:
from textblob import TextBlob
import re

def clean_tweet(tweet):
    '''
    Remove special characters from a tweet and normalise its whitespace.

    Every @mention (letters and digits after the @), every URL of the form
    scheme://... and every character that is not an ASCII letter, digit,
    space or tab is replaced by a space. Runs of whitespace are then
    collapsed to single spaces and the ends are trimmed.

    tweet: the raw tweet text (str).
    Returns the cleaned text (str).
    '''
    # Raw string so the regex escapes reach `re` untouched instead of relying
    # on Python keeping unknown string escapes, which is deprecated.
    return ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def analize_sentiment(tweet):
    '''
    Label a tweet by the sign of its TextBlob polarity.

    The tweet is cleaned with clean_tweet first. Returns 1 when the polarity
    is above zero, 0 when it is exactly zero, and -1 otherwise.
    '''
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity > 0:
        return 1
    return 0 if polarity == 0 else -1

In [None]:
# Cleaned version of every tweet, stored next to the raw text.
tweets['cleaned'] = tweets['text'].map(clean_tweet)

In [None]:
# Sentiment label (-1, 0 or 1) of every tweet.
tweets['sentiment'] = tweets['text'].map(analize_sentiment)


In [None]:
# Preview the last rows, including the `cleaned` and `sentiment` columns.
tweets.tail()

In [None]:
# Split the tweet texts by sentiment label. Boolean masks pair each text with
# its own label regardless of the frame's index, whereas looking labels up by
# enumerate position only works while the index is exactly 0..n-1.
positive = tweets.loc[tweets['sentiment'] > 0, 'text'].tolist()
neutral = tweets.loc[tweets['sentiment'] == 0, 'text'].tolist()
negative = tweets.loc[tweets['sentiment'] < 0, 'text'].tolist()

In [None]:
# Share of tweets in each sentiment bucket, as a percentage of all tweets.
total_tweets = len(tweets['text'])
for label, bucket in (("Positive", positive), ("Neutral", neutral), ("Negative", negative)):
    print(f"{label} tweets: {len(bucket) * 100 / total_tweets}%")

In [None]:
# Calendar day of each tweet, stored under both `date` and `Date`.
tweet_day = tweets['created_at'].dt.date
tweets['date'] = tweet_day
tweets["Date"] = tweet_day

# Rebind `tweets` to per-day non-null counts of every remaining column.
tweets = tweets.groupby(by='date').count()

In [None]:
# After the per-day groupby, `created_at` holds the number of tweets per day;
# copy it to `count` and report the mean number of tweets per day.
tweets['count']= tweets['created_at']
mean_tweets = tweets['count'].mean()
print(mean_tweets)


## After a small sneak peek, we shall begin the bulk of our analysis
Let us start by changing the way we perform a sentiment analysis while also utilizing more data from Trump's Tweets

In [None]:
# Replace `tweets` with the updated export under Data/. This discards the
# per-day counts computed above. Backslash escapes in the file are decoded.
tweets = pd.read_csv("Data/trumptweets.csv", header=0,encoding = 'unicode_escape') #Updated Tweet data
tweets.head()

In [None]:
# Load the six historical price files.
stock1 = pd.read_csv('Data/historical.csv')
stock2 = pd.read_csv('Data/historical2.csv')
stock3 = pd.read_csv('Data/historical3.csv')
stock4 = pd.read_csv('Data/historical4.csv')
stock5 = pd.read_csv('Data/historical5.csv')
stock6 = pd.read_csv('Data/historical6.csv')

# Stack them in file order. pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0. Each file keeps its own row labels, as before.
stock_df = pd.concat([stock1, stock2, stock3, stock4, stock5, stock6])
stock_df.head()

In [None]:
# Work on an independent copy of the two columns we need. Adding columns to a
# plain slice of stock_df raises SettingWithCopyWarning.
trim_df = stock_df[['Date', 'Open']].copy()
trim_df['pure_date'] = trim_df.Date

In [None]:
# Direction of each day's move: 1 when Change is above zero, -1 otherwise
# (zero and missing values therefore count as -1).
change = np.where(stock_df.Change > 0, 1, -1).tolist()
trim_df['Change'] = change
trim_df

In [None]:
# Quick look at the opening price against the Date column.
stock_df.plot('Date','Open')


In [None]:
# Seaborn line plot of the opening price over the raw date strings.
sns.set_style('darkgrid')
ax = sns.lineplot(x=trim_df.pure_date,y=trim_df.Open, linewidth=0.2,color='black')
# NOTE(review): the tick positions are given as date strings; this presumably
# relies on `pure_date` holding strings in the same MM/DD/YY format - confirm.
ax.set(title='S&P 500 Prices',
       xlabel='Date for S&P 500',
       xticks=['02/27/14', '02/14/15', '02/03/16', '01/22/17', '01/09/18'],
      ylabel='Price')
plt.show()

In [None]:
# Plain Python list of tweet texts for the VADER analyzer.
sentences = tweets['text'].tolist()

In [None]:
analyzer = SentimentIntensityAnalyzer()

# Keep one polarity-score dict per tweet, in the order of `sentences`.
# Overwriting a single variable in a loop would retain only the last tweet.
vader_scores = [analyzer.polarity_scores(sentence) for sentence in sentences]

# `vs` still refers to the scores of the final tweet, as it did before.
# It is None when there are no tweets.
vs = vader_scores[-1] if vader_scores else None

Here we have to use a sentiment classifier

tweets['sentiment'] = vs['compound'] 

In [None]:
# TextBlob polarity and subjectivity for every tweet, computed once per tweet.
text_sentiments = [TextBlob(text).sentiment for text in tweets['text']]
tweets['sentiment'] = [scores.polarity for scores in text_sentiments]
tweets['subjectivity'] = [scores.subjectivity for scores in text_sentiments]


In [None]:
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()
# tweets["sentimentscore"] = tweets.apply(lambda row: sid.polarity_scores(row['text'])['compound'] , axis=1)

In [None]:
# Preview the frame with the `sentiment` and `subjectivity` columns added.
tweets.head()

In [None]:
# Parsed timestamp of each tweet.
tweets['date']= pd.to_datetime(tweets['created_at']) 

In [None]:
# Calendar day of each tweet (time of day dropped); used as the merge key.
tweets['just_date'] = tweets['date'].dt.date

In [None]:
# Preview the frame with the `date` and `just_date` columns added.
tweets.head()

In [None]:
# Give the stock frame the same `date` / `just_date` columns as the tweets so
# the two frames can be joined on calendar day.
trim_df['date']= pd.to_datetime(trim_df['Date']) 
trim_df['just_date'] = trim_df['date'].dt.date

In [None]:
# Preview the stock frame with its date columns.
trim_df.head()

In [None]:
# Join stock rows and tweets on calendar day. pd.merge defaults to an inner
# join, so only days present in both frames are kept. Both frames have a
# `date` column; the merge suffixes them as `date_x` and `date_y`.
combined= pd.merge(trim_df, tweets, on='just_date')

In [None]:
# Preview the merged stock/tweet frame.
combined.head()

In [None]:
# Summary statistics of the merged frame's numeric columns.
combined.describe()

#### The chunk of code below is optional: it lets us analyze the stock price on the day after a tweet was sent.
For our purpose, we shall observe changes/fluctuations on the same day

In [None]:

# pure_date = []           
# for item in tweets.date:
#     try:
#         pure_date.append(datetime.date(year=item.year, month=item.month, day=item.day+1))
#     except:
#         pure_date.append(datetime.date(year=item.year, month=item.month+1, day=1))
#     else:
#         pass

In [None]:
# Drop the date, price and raw-text columns. Every remaining column is used
# below, with `Change` as the target and the rest as features.
df = combined.drop(['Date','Open','pure_date', 'date_x', 'text' , "created_at", 'date_y', 'just_date'], axis=1)
df.head(10)

## Now we can start the modelling

We shall use the processed dataframe from above to model several aspects

In [None]:
# Target: direction of the day's move. Features: every other column of df.
y = df['Change']
X = df.drop(columns=['Change'])

In [None]:
# Preview the feature matrix.
X.head()

In [None]:
# Preview the first 15 target labels.
y.head(15)

In [None]:
# Hold out the last 25% of rows as the test set; shuffle=False keeps row order.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25, shuffle=False)
# Fit the scaler on the training rows only, then apply the same scaling to the
# test rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# 10-fold grid search over the regularisation strength C of logistic regression.
model_set_1 = [LogisticRegression()]
grid = {'C': np.logspace(-3, 2, 10)}
for model in model_set_1:
    gscv = GridSearchCV(estimator=model, param_grid=grid, cv=10, n_jobs=-1)
    gscv.fit(X_train, y_train)
    print(gscv.best_estimator_, gscv.best_score_, sep="\n")



In [None]:
# 10-fold grid search over the number of neighbours (2 through 49) for KNN.
model_set_2 = KNeighborsClassifier()
grid = {'n_neighbors': range(2, 50, 1)}
gscv2 = GridSearchCV(estimator=model_set_2, param_grid=grid, cv=10, n_jobs=-1)
gscv2.fit(X_train, y_train)
print(gscv2.best_estimator_, gscv2.best_score_, sep="\n")

In [None]:
# The parameter grid is empty, so GridSearchCV only cross-validates the
# default decision tree (10 folds) and reports its mean score.
# NOTE(review): no random_state is set on the tree, so the score presumably
# varies slightly between runs - confirm before quoting exact figures.
model_set_3 = [DecisionTreeClassifier()]
grid = {}
for model in model_set_3:
    gscv3 = GridSearchCV(model,param_grid=grid,n_jobs=-1,cv=10)
    gscv3.fit(X_train,y_train)
    print(gscv3.best_estimator_)
    print(gscv3.best_score_)

### Explained
From the above cross validation tests and scores, we can clearly see that logistic regression performs the best, followed by the decision tree classifier, then followed by the KNN classifier. 

Logistic Regression with a 10 fold cross validation gives us a score of 0.5269324938789787

DecisionTreeClassifier with a cv of 10 gives us a score of 0.4863588667366212, which is worse than a coin toss. 

KNeighborsClassifier with a cv of 10 gives us a score of 0.48382301504022385, which is only very slightly below the decision tree classifier. This is also a little worse than a coin toss. 

### Test set evaluation of Trump on S&P 500 Market Data

First, we shall test the simple Logistic Regression classifier on our test dataset. 
From the cross-validation runs on the training set above, we saw that Logistic Regression performed the best. 

In [None]:
# Sweep C over the same grid as the cross-validation above and report the
# value with the highest accuracy on the held-out test set.
# NOTE(review): C is picked using test-set accuracy, so the printed figure is
# an optimistic estimate.
top = 0
top_i = None  # stays None if no candidate scores above zero
for i in np.logspace(-3,2,10):
    logreg = LogisticRegression(C=i)
    logreg.fit(X_train,y_train)
    score = logreg.score(X_test,y_test)  # score each candidate once
    if score > top:
        top = score
        top_i = i
print('LOGREG - C=', top_i)
print('Accuracy:',top)
print()

In [None]:
# Sweep the number of neighbours (2 through 49) and report the value with the
# highest accuracy on the held-out test set.
# NOTE(review): the neighbour count is picked using test-set accuracy, so the
# printed figure is an optimistic estimate.
top = 0
top_i = None  # reset so a value left over from an earlier cell is never reported
for i in range(2,50):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train,y_train)
    score = knn.score(X_test,y_test)  # score each candidate once
    if score > top:
        top = score
        top_i = i
print(f'KNN with {top_i} Neighbors - Accuracy:', top)
print()


In [None]:
# Fit a default decision tree on the training split and report its accuracy
# on the held-out test split.
dt = DecisionTreeClassifier()
dt.fit(X_train,y_train)
print('Decision Tree Accuracy:', dt.score(X_test,y_test))
print()

Clearly, the logistic regression model performed the best, with an accuracy more than 10% better than a coin toss. 

### Further exploration
On Kaggle and as described in lecture, we can see that the XGBoost model classifier is continually winning competitions and seems to be a good fit. 

Let us see if the algorithm can help us predict our data any better. 

In [None]:
import xgboost as xgb
from sklearn.metrics import confusion_matrix

# Recent xgboost releases require class labels 0..n_classes-1, so the model is
# trained on 0/1 and its predictions are mapped back to the notebook's -1/+1.
y_encoded = (y > 0).astype(int)
class_labels = [-1, 1]

kf = KFold(n_splits=5, shuffle=False)
mat_list = []
for train_index, test_index in kf.split(X):
    # kf.split yields positions, so select rows with .iloc, not by label.
    X_train1 = X.iloc[train_index]
    y_train1 = y_encoded.iloc[train_index]
    xgb_model = xgb.XGBClassifier().fit(X_train1,y_train1)
    predictions = np.where(xgb_model.predict(X.iloc[test_index]) == 1, 1, -1)
    actuals = y.iloc[test_index]
    # Fix the label order so every fold yields a 2x2 matrix, even if one
    # class is missing from that fold.
    mat_list.append(confusion_matrix(actuals, predictions, labels=class_labels))

# Sum the per-fold matrices into a new array. This works for any number of
# folds and leaves mat_list[0] unmodified.
matrix = np.sum(mat_list, axis=0)
true = matrix[0][0] + matrix[1][1]
false = matrix[1][0] + matrix[0][1]
print('XGB performance', true/(true+false))

In [None]:
# Fit XGBoost on the scaled training split and predict the held-out test split.
# NOTE(review): y_train appears to hold -1/+1 labels (built from `Change`);
# recent xgboost releases expect labels 0..n_classes-1 - confirm this fits on
# the installed version.
xgb_model = xgb.XGBClassifier().fit(X_train,y_train)
predictions = xgb_model.predict(X_test)
actuals = y_test


In [None]:
# Accuracy of the fitted XGBoost model on the held-out test split.
print('XGB Accuracy', xgb_model.score(X_test,y_test))


In [None]:
# Confusion matrix of test-set labels (rows) against predictions (columns).
print(confusion_matrix(actuals, predictions))

### Fin