<a href="https://colab.research.google.com/github/DanielOnGitHub17/stock-sentiment-ai4all/blob/main/StockSentimentKaggle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd

In [27]:
df = pd.read_csv("drive/MyDrive/Codes/stock-market-data.csv", encoding="ISO-8859-1")

In [None]:
df.head()

Split dataset

In [29]:
train = df[df["Date"] < "20150101"]  # train data is before 2015
test = df[df["Date"] > "20141231"]   # Test data is after 2015

Preprocess

In [None]:
# extract only the headlines
data = train.iloc[:, 2:27]

# Remove punctuations
data.replace("[^a-zA-Z]", " ", regex=True, inplace=True)

# Rename column names for easy access. TopX -> X
list1 = [i for i in range(25)]
new_index = [str(i) for i in list1]
data.columns = new_index
data.head()

In [None]:
# convert headlines to lower case
for index in new_index:
    data[index] = data[index].str.lower()
data.head()

In [32]:
# Combine first row of data
' '.join(str(x) for x in data.iloc[0, 0:25])

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

In [33]:
# combine headlines for all data
headlines = []
for row in range(len(data.index)):
    headlines.append(' '.join(str(x) for x in data.iloc[row, 0:25]))

In [34]:
headlines[0]

'a  hindrance to operations   extracts from the leaked reports scorecard hughes  instant hit buoys blues jack gets his skates on at ice cold alex chaos as maracana builds up for united depleted leicester prevail as elliott spoils everton s party hungry spurs sense rich pickings gunners so wide of an easy target derby raise a glass to strupar s debut double southgate strikes  leeds pay the penalty hammers hand robson a youthful lesson saints party like it s      wear wolves have turned into lambs stump mike catches testy gough s taunt langer escapes to hit     flintoff injury piles on woe for england hunters threaten jospin with new battle of the somme kohl s successor drawn into scandal the difference between men and women sara denver  nurse turned solicitor diana s landmine crusade put tories in a panic yeltsin s resignation caught opposition flat footed russian roulette sold out recovering a title'

In [35]:
# Convert all sentences into vectors using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [36]:
# Now make a BAG of WORDS

# The countvectorizer
countvector = CountVectorizer(ngram_range=(2, 2))

# transform the sentences into vectors
traindataset = countvector.fit_transform(headlines)

In [37]:
traindataset.shape[0] == len(train)

True

In [38]:
# Code an RFC

# The classifier
randomclassifier = RandomForestClassifier(n_estimators=200, criterion="entropy")

# Classify the vectorized dataset with the Label they correspond to in train dataframe
randomclassifier.fit(traindataset, train["Label"])


In [39]:
# Save random classifier model to a file
import pickle
with open("stock-sentiment-predict.pkl", "wb") as rfc_file:
    pickle.dump(randomclassifier, rfc_file)

In [40]:
# Save the random classifier model to a file using joblib
import joblib
joblib.dump(randomclassifier, "stock-sentiment-predict.joblib")

['stock-sentiment-predict.joblib']

In [None]:
# Predict for the Test Dataset
test_transform = []
for row in range(len(test.index)):
    test_transform.append(' '.join(str(x) for x in test.iloc[row, 2:27]))

print(test_transform)

In [None]:
# Remove punctuations and make lowercase in test_dataset;
# Test dataset python list is an array not an dataframe
# remove punctuations
# use re module
import re
# remove punctuations
test_datad = [re.sub(r"[^a-zA-Z]", " ", x).lower() for x in test_transform]

test_datad

In [43]:
# Vectorize the joined test dataset
test_dataset = countvector.transform(test_transform)

In [44]:
# Predict
predictions = randomclassifier.predict(test_dataset)

In [45]:
predictions

array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [46]:
# Import library to check accuracy
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [47]:
# Get confusion matrix
matrix = confusion_matrix(test["Label"], predictions)
print(matrix)

[[138  48]
 [  7 185]]


In [48]:
# Get accuracy score
score = accuracy_score(test["Label"], predictions)
print(score)

0.8544973544973545


In [49]:
# Generate model report
report = classification_report(test["Label"], predictions)
print(report)

              precision    recall  f1-score   support

           0       0.95      0.74      0.83       186
           1       0.79      0.96      0.87       192

    accuracy                           0.85       378
   macro avg       0.87      0.85      0.85       378
weighted avg       0.87      0.85      0.85       378



Check the meanings of the above - from the Week 10 lecture recording and slides

In [50]:
print(":)")

:)


In [51]:
# joblib the count vector
joblib.dump(countvector, "stock-sentiment-countvector.joblib")

['stock-sentiment-countvector.joblib']

In [52]:
ss = countvector.transform(['walmart warns of higher prices if trump implements proposed tariffs vw warning strikes to start in december as talks continue why the sp  may be inflated by  microstrategys stock turns south to snap long streak of doubledigit gains uk sanctions angolas dos santos in dirty money crackdown china funds slash etf fees escalating price war in booming market china is behind proxy wars and setting the world on fire gordon chang us weekly jobless claims unexpectedly fall citadels ken griffin says trumps tariffs could lead to crony capitalism viral banana artwork ducttaped to wall sells for m charles payne trump instantly had a positive impact on housing why it pays to focus now on nvidia treasury bonds and a bullish finish to  stock market today dow rises alphabet selloff weighs on nasdaq investors should be looking entirely at trump trade expert oleary details why the us needs a crypto czar weve got to stop these shenanigans jpmorgan asset management sees stock market rolling through  these stocks could bask in the glicked glow as gladiator ii and wicked open top  etfs tied to one stock nvidia cftc advisers recommend use of tokenized assets as collateral truth social owner may develop crypto platform called truthfi northvolt europes hope for a battery champion files for bankruptcy bitcoin is digital gold should be a us reserve asset sen cynthia lummis bribery charges against gautam adani strike at heart of modis india flash drought gripping us threatens to raise food prices in  crytpo has become a powerful political force hal lambert'])

In [58]:
randomclassifier.predict(ss)[0]

1