<a href="https://colab.research.google.com/github/AshiqAbdulkhader/Stock-prediction-using-knowledge-graph/blob/main/Stock_prediction_model1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy

# Data

In [2]:
stock_data = pd.read_csv('Google_dataset.csv')

In [3]:
stock_data.head()

Unnamed: 0.1,Unnamed: 0,Date,Title,Label
0,1,26/05/2006,"Stocks get a kick start, Inflation report does...",0
1,3,31/05/2006,"Google defends growth plans, Stocks end day hi...",1
2,4,01/06/2006,AMD sees bigger gains in war with Intel,1
3,6,05/06/2006,"Sexy summer picks: Google and Yahoo, Wagering ...",1
4,8,07/06/2006,"Google to sell Dell servers, Don't believe the...",1


# Preprocessing

In [7]:
punctuations = string.punctuation
nlp = spacy.load('en')
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()

In [8]:
# Tokenizer function
def spacy_tokenizer(sentence):
  mytokens = parser(sentence)
  mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
  mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
  return mytokens


In [9]:
# Custom transformer using spaCy
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}

# Basic function to clean the text
def clean_text(text):
    # Removing spaces and converting text into lowercase
    return text.strip().lower()

In [10]:
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(1,1))

In [11]:
tfidf_vector = TfidfVectorizer(tokenizer = spacy_tokenizer)

In [14]:
X = stock_data['Title']
ylabels = stock_data['Label']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3)

# Training

In [16]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [17]:
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', bow_vector),
                 ('classifier', classifier)])

In [18]:
pipe.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('cleaner', <__main__.predictors object at 0x7f38275c7790>),
                ('vectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 t...\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7f3828e4f560>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
             

# Metrics

In [19]:
from sklearn import metrics

In [21]:
predicted = pipe.predict(X_test)

In [22]:
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

Logistic Regression Accuracy: 0.5
Logistic Regression Precision: 0.5181058495821727
Logistic Regression Recall: 0.5923566878980892
