In [1]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
df=pd.read_excel('dataset_telegram.xlsx')
df.head(5)

Unnamed: 0,DATE,HEADLINE,TICKER,COMPANY NAME,SECTOR,SUBSECTOR,SERIES,DATE OF LISTING,PAID UP VALUE,MARKET LOT,...,close,volume,adj close,avg High,avg low,avg open,avg close,avg volume,avg adj close,trend
0,2022-02-15,Religare Ent Says\nâ–¶ï¸Religare Finvest vict...,RELIGARE,Religare Enterprises Limited,Finance - Investments,Finance - Investments,EQ,2007-11-21,10,1,...,122.050003,122.050003,910705,159.109998,159.779999,152.860001,156.13,156.13,686379.6,bearish
1,2022-02-15,Ashok Leyland targets 40% ICV sales from CNG v...,ASHOKLEY,Ashok Leyland Limited,Auto - LCVs & HCVs,Auto - LCVs & HCVs,EQ,1995-05-25,1,1,...,128.5,127.602028,15896279,159.329999,161.720001,156.199997,158.689999,157.581058,16145476.0,bearish
2,2022-02-15,Cipla promoters plan to sell 2.5% of stock,CIPLA,Cipla Limited,Pharmaceuticals,Pharmaceuticals,EQ,1995-02-08,2,1,...,921.849976,917.393127,8304774,1135.429993,1155.719995,1129.699988,1145.319995,1139.782751,3662772.4,bullish
3,2022-02-15,"2.05 cr shares of Cipla worth â‚¹1,880 cr trad...",CIPLA,Cipla Limited,Pharmaceuticals,Pharmaceuticals,EQ,1995-02-08,2,1,...,921.849976,917.393127,8304774,1135.429993,1155.719995,1129.699988,1145.319995,1139.782751,3662772.4,bullish
4,2022-02-15,NMDC in focus\n\nIron Ore falls 10% on SGX as ...,NMDC,NMDC Limited,Mining & Minerals,Mining & Minerals,EQ,2008-03-03,1,1,...,149.300003,143.697586,17659797,186.379999,189.089999,182.490005,185.719998,178.750937,18851958.2,bullish


In [3]:
df.dropna(inplace=True)
df.shape

(5386, 29)

In [4]:
data = df.iloc[:,1:6]
data.replace("[^a-zA-Z0-9\s]"," ",regex=True,inplace=True)
data.dropna(inplace=True)
data.shape

(5386, 5)

# Combine Columns

In [5]:
corpus = []
for row in range(0,len(data.index)):
    corpus.append(' '.join(str(x) for x in data.iloc[row,0:5]))
corpus

['Religare Ent Says\n      Religare Finvest victim of fraud perpetuated by erstwhile mgmt\n      Have challenged process of RFL being classified as    fraud    account \n      RBI on Feb 11 said RFL restructuring can   t be implemented with REL being a promoter as lenders declared RFL a fraud exposure RELIGARE Religare Enterprises Limited Finance   Investments Finance   Investments',
 'Ashok Leyland targets 40  ICV sales from CNG variants ASHOKLEY Ashok Leyland Limited Auto   LCVs   HCVs Auto   LCVs   HCVs',
 'Cipla promoters plan to sell 2 5  of stock CIPLA Cipla Limited Pharmaceuticals Pharmaceuticals',
 '2 05 cr shares of Cipla worth    1 880 cr trade on NSE at average    915 sh\n\nAlert  Promoters likely to have sold a stake in Cipla via block deal CIPLA Cipla Limited Pharmaceuticals Pharmaceuticals',
 'NMDC in focus\n\nIron Ore falls 10  on SGX as China fights to rein in prices NMDC NMDC Limited Mining   Minerals Mining   Minerals',
 '  Manappuram under pressure after a weak set o

In [6]:
y = pd.get_dummies(df['trend']).iloc[:,1].values
y

array([False, False,  True, ..., False, False,  True])

# Featutre Extraction using CountVectorizer

In [7]:
cv = CountVectorizer(ngram_range=(1,2))
X = cv.fit_transform(corpus)
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Train Test Split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Training model MultinomialNB

In [9]:
ml = MultinomialNB().fit(X_train, y_train)
y_pred = ml.predict(X_test)

In [10]:
accuracy = accuracy_score(y_test, y_pred)
print("Naive Bayes Accuracy:", accuracy)

confusion_m = confusion_matrix(y_test, y_pred)
print("Naive Bayes Confusion Matrix:\n", confusion_m)

Naive Bayes Accuracy: 0.6109873793615441
Naive Bayes Confusion Matrix:
 [[427 265]
 [259 396]]


# Training Model RandomForestClassifier

In [11]:
randomclassifier = RandomForestClassifier(n_estimators=50, criterion='entropy')
rc = randomclassifier.fit(X_train, y_train)

In [12]:
y_pred = rc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", accuracy)

confusion_m = confusion_matrix(y_test, y_pred)
print("Random Forest Confusion Matrix:\n", confusion_m)

Random Forest Accuracy: 0.5924276169265034
Random Forest Confusion Matrix:
 [[395 297]
 [252 403]]


# Training model Logistic Regression

In [13]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log_reg = log_reg.predict(X_test)

accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
print("Logistic Regression Accuracy:", accuracy_log_reg)

confusion_m_log_reg = confusion_matrix(y_test, y_pred_log_reg)
print("Logistic Regression Confusion Matrix:\n", confusion_m_log_reg)

Logistic Regression Accuracy: 0.5887156644394952
Logistic Regression Confusion Matrix:
 [[412 280]
 [274 381]]
