In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from nltk.sentiment import SentimentIntensityAnalyzer
import praw
import time
import os

In [2]:
reddit_id = os.getenv('REDDIT_CLIENT_ID')
reddit_secret = os.getenv('REDDIT_CLIENT_SECRET')
reddit_agent = os.getenv('REDDIT_USER_AGENT')

In [4]:
sia = SentimentIntensityAnalyzer()

In [3]:
reddit = praw.Reddit(client_id=reddit_id, client_secret=reddit_agent, user_agent=reddit_agent)

In [5]:
def get_sentiment_score(ticker):
    subreddit = reddit.subreddit("stocks")
    mentions = subreddit.search(ticker, limit=1)
    sentiment_scores = [sia.polarity_scores(post.title)['compound'] for post in mentions]
    time.sleep(1)
    return np.mean(sentiment_scores) if sentiment_scores else 0

In [6]:
def get_simple_sentiment_score(ticker):
    subreddit = reddit.subreddit("stocks")
    posts = list(subreddit.hot(limit=20))  # Just top 100 posts
    mentions = [post.title for post in posts if ticker in post.title]

    if not mentions:
        return 0  # No mention, neutral

    sentiment_scores = [sia.polarity_scores(title)['compound'] for title in mentions]
    return np.mean(sentiment_scores)


In [7]:
#df = pd.DataFrame(stock_data)
df = pd.read_csv('constituents-financials.csv')

In [8]:
df = df.drop(labels=['SEC Filings','Sector','52 Week High','52 Week Low'],axis=1)

In [9]:
df = df.dropna()

In [None]:
#df['Sentiment'] = df['Symbol'].apply(get_simple_sentiment_score)

In [10]:
df.value_counts()

Symbol  Name                      Price   Price/Earnings  Dividend Yield  Earnings/Share  Market Cap    EBITDA        Price/Sales  Price/Book
AAPL    Apple Inc.                236.00  37.760000       0.0042          6.25            3.572851e+12  1.346610e+11  9.136909     62.649323     1
MU      Micron Technology         91.24   26.523254       0.0044          3.44            1.030611e+11  8.944000e+09  4.104220     2.242044      1
NSC     Norfolk Southern Railway  255.30  22.065687       0.0214          11.57           5.777975e+10  5.888000e+09  4.746940     4.189985      1
NRG     NRG Energy                102.44  25.356436       0.0168          4.04            2.075086e+10  2.193000e+09  0.737992     11.226301     1
NOC     Northrop Grumman          487.27  17.199787       0.0171          28.33           7.063076e+10  4.458000e+09  1.723332     4.817635      1
                                                                                                                           

In [11]:
df.head(10)

Unnamed: 0,Symbol,Name,Price,Price/Earnings,Dividend Yield,Earnings/Share,Market Cap,EBITDA,Price/Sales,Price/Book
0,MMM,3M,152.2,21.286713,0.0199,7.15,83294180000.0,8117000000.0,2.552062,17.855467
1,AOS,A. O. Smith,67.3,18.539946,0.0196,3.63,9758165000.0,809100000.0,2.506078,5.091157
2,ABT,Abbott Laboratories,127.93,16.744764,0.0202,7.64,221889500000.0,10825000000.0,5.383446,5.575749
3,ABBV,AbbVie,183.9,64.3007,0.0373,2.86,324977000000.0,25630000000.0,5.851963,53.88221
4,ACN,Accenture,384.95,32.294464,0.017,11.92,240778500000.0,11065910000.0,3.710195,8.508687
7,AES,AES Corporation,11.0,7.638888,0.0615,1.44,7821297000.0,3334000000.0,0.636706,2.37735
8,AFL,Aflac,107.38,15.955423,0.0214,6.73,59652600000.0,4779250000.0,3.447728,2.407569
10,APD,Air Products,335.26,19.469223,0.0214,17.22,74554790000.0,4296600000.0,6.161247,4.380537
14,ARE,Alexandria Real Estate Equities,97.35,54.083332,0.052,1.8,16850510000.0,1905124000.0,5.461276,0.920428
16,ALLE,Allegion,132.73,20.42,0.0144,6.5,11538120000.0,888400000.0,3.098315,7.344104


In [12]:
df['good_pe'] = df['Price/Earnings'].between(0, 25)  # PE ratio between 0 and 25 is "good"
df['good_eps'] = df['Earnings/Share'] >= 0                 # EPS greater than or equal to 0 is "good"
#df['good_roe'] = df['roe'].between(0.15, 0.20)   # ROE between 0.10 and 0.25 is "good"
#df['good_debt_equity'] = df['debt_to_equity'].between(1.5,2) 
df['good_mc'] = df['Market Cap'] >= 2e9         # Market Cap greater than or equal to 1 billion is "good"
df['good_pb'] = df['Price/Book'].between(0, 3)
#df['good_current'] = df['current_ratio'].between(1.5,3)
#df['good_sentiment'] = df['sentiment'] > 0 
df['good_div'] = df['Dividend Yield'].between(0.02, 0.06)

In [14]:
df['good_stock'] = df[['good_pe', 'good_eps', 'good_mc', 'good_pb', 'good_div']].all(axis=1).astype(int)

In [15]:
df

Unnamed: 0,Symbol,Name,Price,Price/Earnings,Dividend Yield,Earnings/Share,Market Cap,EBITDA,Price/Sales,Price/Book,good_pe,good_eps,good_mc,good_pb,good_div,good_stock
0,MMM,3M,152.20,21.286713,0.0199,7.15,8.329418e+10,8.117000e+09,2.552062,17.855467,True,True,True,False,False,0
1,AOS,A. O. Smith,67.30,18.539946,0.0196,3.63,9.758165e+09,8.091000e+08,2.506078,5.091157,True,True,True,False,False,0
2,ABT,Abbott Laboratories,127.93,16.744764,0.0202,7.64,2.218895e+11,1.082500e+10,5.383446,5.575749,True,True,True,False,True,0
3,ABBV,AbbVie,183.90,64.300700,0.0373,2.86,3.249770e+11,2.563000e+10,5.851963,53.882210,False,True,True,False,True,0
4,ACN,Accenture,384.95,32.294464,0.0170,11.92,2.407785e+11,1.106591e+10,3.710195,8.508687,False,True,True,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,WMB,Williams Companies,55.43,23.487290,0.0353,2.36,6.756973e+10,5.864000e+09,6.518399,5.451952,True,True,True,False,True,0
497,XEL,Xcel Energy,67.20,19.940653,0.0328,3.37,3.858906e+10,5.510000e+09,2.803826,1.993592,True,True,True,True,True,1
498,XYL,Xylem Inc.,124.04,35.643677,0.0118,3.48,3.013490e+10,1.689000e+09,3.577267,2.845411,False,True,True,True,False,0
501,ZBH,Zimmer Biomet,109.48,20.893131,0.0087,5.24,2.179462e+10,2.557400e+09,2.869412,1.765892,True,True,True,True,False,0


In [16]:
scaler = StandardScaler()

In [17]:
features = ['Price/Earnings','Earnings/Share','Market Cap','Price/Book','Dividend Yield']
X = df[features]

In [18]:
X_scaled = scaler.fit_transform(X)

In [19]:
clf = LogisticRegression()

# Train the model
clf.fit(X_scaled, df['good_stock'])

# Predict the "good stock" for each stock in the dataset
df['predicted_good_stock'] = clf.predict(X_scaled)

In [20]:
#see output
df

Unnamed: 0,Symbol,Name,Price,Price/Earnings,Dividend Yield,Earnings/Share,Market Cap,EBITDA,Price/Sales,Price/Book,good_pe,good_eps,good_mc,good_pb,good_div,good_stock,predicted_good_stock
0,MMM,3M,152.20,21.286713,0.0199,7.15,8.329418e+10,8.117000e+09,2.552062,17.855467,True,True,True,False,False,0,0
1,AOS,A. O. Smith,67.30,18.539946,0.0196,3.63,9.758165e+09,8.091000e+08,2.506078,5.091157,True,True,True,False,False,0,0
2,ABT,Abbott Laboratories,127.93,16.744764,0.0202,7.64,2.218895e+11,1.082500e+10,5.383446,5.575749,True,True,True,False,True,0,0
3,ABBV,AbbVie,183.90,64.300700,0.0373,2.86,3.249770e+11,2.563000e+10,5.851963,53.882210,False,True,True,False,True,0,0
4,ACN,Accenture,384.95,32.294464,0.0170,11.92,2.407785e+11,1.106591e+10,3.710195,8.508687,False,True,True,False,False,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,WMB,Williams Companies,55.43,23.487290,0.0353,2.36,6.756973e+10,5.864000e+09,6.518399,5.451952,True,True,True,False,True,0,0
497,XEL,Xcel Energy,67.20,19.940653,0.0328,3.37,3.858906e+10,5.510000e+09,2.803826,1.993592,True,True,True,True,True,1,0
498,XYL,Xylem Inc.,124.04,35.643677,0.0118,3.48,3.013490e+10,1.689000e+09,3.577267,2.845411,False,True,True,True,False,0,0
501,ZBH,Zimmer Biomet,109.48,20.893131,0.0087,5.24,2.179462e+10,2.557400e+09,2.869412,1.765892,True,True,True,True,False,0,0


In [21]:
#check the model's metrics
print("Accuracy",accuracy_score(df['good_stock'],df['predicted_good_stock']))
print("Recall:",recall_score(df['good_stock'],df['predicted_good_stock']))

Accuracy 0.8372781065088757
Recall: 0.3382352941176471


In [22]:
joblib.dump(clf, 'logistic_model.pkl')

['logistic_model.pkl']

In [23]:
df.to_csv('predicted_stocks.csv',index=False)