In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [4]:
headlines = pd.read_csv('redditData/reddit_headlines_labels.csv', index_col=None).to_numpy()

In [5]:
headlines_text = [i[0] for i in headlines]
sentiments = [i[1] for i in headlines]

In [6]:
# Convert sentiments to numpy array
sentiments = np.array(sentiments)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(headlines_text, sentiments, test_size=0.2, random_state=42)

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Transform the testing data
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=10000)
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print classification report for more details
print(classification_report(y_test, y_pred))


Accuracy: 0.9169078131459281
              precision    recall  f1-score   support

          -1       0.93      0.91      0.92       848
           0       0.89      0.95      0.92      1031
           1       0.96      0.87      0.91       540

    accuracy                           0.92      2419
   macro avg       0.93      0.91      0.92      2419
weighted avg       0.92      0.92      0.92      2419



In [7]:
# import joblib
# joblib.dump(model, 'models/logistic_regression_model.pkl')

['models/logistic_regression_model.pkl']

In [8]:
import random
k=random.randint(0,len(X_test)-1)
print('Predicted LR:', model.predict(X_test_tfidf[k]) )
print('Real:', y_test[k] )
print(X_test_tfidf[k])

Predicted LR: ['0']
Real: 0
  (0, 4948)	0.2283033265272085
  (0, 4840)	0.19087942850680567
  (0, 4605)	0.22082686847204147
  (0, 4488)	0.07639499253327982
  (0, 4421)	0.2062852784286666
  (0, 4419)	0.27461268921508636
  (0, 3367)	0.23974973241600633
  (0, 3006)	0.26223609168772066
  (0, 2817)	0.22037648350223843
  (0, 1879)	0.24797393194516415
  (0, 1807)	0.17291108813247658
  (0, 1565)	0.2806012339201133
  (0, 1341)	0.28513676133208016
  (0, 632)	0.15555377797548828
  (0, 603)	0.29956526032223874
  (0, 473)	0.3072925823001021
  (0, 297)	0.32290602019607995


In [12]:
headline = "New filings reveal more luxury trips and opaque payments to Clarence Thomas’ wife, Ginni"
#Ensure the input data is in the correct shape (2D array)
headline = tfidf_vectorizer.transform(np.array([headline]))
# Perform prediction using your model
prediction = model.predict(headline)
print(prediction[0])

['0']


In [10]:
# joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.pkl')

['models/tfidf_vectorizer.pkl']