In [2]:
import pandas as pd

# Load the data with sentiment labels
df = pd.read_csv('../data/news_with_sentiment.csv')

# Check the dataset
print(df[['joined_tokens', 'sentiment']].head())


                                       joined_tokens sentiment
0  dividend stocksplit idfc arc finance vedanta o...   neutral
1  hyundai motor india r crore ipo open th oct pr...   neutral
2  tiger broker yang xu share insight bitcoin etf...  positive
3            top german dividend stock watch october  positive
4  oil price steady sliding potential israelhezbo...   neutral


In [3]:
from sklearn.model_selection import train_test_split

# Features (text data) and labels (sentiment)
X = df['joined_tokens']
y = df['sentiment']

# Train-test split (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")


Training set size: 79, Test set size: 20


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TF-IDF Vectorizer (limiting to the top 500 features for simplicity)
vectorizer = TfidfVectorizer(max_features=500)

# Fit and transform the training data, and transform the test data
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Print the size of the transformed feature sets
print(X_train_tfidf.shape, X_test_tfidf.shape)


(79, 418) (20, 418)


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
model = LogisticRegression()

# Train the model using the training data
model.fit(X_train_tfidf, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test_tfidf)

# Evaluate the model performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.5
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         4
     neutral       0.36      0.83      0.50         6
    positive       0.83      0.50      0.62        10

    accuracy                           0.50        20
   macro avg       0.40      0.44      0.38        20
weighted avg       0.52      0.50      0.46        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:
import joblib

# Save the model
joblib.dump(model, 'sentiment_model.pkl')
# Save the vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']