In [1]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [3]:
import os

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

    



In [4]:
# Location of the pre-cleaned dataset.
# NOTE(review): absolute, machine-specific Windows path — adjust when running elsewhere.
file_path = r'C:\sentiment_analysis\data\clean_data.csv'

# Read the CSV and discard every row whose 'cleaned_text' value is missing.
df = pd.read_csv(file_path).dropna(subset=['cleaned_text'])

In [5]:
# Target labels come from the 'sentiment' column; the features are the
# 'cleaned_text' column as-is (no vectorization happens in this cell).
y = df['sentiment']
X = df['cleaned_text']

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [6]:
# TF-IDF encoder limited to at most 5000 features.
vectorizer = TfidfVectorizer(
    max_features=5000,
)

# Fit the encoder on the training text only, then apply the already-fitted
# encoder to the test text so both matrices share the same feature columns.
X_trainVec = vectorizer.fit_transform(X_train)
X_testVec = vectorizer.transform(X_test)

In [7]:

# --- Model training ---
# Logistic regression with scikit-learn's default settings, fitted on the
# vectorized training text and its labels.
model = LogisticRegression()
model.fit(X_trainVec, y_train)

# --- Model evaluation ---
# Predict labels for the vectorized test text, then print the per-class
# metrics followed by the confusion matrix.
y_pred = model.predict(X_testVec)

report_text = classification_report(y_test, y_pred)
print("Classification Report:")
print(report_text)

matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(matrix)


Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.75      0.77    149085
           4       0.76      0.79      0.77    146730

    accuracy                           0.77    295815
   macro avg       0.77      0.77      0.77    295815
weighted avg       0.77      0.77      0.77    295815

Confusion Matrix:
[[112056  37029]
 [ 31176 115554]]


In [8]:
# Persist the fitted classifier together with the vectorizer that produced its
# input features, so both can be reloaded as a pair later.
# NOTE(review): absolute, machine-specific Windows paths — adjust when running elsewhere.
model_path = r"C:\sentiment_analysis\models\sentiment_model.pkl"
vectorizer_path = r"C:\sentiment_analysis\models\vectorizer.pkl"

# joblib.dump opens the target file directly and fails with FileNotFoundError
# when the parent directory is missing, so create it up front. The emptiness
# guard avoids calling os.makedirs('') when the path has no directory part.
for target_path in (model_path, vectorizer_path):
    parent_dir = os.path.dirname(target_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

joblib.dump(model, model_path)
# Kept as the final expression so the cell still displays the written path.
joblib.dump(vectorizer, vectorizer_path)


['C:\\sentiment_analysis\\models\\vectorizer.pkl']

In [9]:

# Accuracy of the test-set predictions against the true test labels.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

Accuracy: 0.7694335987018914
