In [3]:
# install dependencies (quietly)
!pip install scikit-learn pandas --quiet

# 1. Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 2. Read the labelled telecom comments from disk
df = pd.read_csv('/content/telecom_comments_sentiment.csv')

# Replace missing comments with empty strings; the TF-IDF step below is fed
# this column directly and NaN entries were reported to raise a ValueError.
text_column = df['clean_text']
df['clean_text'] = text_column.fillna('')

# 3. Features are the cleaned text, targets are the sentiment labels
#    (negative / neutral / positive)
X, y = df['clean_text'], df['sentiment']

# 4. Hold out 20% of the rows for testing; `stratify=y` keeps the label
#    proportions the same in both splits, and the fixed seed makes the
#    split repeatable.
split = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

# 5. TF-IDF over unigrams and bigrams, capped at 5000 features.
#    The vectorizer is fitted on the training split only and then reused,
#    unchanged, to transform the test split.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# 6. Fit the baseline logistic-regression classifier
#    (`fit` returns the estimator itself, so the call can be chained)
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# 7. Score the held-out split: per-class metrics first, then the raw counts
y_pred = clf.predict(X_test_vec)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)
print("\nClassification Report:\n")
print(report)
print("\nConfusion Matrix:\n")
print(matrix)

# 8. Persist the fitted classifier and vectorizer (optional)
for artifact, artifact_path in (
    (clf, '/content/baseline_model.pkl'),
    (vectorizer, '/content/tfidf_vectorizer.pkl'),
):
    joblib.dump(artifact, artifact_path)
print("\nModel and vectorizer saved.")



Classification Report:

              precision    recall  f1-score   support

    negative       0.75      0.12      0.21        25
     neutral       0.70      0.79      0.74        70
    positive       0.72      0.83      0.77        76

    accuracy                           0.71       171
   macro avg       0.72      0.58      0.57       171
weighted avg       0.71      0.71      0.67       171


Confusion Matrix:

[[ 3 11 11]
 [ 1 55 14]
 [ 0 13 63]]

Model and vectorizer saved.


# Install Needed Libraries

In [4]:
!pip install imbalanced-learn --quiet


# Oversample Negatives

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Retrain the TF-IDF + logistic-regression model on a class-balanced
# training set produced by random oversampling, then re-evaluate it.
#
# NOTE: this cell expects `X_train`, `y_train`, `X_test` and `y_test` to
# already exist in the kernel, and it rebinds `vectorizer`, `X_train_vec`,
# `X_test_vec`, `clf` and `y_pred` to the balanced-model versions.

# 1. Vectorize the training texts. This is done as a separate step, before
#    resampling, because the oversampler below is applied to the numeric
#    TF-IDF matrix rather than to the raw strings.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)

# 2. Oversample the training matrix only (fixed seed for repeatability).
#    The test split is never resampled.
ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train_vec, y_train)

# 3. Train logistic regression on the resampled data
clf = LogisticRegression(max_iter=1000)
clf.fit(X_resampled, y_resampled)

# 4. Evaluate on the test split, transformed with the vectorizer fitted above
X_test_vec = vectorizer.transform(X_test)
y_pred = clf.predict(X_test_vec)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

    negative       0.50      0.52      0.51        25
     neutral       0.73      0.76      0.74        70
    positive       0.78      0.74      0.76        76

    accuracy                           0.71       171
   macro avg       0.67      0.67      0.67       171
weighted avg       0.72      0.71      0.71       171

[[13  6  6]
 [ 7 53 10]
 [ 6 14 56]]


# Improved

In [8]:
import os
import joblib

# Ensure the output folder is present (no error if it already exists)
os.makedirs('/content/models', exist_ok=True)

# Write each fitted object (whatever `clf` and `vectorizer` are currently
# bound to in the kernel) to its own file
for fitted_obj, target_path in [
    (clf, '/content/models/model_balanced.pkl'),
    (vectorizer, '/content/models/vectorizer_balanced.pkl'),
]:
    joblib.dump(fitted_obj, target_path)

print("✅ Model and vectorizer saved successfully.")


✅ Model and vectorizer saved successfully.
