In [114]:
import re
import warnings
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    f1_score,
    recall_score,
    precision_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
)
warnings.filterwarnings("ignore")
pd.set_option('display.max_colwidth', None)

In [115]:
df = pd.read_csv('/kaggle/input/customer-support-on-twitter/twcs/twcs.csv')
df.sample()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
1016713,1125946,Safaricom_Care,False,Sat Oct 14 14:33:31 +0000 2017,@385483 We will respond via DM shortly.^WO,,1125947.0


In [116]:
df['text'] = df['text'].fillna('').astype(str)
tmobile_mentions = df[df['text'].str.match(r'^\s*@TMobileHelp', case=False)]
tmobile_mentions['text'].sample()

268914    @TMobileHelp Why does it repeatedly emphasize the req’t that you need a TMobile ONE plan https://t.co/sTDx8CS3FA
Name: text, dtype: object

In [117]:
def preprocess(text):
    if pd.isnull(text):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)                 
    text = re.sub(r"@tmobilehelp", "", text)            
    text = re.sub(r"tmobilehelp", "", text)              
    text = re.sub(r"[^a-zA-Z0-9\s!?']", "", text)         
    text = re.sub(r"\s+", " ", text).strip()             
    return text

tmobile_mentions['text'] = tmobile_mentions['text'].apply(preprocess)

In [118]:
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    if pd.isnull(text) or not isinstance(text, str) or text.strip() == "":
        return 0.0
    return sid.polarity_scores(text)['compound']

tmobile_mentions['sentiment_score'] = tmobile_mentions['text'].apply(get_sentiment_score)

tmobile_mentions['churn_label'] = tmobile_mentions['sentiment_score'].apply(lambda x: 1 if x < 0.0 else 0)

In [119]:
def custom_churn_rule(text, score):
    churn_keywords = ["cancel", "quit", "leave", "not worth", "switch", "no longer", "done", "won’t come back", "don’t want me back"]
    if score < -0.05:
        return 1
    elif any(kw in text.lower() for kw in churn_keywords):
        return 1
    return 0

tmobile_mentions['churn_label'] = tmobile_mentions.apply(lambda row: custom_churn_rule(row['text'], row['sentiment_score']), axis=1)

In [120]:
tmobile_mentions = tmobile_mentions[['text', 'sentiment_score', 'churn_label']]
tmobile_mentions = tmobile_mentions[tmobile_mentions['text'].str.strip() != '']

In [121]:
tmobile_mentions.sample()

Unnamed: 0,text,sentiment_score,churn_label
1050149,i see how this jump program works its way for more money and i cant use at all if i dont want to keep my device,-0.0572,1


In [122]:
tmobile_mentions['churn_label'].value_counts()

churn_label
0    12388
1     5162
Name: count, dtype: int64

In [123]:
X_train, X_test, y_train, y_test = train_test_split(
    tmobile_mentions['text'], tmobile_mentions['churn_label'], test_size=0.1, random_state=42, stratify=tmobile_mentions['churn_label']
)


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=11000 , stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, class_weight='balanced'))
])
    
pipeline.fit(X_train, y_train)
    
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)

acc_train = accuracy_score(y_train, y_train_pred)
acc_test = accuracy_score(y_test, y_test_pred)

print(f"Train Acc: {acc_train:.4f} | Test Acc: {acc_test:.4f}")

Train Acc: 0.8736 | Test Acc: 0.8011


In [124]:
print(classification_report(y_test, y_test_pred, digits=4))
print(confusion_matrix(y_test, y_test_pred))

              precision    recall  f1-score   support

           0     0.8890    0.8208    0.8535      1239
           1     0.6367    0.7539    0.6903       516

    accuracy                         0.8011      1755
   macro avg     0.7628    0.7873    0.7719      1755
weighted avg     0.8148    0.8011    0.8056      1755

[[1017  222]
 [ 127  389]]


In [125]:
param_grid = {
    'clf__C': [0.01, 0.1, 1, 10],
    'clf__penalty': ['l2'],
    'clf__solver': ['liblinear', 'lbfgs'],
    'clf__class_weight': [None, 'balanced'],
    'clf__max_iter': [500, 1000]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1', verbose=2, n_jobs=-1)
grid.fit(tmobile_mentions['text'], tmobile_mentions['churn_label'])

print("Best parameters:", grid.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits
Best parameters: {'clf__C': 10, 'clf__class_weight': 'balanced', 'clf__max_iter': 500, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}


In [126]:
best_model = grid.best_estimator_

y_train_pred = best_model.predict(X_train)
y_test_pred = best_model.predict(X_test)

print("Train Accuracy: ", accuracy_score(y_train, y_train_pred))
print("Test Accuracy:  ", accuracy_score(y_test, y_test_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_test_pred))
print(confusion_matrix(y_test, y_test_pred))

Train Accuracy:  0.9248496359607471
Test Accuracy:   0.9225071225071225

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.93      0.94      1239
           1       0.84      0.91      0.87       516

    accuracy                           0.92      1755
   macro avg       0.90      0.92      0.91      1755
weighted avg       0.93      0.92      0.92      1755

[[1151   88]
 [  48  468]]


In [131]:
unseen_texts = [
    "Too many hidden fees. I feel misled.",
    "I've had enough. Canceling my plan today.",
    "Great service every time I call, thank you!",
    "You keep charging me extra for no reason.",
    "Love the new upgrade! Totally worth it.",
    "Still no resolution after 3 weeks of complaints.",
    "I appreciate the quick fix on my billing issue.",
    "My calls keep dropping. I'm switching to another carrier.",
    "This is the best customer service I’ve ever had.",
    "I waited an hour and no one picked up. Terrible experience.",
    "Thanks for helping me with my SIM issue!",
    "Why is my data speed so slow these days?",
    "Absolutely painless porting process. Loved it!",
    "I’m done being ignored. Time to change provider.",
    "No problems at all. Everything just works.",
]

predictions = best_model.predict(unseen_texts)
probs = best_model.predict_proba(unseen_texts)

for text, label, prob in zip(unseen_texts, predictions, probs):
    print(f"Text: {text}\n→ Predicted Churn: {label} (Confidence: {max(prob):.2f})")

Text: Too many hidden fees. I feel misled.
→ Predicted Churn: 1 (Confidence: 0.89)
Text: I've had enough. Canceling my plan today.
→ Predicted Churn: 1 (Confidence: 0.95)
Text: Great service every time I call, thank you!
→ Predicted Churn: 0 (Confidence: 0.97)
Text: You keep charging me extra for no reason.
→ Predicted Churn: 0 (Confidence: 0.56)
Text: Love the new upgrade! Totally worth it.
→ Predicted Churn: 0 (Confidence: 0.85)
Text: Still no resolution after 3 weeks of complaints.
→ Predicted Churn: 1 (Confidence: 0.97)
Text: I appreciate the quick fix on my billing issue.
→ Predicted Churn: 0 (Confidence: 0.96)
Text: My calls keep dropping. I'm switching to another carrier.
→ Predicted Churn: 1 (Confidence: 1.00)
Text: This is the best customer service I’ve ever had.
→ Predicted Churn: 0 (Confidence: 0.98)
Text: I waited an hour and no one picked up. Terrible experience.
→ Predicted Churn: 1 (Confidence: 0.95)
Text: Thanks for helping me with my SIM issue!
→ Predicted Churn: 0 (Co

In [128]:
import joblib
joblib.dump(best_model, "ChurnFeedbackCLF.pkl")

['ChurnFeedbackCLF.pkl']