In [1]:
import pandas as pd

In [2]:
# Read the data from the CSV file
df = pd.read_csv(r'c:\Users\Victus\Desktop\AI Email Assistant\data\twcs\cleaned_twcs.csv', encoding='utf-8')
df.head()

Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id,cleaned_text
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0,understand would like assist would need privat...
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0,sprintcar
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0,sprintcar send sever privat messag one respond...
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0,send messag assist just click top profil
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0,sprintcar


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39675 entries, 0 to 39674
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tweet_id                 39675 non-null  int64  
 1   author_id                39675 non-null  object 
 2   inbound                  39675 non-null  bool   
 3   created_at               39675 non-null  object 
 4   text                     39675 non-null  object 
 5   response_tweet_id        26619 non-null  object 
 6   in_response_to_tweet_id  29462 non-null  float64
 7   cleaned_text             39289 non-null  object 
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 2.2+ MB


In [4]:
df['author_id'].value_counts().reset_index()

Unnamed: 0,author_id,count
0,AmazonHelp,2641
1,AppleSupport,1187
2,Uber_Support,694
3,ChipotleTweets,684
4,British_Airways,526
...,...,...
11038,120523,1
11039,120522,1
11040,120520,1
11041,120519,1


In [5]:
df = df.drop(['text', 'tweet_id', 'created_at', 'response_tweet_id', 'in_response_to_tweet_id'], axis=1)

In [6]:
print(df.head())

    author_id  inbound                                       cleaned_text
0  sprintcare    False  understand would like assist would need privat...
1      115712     True                                          sprintcar
2      115712     True  sprintcar send sever privat messag one respond...
3  sprintcare    False           send messag assist just click top profil
4      115712     True                                          sprintcar


In [7]:
df['cleaned_text'] = df['cleaned_text'].fillna('')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39675 entries, 0 to 39674
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   author_id     39675 non-null  object
 1   inbound       39675 non-null  bool  
 2   cleaned_text  39675 non-null  object
dtypes: bool(1), object(2)
memory usage: 658.8+ KB


## Feature Extraction

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

df = df[df["inbound"] == True]

vectorizer = CountVectorizer(max_features=500, stop_words="english")  # 5000 en sık geçen kelimeyi al
X = vectorizer.fit_transform(df["cleaned_text"])  # Metni sayısal forma çevir

In [10]:
from sklearn.decomposition import LatentDirichletAllocation

# LDA modelini oluştur ve eğit
num_topics = 5  # Çıkarılacak konu sayısı (istersen artırabiliriz)
lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
lda.fit(X)

In [11]:
# En sık geçen kelimeleri içeren konuları göster
def print_topics(model, vectorizer, top_n=10):
    words = vectorizer.get_feature_names_out()
    for idx, topic in enumerate(model.components_):
        print(f"📌 **Konu {idx+1}:** ", [words[i] for i in topic.argsort()[:-top_n - 1:-1]])

print_topics(lda, vectorizer)

📌 **Konu 1:**  ['servic', 'custom', 'email', 'say', 'number', 'internet', 'askspectrum', 'phone', 'comcastcar', 'receiv']
📌 **Konu 2:**  ['flight', 'britishairway', 'time', 'delta', 'americanair', 'day', 'southwestair', 'thank', 'im', 'virgintrain']
📌 **Konu 3:**  ['need', 'ubersupport', 'amazon', 'order', 'charg', 'want', 'dont', 'account', 'cancel', 'ideacar']
📌 **Konu 4:**  ['thank', 'tesco', 'o2', 'sainsburi', 'airasiasupport', 'marksandspenc', 'check', 'plea', 'morrison', 'xboxsupport']
📌 **Konu 5:**  ['applesupport', 'chipotletweet', 'updat', 'work', 'app', 'new', 'phone', 'tri', 'fix', 'iphon']


In [12]:
import numpy as np

# Her e-posta için en olası konuyu belirle
topic_results = lda.transform(X)
df["topic"] = np.argmax(topic_results, axis=1)

# İlk 10 satırı göster
print(df[["cleaned_text", "topic"]].head(10))

                                         cleaned_text  topic
1                                           sprintcar      0
2   sprintcar send sever privat messag one respond...      0
4                                           sprintcar      0
6                         sprintcar bad custom servic      0
8      sprintcar you gonna chang connect whole famili      4
10                          sprintcar sinc sign day 1      0
12  115714 y’all lie connect 5 lte still won’t loa...      0
14  115714 whenev contact custom support tell enab...      0
16  askspectrum would like email copi one sinc spe...      0
18    askspectrum receiv corpor offic would like copi      0


In [13]:
topic_labels = {
    0: "General Support",
    1: "Fly / Airline Issues",
    2: "Order / Payment Issues",
    3: "Retail / Grocery Store Complaints",
    4: "Tech Support"
}

# Konu numaralarını anlamlı etiketlere çevir
df["topic_label"] = df["topic"].map(topic_labels)

# İlk 10 satırı göster
print(df[["cleaned_text", "topic_label"]].head(10))

                                         cleaned_text      topic_label
1                                           sprintcar  General Support
2   sprintcar send sever privat messag one respond...  General Support
4                                           sprintcar  General Support
6                         sprintcar bad custom servic  General Support
8      sprintcar you gonna chang connect whole famili     Tech Support
10                          sprintcar sinc sign day 1  General Support
12  115714 y’all lie connect 5 lte still won’t loa...  General Support
14  115714 whenev contact custom support tell enab...  General Support
16  askspectrum would like email copi one sinc spe...  General Support
18    askspectrum receiv corpor offic would like copi  General Support


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Veri kümesini temizleyelim
df = df.dropna(subset=["cleaned_text", "topic_label"])  # Boş satırları sil

# X (metin) ve Y (etiket) olarak ayır
X = df["cleaned_text"]
y = df["topic_label"]

# TF-IDF ile metni vektörleştir
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X)


import joblib

# Eğitilmiş TF-IDF vektörleştiriciyi kaydet
joblib.dump(vectorizer, r'C:\Users\Victus\Desktop\AI Email Assistant\models\tfidf_vectorizer.pkl')

# Eğitim ve test setlerine ayır
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

print("Veri başarıyla hazırlandı! 🚀")
print("Eğitim kümesi boyutu:", X_train.shape)
print("Test kümesi boyutu:", X_test.shape)


Veri başarıyla hazırlandı! 🚀
Eğitim kümesi boyutu: (17382, 2570)
Test kümesi boyutu: (4346, 2570)


In [15]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

# Encode the labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

joblib.dump(label_encoder, r'C:\Users\Victus\Desktop\AI Email Assistant\models\label_encoder.pkl')

model_xgb = XGBClassifier(colsample_bytree=0.8, learning_rate=0.1, max_depth=9, n_estimators=3, use_label_encoder=False, eval_metric="mlogloss")
model_xgb.fit(X_train, y_train_encoded)

y_pred_xgb = model_xgb.predict(X_test)

print("📊 XGBoost Doğruluk Skoru:", accuracy_score(y_test_encoded, y_pred_xgb))
print(classification_report(y_test_encoded, y_pred_xgb, target_names=label_encoder.classes_))


Parameters: { "use_label_encoder" } are not used.



📊 XGBoost Doğruluk Skoru: 0.655085135757018
                                   precision    recall  f1-score   support

             Fly / Airline Issues       0.84      0.57      0.68       859
                  General Support       0.47      0.82      0.60      1011
           Order / Payment Issues       0.75      0.54      0.63       965
Retail / Grocery Store Complaints       0.68      0.61      0.64       518
                     Tech Support       0.82      0.69      0.75       993

                         accuracy                           0.66      4346
                        macro avg       0.71      0.65      0.66      4346
                     weighted avg       0.71      0.66      0.66      4346



In [16]:
# from sklearn.model_selection import GridSearchCV
# from xgboost import XGBClassifier

# # XGBoost modelini oluştur
# model_xgb = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss")

# # Denenecek parametreler
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 6, 9],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0]
# }

# # GridSearchCV ile hiperparametre optimizasyonu
# grid_search = GridSearchCV(estimator=model_xgb, param_grid=param_grid, 
#                            scoring='accuracy', cv=3, verbose=1, n_jobs=-1)

# # Eğitim verisiyle hiperparametre optimizasyonu yap
# grid_search.fit(X_train, y_train_encoded)



# # En iyi parametreleri ve doğruluk skorunu yazdır
# print("En iyi parametreler:", grid_search.best_params_)
# print("En iyi doğruluk skoru:", grid_search.best_score_)

# # En iyi model ile tahmin yap
# best_model = grid_search.best_estimator_
# y_pred_best = best_model.predict(X_test)

# # Sonuçları yazdır
# print("📊 İyileştirilmiş XGBoost Doğruluk Skoru:", accuracy_score(y_test_encoded, y_pred_best))
# print(classification_report(y_test_encoded, y_pred_best, target_names=label_encoder.classes_))


In [17]:
import joblib
joblib.dump(model_xgb, 'xgboost_email_classifier.pkl')

['xgboost_email_classifier.pkl']

In [18]:
import joblib

# Modeli yükle
model = joblib.load('xgboost_email_classifier.pkl')


In [19]:
import nltk
nltk.download('punkt')

# Tokenizasyon işlemi
def tokenize(text):
    return nltk.word_tokenize(text)


def preprocess_with_tokenization(text):
    # Temizleme (örneğin, özel karakterleri silme, küçük harfe çevirme, vs.)
    text = text.lower()
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    
    # Tokenizasyon işlemi
    tokens = tokenize(text)
    
    # Stopwords temizleme (isteğe bağlı)
    stopwords = nltk.corpus.stopwords.words('english')
    tokens = [word for word in tokens if word not in stopwords]
    
    # Sonuç
    return " ".join(tokens)

# Örnek kullanım
cleaned_email = preprocess_with_tokenization("I have an issue with my flight booking and need assistance.")
print(cleaned_email)


issue flight booking need assistance


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Victus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Yeni gelen e-posta mesajı (örnek)
new_email = "I have an issue with my flight and I need help. Issue is my check-up service is suck."

# Vektörleştirme (same transformer used during training)
cleaned_email_vectorized = vectorizer.transform([cleaned_email])

# Model ile tahmin yap
predicted_class = model.predict(cleaned_email_vectorized)

# Tahminin sonucu
predicted_label = label_encoder.inverse_transform(predicted_class)
print(f"Predicted Class: {predicted_label[0]}")

Predicted Class: Order / Payment Issues
