In [4]:
import warnings

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import make_scorer, accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import LabelEncoder


In [5]:
# Download the NLTK 'stopwords' corpus.
# The stopwords.words(...) calls in a later cell look up their word lists by
# language name; presumably they read from this corpus — the download is a
# prerequisite on a fresh environment.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chloelam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Collect one stop-word collection per language; they are merged in a later cell.
# English comes from scikit-learn, French/German/Spanish from the NLTK corpus.
stop_words_en = text.ENGLISH_STOP_WORDS
stop_words_fr, stop_words_de, stop_words_es = (
    stopwords.words(language_name)
    for language_name in ('french', 'german', 'spanish')
)

# Chinese, Japanese and Korean are empty placeholders for now: no stop words
# are actually removed for these languages until real lists are supplied
# (manually or from an external library).
stop_words_zh, stop_words_ja, stop_words_ko = set(), set(), set()

In [7]:
# Merge every per-language collection into one de-duplicated list.
# The three CJK sets are currently empty, so they contribute nothing yet.
stop_words_combined = list(
    set(stop_words_en).union(
        stop_words_fr,
        stop_words_de,
        stop_words_es,
        stop_words_zh,
        stop_words_ja,
        stop_words_ko,
    )
)


In [8]:
# Load the ticket data set (ticket fields plus the sentiment columns used below).
df_fullsent = pd.read_csv('fulldata_sentiment_analysis_results.csv')

# Preview a random sample of rows. The fixed seed keeps the displayed rows the
# same on every Restart & Run All instead of changing with each execution.
df_fullsent.sample(20, random_state=42)

Unnamed: 0,queue,priority,language,subcategory,subject,text,sentiment,sentiment_label,sentiment_score
5353,ACCOUNTING,HIGH,DE,Employee Inquiries::Accounting,Dringend – Einzugsermächtigung korrigieren,"Sehr geehrtes Team, bitte korrigieren Sie mein...","{'label': 'neutral', 'score': 0.583605170249939}",neutral,0.583605
452,SOFTWARE,MEDIUM,ES,Messaging Apps,Actualización de Versión de Aplicación de Mens...,"Hola equipo de soporte, he observado algunos e...","{'label': 'neutral', 'score': 0.7921431064605713}",neutral,0.792143
63,HARDWARE,MEDIUM,EN,Smart-Kinderwagen,Issue with Smart Kinderwagen,The Smart-Kinderwagen display has freezed. Reb...,"{'label': 'negative', 'score': 0.8509384989738...",negative,0.850938
8353,HARDWARE,LOW,ES,Smart-Luftentfeuchter,Detalles de la Factura - Cambio de nombre de e...,Quería saber si es posible actualizar la factu...,"{'label': 'neutral', 'score': 0.8107450604438782}",neutral,0.810745
1725,ACCOUNTING,MEDIUM,ES,Employee Inquiries::IT Support::Network Issues,Consulta sobre actualización de software y err...,"Hola equipo de soporte, recientemente he notad...","{'label': 'neutral', 'score': 0.7304739356040955}",neutral,0.730474
409,ACCOUNTING,HIGH,EN,Employee Inquiries::Legal Inquiries::Labor Law,Urgent: Legal Inquiry Regarding Labor Law,"Dear Support, I urgently need to speak to some...","{'label': 'neutral', 'score': 0.5330340266227722}",neutral,0.533034
1696,HARDWARE,LOW,FR,360-Grad-Kamera,Demande de mise à jour - Caméra 360 degrés,"Bonjour, j'aimerais savoir si une mise à jour ...","{'label': 'neutral', 'score': 0.7224858403205872}",neutral,0.722486
6399,SOFTWARE,MEDIUM,EN,Illustrator 2022,Illustrator 2022 - Upgrade Inquiry,"Hi there, I'm currently using Adobe Illustrato...","{'label': 'neutral', 'score': 0.6901986002922058}",neutral,0.690199
1947,SOFTWARE,HIGH,EN,Data Privacy,URGENT: Can't Sign Into Data Privacy Account,My Data Privacy account appears to be compromi...,"{'label': 'negative', 'score': 0.8848888278007...",negative,0.884889
3099,HARDWARE,HIGH,EN,DualSense Wireless Controller,Help! DualSense Controller won't turn on,My DualSense Wireless Controller won't turn on...,"{'label': 'negative', 'score': 0.8186317682266...",negative,0.818632


In [9]:
# Turn the raw ticket text into TF-IDF features, dropping the combined
# multilingual stop words and keeping at most 1000 features.
# NOTE(review): the vectorizer is fitted on every row here, i.e. before the
# train/test split done in a later cell — confirm this is acceptable before
# reporting any held-out metrics.
vectorizer = TfidfVectorizer(
    max_features=1000,
    stop_words=stop_words_combined,
)
X = vectorizer.fit_transform(df_fullsent['text'])

In [10]:
# Encode the targets.
# Each categorical target gets its own LabelEncoder so that predictions can be
# mapped back to the original labels later with inverse_transform.
le_queue, le_priority, le_language, le_sentiment_label = (
    LabelEncoder() for _ in range(4)
)

y_queue = le_queue.fit_transform(df_fullsent['queue'])
y_priority = le_priority.fit_transform(df_fullsent['priority'])
y_language = le_language.fit_transform(df_fullsent['language'])
y_sentiment_label = le_sentiment_label.fit_transform(df_fullsent['sentiment_label'])

# The sentiment score is a numeric (regression) target and is used as-is.
y_sentiment_score = df_fullsent['sentiment_score']

# Classification targets side by side; the column order here is the order in
# which the multi-output classifier returns its predictions.
y_classification = pd.DataFrame(dict(zip(
    ['queue', 'priority', 'language', 'sentiment_label'],
    [y_queue, y_priority, y_language, y_sentiment_label],
)))


In [11]:
# Hold out 20% of the rows for testing. Features, classification targets and
# the regression target are split in a single call so their rows stay aligned.
(
    X_train, X_test,
    y_train_classification, y_test_classification,
    y_train_score, y_test_score,
) = train_test_split(
    X,
    y_classification,
    y_sentiment_score,
    test_size=0.2,
    random_state=42,
)

In [12]:
# Train the classification model: MultiOutputClassifier wraps a seeded random
# forest and is fitted on all classification target columns at once.
classifier = MultiOutputClassifier(
    estimator=RandomForestClassifier(random_state=42)
)
classifier.fit(X_train, y_train_classification)

In [13]:
# Train the regression model: a seeded random forest that predicts the numeric
# sentiment_score from the same TF-IDF features.
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X=X_train, y=y_train_score)

In [14]:
# Function to apply the trained model to new data and include new text data in the DataFrame
def apply_model_with_text(new_text_data, vectorizer, classifier, regressor, le_queue, le_priority, le_language, le_sentiment_label):
    """Predict ticket attributes for new texts and return them next to the texts.

    Parameters
    ----------
    new_text_data : str or iterable of str
        Raw text(s) to score. A single string is treated as one document.
        Any other iterable (list, generator, pandas Series with any index) is
        materialised into a list first, so the output rows always follow the
        input order.
    vectorizer : object with ``transform(texts)``
        Fitted text vectorizer producing the feature matrix.
    classifier : object with ``predict(X)``
        Fitted multi-output classifier; its prediction columns are read in the
        order queue, priority, language, sentiment_label.
    regressor : object with ``predict(X)``
        Fitted regressor returning one sentiment score per row.
    le_queue, le_priority, le_language, le_sentiment_label : objects with ``inverse_transform``
        Fitted encoders used to map predicted codes back to original labels.

    Returns
    -------
    pandas.DataFrame
        One row per input text with the columns ``queue``, ``priority``,
        ``language``, ``sentiment_label``, ``sentiment_score`` and ``text``.
        Empty input yields an empty frame with the same columns.

    Warns
    -----
    UserWarning
        If the feature matrix exposes ``getnnz`` and some rows have no non-zero
        feature, i.e. the text shares no token with the vectorizer's
        vocabulary. Predictions for such rows cannot depend on their content.
    """
    label_columns = ['queue', 'priority', 'language', 'sentiment_label']

    # Materialise the input once. Assigning a pandas Series as a column aligns
    # on its index, which would misalign or blank the 'text' column when the
    # Series does not use a default 0..n-1 index; a plain list avoids that and
    # also lets generators be consumed safely.
    if isinstance(new_text_data, str):
        texts = [new_text_data]
    else:
        texts = list(new_text_data)

    # Nothing to predict: return an empty frame with the usual columns instead
    # of passing zero samples to the estimators.
    if not texts:
        return pd.DataFrame(columns=label_columns + ['sentiment_score', 'text'])

    # Preprocess the new text data
    X_new = vectorizer.transform(texts)

    # Flag texts that produced an all-zero feature row (e.g. a language the
    # vectorizer has never seen): the models still return a prediction for
    # them, but it carries no information about the text.
    if hasattr(X_new, 'getnnz'):
        empty_rows = [pos for pos, nnz in enumerate(X_new.getnnz(axis=1)) if nnz == 0]
        if empty_rows:
            warnings.warn(
                f"{len(empty_rows)} of {len(texts)} text(s) contain no token from the "
                f"vectorizer's vocabulary (row positions {empty_rows}); predictions "
                "for these rows do not depend on their content.",
                UserWarning,
                stacklevel=2,
            )

    # Make predictions for classification targets and the regression target
    y_pred_classification = classifier.predict(X_new)
    y_pred_score = regressor.predict(X_new)

    # Convert predictions to DataFrame
    y_pred_classification_df = pd.DataFrame(y_pred_classification, columns=label_columns)
    y_pred_classification_df['sentiment_score'] = y_pred_score

    # Inverse transform the predictions to get the original labels
    encoders = {
        'queue': le_queue,
        'priority': le_priority,
        'language': le_language,
        'sentiment_label': le_sentiment_label,
    }
    for column, encoder in encoders.items():
        y_pred_classification_df[column] = encoder.inverse_transform(y_pred_classification_df[column])

    # Include the new text data in the DataFrame (positional, never index-aligned)
    y_pred_classification_df['text'] = texts

    return y_pred_classification_df


In [15]:
# Sample tickets to score: five English texts followed by one each in
# Chinese, Japanese and Korean.
# NOTE(review): in the displayed output the three CJK rows receive identical
# predictions (including language 'EN'); presumably the vectorizer has no
# vocabulary for these texts — verify before relying on those rows.
new_text_data = [
    "Hello, I'm having trouble with my DVD player. It won't turn on. Please help!",
    "Dear support team, the software update caused issues with my CRM system.",
    "My Intel NUC is not booting up after the latest update.",
    "Urgent: CyberLink PowerDirector 19 won't launch after installation.",
    "Problem with Apple App Store Connect - security settings need to be updated.",
    "你好，我的DVD播放器有问题。 它无法启动。 请帮忙！",  # Chinese
    "こんにちは、DVDプレーヤーの調子が悪いです。 電源が入りません。 助けてください！",  # Japanese
    "안녕하세요, 제 DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!"  # Korean
]

# Run the fitted vectorizer, models and encoders over the sample texts; the
# returned frame holds the decoded predictions plus the original text.
predictions_with_text = apply_model_with_text(
    new_text_data=new_text_data,
    vectorizer=vectorizer,
    classifier=classifier,
    regressor=regressor,
    le_queue=le_queue,
    le_priority=le_priority,
    le_language=le_language,
    le_sentiment_label=le_sentiment_label,
)

# Show the result as the cell output
predictions_with_text

Unnamed: 0,queue,priority,language,sentiment_label,sentiment_score,text
0,HARDWARE,HIGH,EN,negative,0.860114,"Hello, I'm having trouble with my DVD player. ..."
1,ACCOUNTING,MEDIUM,EN,neutral,0.776123,"Dear support team, the software update caused ..."
2,HARDWARE,MEDIUM,EN,neutral,0.828454,My Intel NUC is not booting up after the lates...
3,SOFTWARE,HIGH,EN,negative,0.846186,Urgent: CyberLink PowerDirector 19 won't launc...
4,HARDWARE,HIGH,EN,negative,0.854888,Problem with Apple App Store Connect - securit...
5,ACCOUNTING,LOW,EN,neutral,0.819955,你好，我的DVD播放器有问题。 它无法启动。 请帮忙！
6,ACCOUNTING,LOW,EN,neutral,0.819955,こんにちは、DVDプレーヤーの調子が悪いです。 電源が入りません。 助けてください！
7,ACCOUNTING,LOW,EN,neutral,0.819955,"안녕하세요, 제 DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!"
