In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import classification_report, mean_squared_error
import numpy as np
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction import text

In [2]:
# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chloelam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Combine stop words from English, French, German, and Spanish
stop_words_en = text.ENGLISH_STOP_WORDS
stop_words_fr = stopwords.words('french')
stop_words_de = stopwords.words('german')
stop_words_es = stopwords.words('spanish')

In [4]:
# Create a combined list of stop words
stop_words_combined = list(set(stop_words_en).union(stop_words_fr).union(stop_words_de).union(stop_words_es))

In [5]:
df_fullsent = pd.read_csv('fulldata_sentiment_analysis_results.csv')
df_fullsent.sample(20)

Unnamed: 0,queue,priority,language,subcategory,subject,text,sentiment,sentiment_label,sentiment_score
608,HARDWARE,LOW,ES,JBL Quantum Duo Lautsprecher,Cambio de nombre en la factura - JBL Quantum Duo,¿Podrían colocar otro nombre en mi próxima fac...,"{'label': 'neutral', 'score': 0.8550178408622742}",neutral,0.855018
825,HARDWARE,MEDIUM,ES,USB-A-Stick,Problema con mi USB-A-Stick,"Hola equipo de soporte, recientemente compré u...","{'label': 'neutral', 'score': 0.8178384304046631}",neutral,0.817838
2032,SOFTWARE,HIGH,EN,Private Equity Software,Urgent: Private Equity Software Hacked - Immed...,"Dear Support Team,\nIt appears my Private Equi...","{'label': 'negative', 'score': 0.8433585166931...",negative,0.843359
4297,SOFTWARE,HIGH,EN,Contabo,Immediate Assistance Needed – Contabo server n...,"Hello Support Team, my Contabo server doesn't ...","{'label': 'negative', 'score': 0.8527371287345...",negative,0.852737
5869,HARDWARE,MEDIUM,DE,Bluetooth-Adapter,Probleme mit Bluetooth-Adapter,"Hallo Support-Team, ich verwende den bei Ihnen...","{'label': 'neutral', 'score': 0.7101664543151855}",neutral,0.710166
1487,HARDWARE,MEDIUM,ES,Liquid Cooling System,Problemas con el sistema de refrigeración líquida,"Hola equipo de soporte, tengo un pequeño probl...","{'label': 'neutral', 'score': 0.7753890156745911}",neutral,0.775389
4749,HARDWARE,HIGH,ES,JBL Quantum Duo Lautsprecher,Urgente: JBL Quantum Duo no se encienden,"Hola equipo de soporte, Estoy enfrentando un p...","{'label': 'neutral', 'score': 0.6330255270004272}",neutral,0.633026
2741,HARDWARE,HIGH,DE,DualSense Wireless Controller,DualSense Controller nicht einschalten nach Up...,Nach Ihrem letzten Update lässt sich mein Dual...,"{'label': 'positive', 'score': 0.5229312777519...",positive,0.522931
3256,ACCOUNTING,HIGH,EN,Employee Inquiries::IT Support::Access Rights,Urgent: Cannot Access Payroll System - Locked Out,"Dear IT team, I'm currently locked out of the ...","{'label': 'negative', 'score': 0.8640549778938...",negative,0.864055
5825,HARDWARE,MEDIUM,EN,Microscope,Microscope - Small Issue Noticed,I've noticed a minor issue while using my Micr...,"{'label': 'negative', 'score': 0.7271267175674...",negative,0.727127


In [6]:
# Preprocessing text data
vectorizer = TfidfVectorizer(stop_words=stop_words_combined, max_features=1000)
X = vectorizer.fit_transform(df_fullsent['text'])

In [7]:
# Encoding target variables
le_queue = LabelEncoder()
le_priority = LabelEncoder()
le_language = LabelEncoder()
le_sentiment_label = LabelEncoder()

y_queue = le_queue.fit_transform(df_fullsent['queue'])
y_priority = le_priority.fit_transform(df_fullsent['priority'])
y_language = le_language.fit_transform(df_fullsent['language'])
y_sentiment_label = le_sentiment_label.fit_transform(df_fullsent['sentiment_label'])
y_sentiment_score = df_fullsent['sentiment_score']

y_classification = pd.DataFrame({
    'queue': y_queue,
    'priority': y_priority,
    'language': y_language,
    'sentiment_label': y_sentiment_label
})

In [8]:
# Splitting the data
X_train, X_test, y_train_classification, y_test_classification, y_train_score, y_test_score = train_test_split(X, y_classification, y_sentiment_score, test_size=0.2, random_state=42)


In [9]:
# Model Training for classification targets
classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))
classifier.fit(X_train, y_train_classification)

In [10]:
# Model Training for regression target (sentiment_score)
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train_score)

In [12]:
 #Function to apply the trained model to new data and include new text data in the DataFrame
def apply_model_with_text(new_text_data, vectorizer, classifier, regressor, le_queue, le_priority, le_language, le_sentiment_label):
    # Preprocess the new text data
    X_new = vectorizer.transform(new_text_data)
    
    # Make predictions for classification targets
    y_pred_classification = classifier.predict(X_new)
    
    # Make predictions for regression target
    y_pred_score = regressor.predict(X_new)
    
    # Convert predictions to DataFrame
    y_pred_classification_df = pd.DataFrame(y_pred_classification, columns=['queue', 'priority', 'language', 'sentiment_label'])
    y_pred_classification_df['sentiment_score'] = y_pred_score
    
    # Inverse transform the predictions to get the original labels
    y_pred_classification_df['queue'] = le_queue.inverse_transform(y_pred_classification_df['queue'])
    y_pred_classification_df['priority'] = le_priority.inverse_transform(y_pred_classification_df['priority'])
    y_pred_classification_df['language'] = le_language.inverse_transform(y_pred_classification_df['language'])
    y_pred_classification_df['sentiment_label'] = le_sentiment_label.inverse_transform(y_pred_classification_df['sentiment_label'])
    
    # Include the new text data in the DataFrame
    y_pred_classification_df['text'] = new_text_data
    
    return y_pred_classification_df

# Example new text data
new_text_data = [
    "Hello, I'm having trouble with my DVD player. It won't turn on. Please help!",
    "Dear support team, the software update caused issues with my CRM system.",
    "My Intel NUC is not booting up after the latest update.",
    "Urgent: CyberLink PowerDirector 19 won't launch after installation.",
    "Problem with Apple App Store Connect - security settings need to be updated.",
    "Dringend: CyberLink PowerDirector 19 lässt sich nach der Installation nicht starten.",
    "CyberLink PowerDirector 19 ne se lance pas après l'installation.",
    "Urgente: CyberLink PowerDirector 19 no se inicia después de la instalación.",
    "DVDプレーヤーの調子が悪くて困っています。電源が入りません。助けてください！"
    "緊急です： CyberLink PowerDirector 19 がインストール後に起動しない。",
    "안녕하세요, DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!",
    "你好，我的 DVD 播放器有問題。它無法開啟。請幫幫忙！"
]

# Apply the model to new data and include new text data in the DataFrame
predictions_with_text = apply_model_with_text(new_text_data, vectorizer, classifier, regressor, le_queue, le_priority, le_language, le_sentiment_label)

# Display the predictions with text
predictions_with_text

Unnamed: 0,queue,priority,language,sentiment_label,sentiment_score,text
0,HARDWARE,HIGH,EN,negative,0.860114,"Hello, I'm having trouble with my DVD player. ..."
1,ACCOUNTING,MEDIUM,EN,neutral,0.776123,"Dear support team, the software update caused ..."
2,HARDWARE,MEDIUM,EN,neutral,0.828454,My Intel NUC is not booting up after the lates...
3,SOFTWARE,HIGH,EN,negative,0.846186,Urgent: CyberLink PowerDirector 19 won't launc...
4,HARDWARE,HIGH,EN,negative,0.854888,Problem with Apple App Store Connect - securit...
5,ACCOUNTING,HIGH,DE,neutral,0.685293,Dringend: CyberLink PowerDirector 19 lässt sic...
6,ACCOUNTING,LOW,FR,neutral,0.806971,CyberLink PowerDirector 19 ne se lance pas apr...
7,SOFTWARE,HIGH,ES,neutral,0.736822,Urgente: CyberLink PowerDirector 19 no se inic...
8,ACCOUNTING,LOW,EN,neutral,0.819955,DVDプレーヤーの調子が悪くて困っています。電源が入りません。助けてください！緊急です： C...
9,ACCOUNTING,LOW,EN,neutral,0.819955,"안녕하세요, DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!"


In [11]:
from sklearn.metrics import accuracy_score, f1_score, classification_report, mean_squared_error, r2_score

# 1. Predict on the test set for both models
y_pred_classification = classifier.predict(X_test)
y_pred_regression = regressor.predict(X_test)

# 2. Evaluate Classification Model Performance
print("Classification Model Performance:")

# Initialize dictionaries to store the metrics for each target
accuracy_scores = {}
f1_scores = {}

# Calculate accuracy and F1-score for each classification target
for i, column in enumerate(y_test_classification.columns):
    accuracy_scores[column] = accuracy_score(y_test_classification[column], y_pred_classification[:, i])
    f1_scores[column] = f1_score(y_test_classification[column], y_pred_classification[:, i], average='weighted')
    
    print(f"Performance for {column}:")
    print(f"  - Accuracy: {accuracy_scores[column]:.4f}")
    print(f"  - F1-Score: {f1_scores[column]:.4f}\n")

# Print detailed classification report for each target
print("Detailed Classification Report:")
for i, column in enumerate(y_test_classification.columns):
    print(f"Classification Report for {column}:\n")
    print(classification_report(y_test_classification[column], y_pred_classification[:, i]))
    print("\n" + "-"*80 + "\n")

# 3. Evaluate Regression Model Performance
print("Regression Model Performance:")
mse = mean_squared_error(y_test_score, y_pred_regression)
r2 = r2_score(y_test_score, y_pred_regression)

print(f"  - Mean Squared Error: {mse:.4f}")
print(f"  - R-Squared: {r2:.4f}")


Classification Model Performance:
Performance for queue:
  - Accuracy: 0.7729
  - F1-Score: 0.7736

Performance for priority:
  - Accuracy: 0.9541
  - F1-Score: 0.9541

Performance for language:
  - Accuracy: 0.9940
  - F1-Score: 0.9940

Performance for sentiment_label:
  - Accuracy: 0.8725
  - F1-Score: 0.8595

Detailed Classification Report:
Classification Report for queue:

              precision    recall  f1-score   support

           0       0.71      0.76      0.74       531
           1       0.84      0.81      0.83       570
           2       0.76      0.75      0.76       577

    accuracy                           0.77      1678
   macro avg       0.77      0.77      0.77      1678
weighted avg       0.77      0.77      0.77      1678


--------------------------------------------------------------------------------

Classification Report for priority:

              precision    recall  f1-score   support

           0       0.95      0.97      0.96       569
          