In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.feature_extraction import text
from nltk.corpus import stopwords
import nltk




In [3]:
# Download stop words
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chloelam/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Combine stop words from English, French, German, and Spanish
stop_words_en = text.ENGLISH_STOP_WORDS
stop_words_fr = stopwords.words('french')
stop_words_de = stopwords.words('german')
stop_words_es = stopwords.words('spanish')

In [5]:



# Create a combined list of stop words
stop_words_combined = list(set(stop_words_en).union(stop_words_fr).union(stop_words_de).union(stop_words_es))

In [6]:
df_fullsent = pd.read_csv('fulldata_sentiment_analysis_results.csv')
df_fullsent.sample(20)

Unnamed: 0,queue,priority,language,subcategory,subject,text,sentiment,sentiment_label,sentiment_score
3673,HARDWARE,LOW,DE,Corsair Flash Survivor Stealth 128GB,Anfrage bezüglich Corsair Flash Survivor Steal...,"Guten Tag,\nIch benutze Ihren Corsair Flash Su...","{'label': 'neutral', 'score': 0.701138436794281}",neutral,0.701138
359,HARDWARE,MEDIUM,FR,Tragbarer Mediaplayer,Problème avec mon lecteur multimédia portable,"Bonjour, mon lecteur multimédia portable a un ...","{'label': 'neutral', 'score': 0.6696497797966003}",neutral,0.66965
6166,HARDWARE,LOW,EN,D-Link DIR-895L,Query about Upcoming Invoice and Hardware Inqu...,"Hi there,\n\nI hope this message finds you wel...","{'label': 'neutral', 'score': 0.5801788568496704}",neutral,0.580179
3987,SOFTWARE,HIGH,ES,Penetration Testing,Urgente: Software de Penetration Testing no fu...,"Hola equipo de soporte, estamos teniendo probl...","{'label': 'neutral', 'score': 0.672933042049408}",neutral,0.672933
7180,SOFTWARE,MEDIUM,DE,FatCow,Angebot zur Softwareentwicklung,"Guten Tag, ich würde gern ein Angebot für die ...","{'label': 'neutral', 'score': 0.7621254324913025}",neutral,0.762125
1184,ACCOUNTING,HIGH,DE,Employee Inquiries::Technical,Dringend: Unberechtigter Zugriff auf mein Konto,"Sehr geehrtes Support-Team, von meinem Bankkon...","{'label': 'neutral', 'score': 0.589259684085846}",neutral,0.58926
3066,HARDWARE,LOW,EN,Smart Thermostat,Minor Issue with my Smart Thermostat,"Hello Support Team,I've noticed a small glitch...","{'label': 'negative', 'score': 0.5764420628547...",negative,0.576442
4200,ACCOUNTING,MEDIUM,EN,Customer Inquiries,Request for software development proposal,"Dear Support Team, as a new customer, I'm inte...","{'label': 'neutral', 'score': 0.5463802218437195}",neutral,0.54638
7689,ACCOUNTING,MEDIUM,ES,Employee Inquiries::IT Support,Solicitud de actualización de software,"Estimado equipo, he experimentado algunos erro...","{'label': 'neutral', 'score': 0.8641172051429749}",neutral,0.864117
4808,HARDWARE,LOW,EN,AMD Radeon RX 6900 XT,AMD Radeon RX 6900 XT - Invoice Update and Fee...,"Hi team, my AMD Radeon RX 6900 XT has been per...","{'label': 'positive', 'score': 0.939770519733429}",positive,0.939771


In [7]:
# Preprocessing text data
vectorizer = TfidfVectorizer(stop_words=stop_words_combined, max_features=1000)
X = vectorizer.fit_transform(df_fullsent['text'])




In [8]:
# Encoding target variables
le_queue = LabelEncoder()
le_priority = LabelEncoder()
le_language = LabelEncoder()
le_sentiment_label = LabelEncoder()

y_queue = le_queue.fit_transform(df_fullsent['queue'])
y_priority = le_priority.fit_transform(df_fullsent['priority'])
y_language = le_language.fit_transform(df_fullsent['language'])
y_sentiment_label = le_sentiment_label.fit_transform(df_fullsent['sentiment_label'])
y_sentiment_score = df_fullsent['sentiment_score']

y_classification = pd.DataFrame({
    'queue': y_queue,
    'priority': y_priority,
    'language': y_language,
    'sentiment_label': y_sentiment_label
})

In [9]:
# Splitting the data
X_train, X_test, y_train_classification, y_test_classification, y_train_score, y_test_score = train_test_split(X, y_classification, y_sentiment_score, test_size=0.2, random_state=42)


In [10]:
# Model Training for classification targets
classifier = MultiOutputClassifier(RandomForestClassifier(random_state=42))
classifier.fit(X_train, y_train_classification)

In [11]:
# Model Training for regression target (sentiment_score)
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train_score)



In [21]:
# Function to apply the trained model to new data and include new text data in the DataFrame
def apply_model_with_text(new_text_data, vectorizer, classifier, regressor, le_queue, le_priority, le_language, le_sentiment_label):
    # Preprocess the new text data
    X_new = vectorizer.transform(new_text_data)
    
    # Make predictions for classification targets
    y_pred_classification = classifier.predict(X_new)
    
    # Make predictions for regression target
    y_pred_score = regressor.predict(X_new)
    
    # Convert predictions to DataFrame
    y_pred_classification_df = pd.DataFrame(y_pred_classification, columns=['queue', 'priority', 'language', 'sentiment_label'])
    y_pred_classification_df['sentiment_score'] = y_pred_score
    
    # Inverse transform the predictions to get the original labels
    y_pred_classification_df['queue'] = le_queue.inverse_transform(y_pred_classification_df['queue'])
    y_pred_classification_df['priority'] = le_priority.inverse_transform(y_pred_classification_df['priority'])
    y_pred_classification_df['language'] = le_language.inverse_transform(y_pred_classification_df['language'])
    y_pred_classification_df['sentiment_label'] = le_sentiment_label.inverse_transform(y_pred_classification_df['sentiment_label'])
    
    # Include the new text data in the DataFrame
    y_pred_classification_df['text'] = new_text_data
    
    return y_pred_classification_df

# Example new text data
new_text_data = [
    "Hello, I'm having trouble with my DVD player. It won't turn on. Please help!",
    "Dear support team, the software update caused issues with my CRM system.",
    "My Intel NUC is not booting up after the latest update.",
    "Urgent: CyberLink PowerDirector 19 won't launch after installation.",
    "Problem with Apple App Store Connect - security settings need to be updated.",
    "Dringend: CyberLink PowerDirector 19 lässt sich nach der Installation nicht starten.",
    "CyberLink PowerDirector 19 ne se lance pas après l'installation.",
    "Urgente: CyberLink PowerDirector 19 no se inicia después de la instalación.",
    "DVDプレーヤーの調子が悪くて困っています。電源が入りません。助けてください！"
    "緊急です： CyberLink PowerDirector 19 がインストール後に起動しない。",
    "안녕하세요, DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!",
    "你好，我的 DVD 播放器有問題。它無法開啟。請幫幫忙！"
]

# Apply the model to new data and include new text data in the DataFrame
predictions_with_text = apply_model_with_text(new_text_data, vectorizer, classifier, regressor, le_queue, le_priority, le_language, le_sentiment_label)

# Display the predictions with text
predictions_with_text

Unnamed: 0,queue,priority,language,sentiment_label,sentiment_score,text
0,HARDWARE,HIGH,EN,negative,0.860114,"Hello, I'm having trouble with my DVD player. ..."
1,ACCOUNTING,MEDIUM,EN,neutral,0.776123,"Dear support team, the software update caused ..."
2,HARDWARE,MEDIUM,EN,neutral,0.828454,My Intel NUC is not booting up after the lates...
3,SOFTWARE,HIGH,EN,negative,0.846186,Urgent: CyberLink PowerDirector 19 won't launc...
4,HARDWARE,HIGH,EN,negative,0.854888,Problem with Apple App Store Connect - securit...
5,ACCOUNTING,HIGH,DE,neutral,0.685293,Dringend: CyberLink PowerDirector 19 lässt sic...
6,ACCOUNTING,LOW,FR,neutral,0.806971,CyberLink PowerDirector 19 ne se lance pas apr...
7,SOFTWARE,HIGH,ES,neutral,0.736822,Urgente: CyberLink PowerDirector 19 no se inic...
8,ACCOUNTING,LOW,EN,neutral,0.819955,DVDプレーヤーの調子が悪くて困っています。電源が入りません。助けてください！緊急です： C...
9,ACCOUNTING,LOW,EN,neutral,0.819955,"안녕하세요, DVD 플레이어에 문제가 있습니다. 켜지지 않습니다. 도와주세요!"
