In [2]:
# Required packages should be installed first
#!pip install torch numpy transformers datasets tiktoken wandb tqdm matplotlib seaborn scikit-learn

Collecting matplotlib
  Using cached matplotlib-3.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (101 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Using cached kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.2 kB)
Collecting pillow>=8 (from matplotlib)
  Using cached pillow-11.

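Once the work environment has been initialized, Exploratory Data Analysis (EDA) is conducted on the input data. In this step, the training set is loaded and inspected, the distribution of customer sentiment is plotted, and each categorical feature is compared against sentiment, both visually and with a chi-square test of independence. TODO: summarize the findings.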

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency

# Load dataset
# A notebook kernel does not define `__file__`, so referencing it raises
# NameError there; fall back to the current working directory in that case.
# When executed as a plain script, the CSV is still looked up next to the script.
try:
    data_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    data_dir = os.getcwd()
train_file = os.path.join(data_dir, 'train.csv')
df_train = pd.read_csv(train_file)

# Display dataset info (DataFrame.info() prints its summary itself)
print("Train Dataset Info:")
df_train.info()

# Display first few rows
print("\nTrain Dataset Sample:")
print(df_train.head())

# Sentiment distribution
# seaborn >= 0.13 deprecates passing `palette` without `hue`; mapping the x
# variable to `hue` and disabling the legend gives the same per-bar colouring
# without the FutureWarning.
fig, ax = plt.subplots(figsize=(6, 6))
sns.countplot(data=df_train, x='customer_sentiment', hue='customer_sentiment',
              palette="coolwarm", legend=False, ax=ax)
ax.set_title("Distribution of Customer Sentiment")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.show()

# Categorical columns compared against customer sentiment.
# Kept as a module-level name because the chi-square analysis below reuses it.
categorical_features = [
    'issue_area',
    'issue_category',
    'issue_sub_category',
    'product_category',
    'product_sub_category',
    'issue_complexity',
    'agent_experience_level',
]

# One grouped count plot per categorical column, bars split by sentiment.
for feature_name in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 8))
    sns.countplot(
        data=df_train,
        x=feature_name,
        hue='customer_sentiment',
        palette="coolwarm",
        ax=ax,
    )
    ax.set_title(f"Sentiment Distribution by {feature_name}")
    # Slanted, right-aligned tick labels keep long category names readable.
    plt.xticks(rotation=45, ha='right')
    ax.set_xlabel(feature_name)
    ax.set_ylabel("Count")
    ax.legend(title='Sentiment')
    plt.show()


# Correlation analysis using Chi-Square test
def chi_square_test(df, feature, target):
    """Run a chi-square test of independence between two columns of ``df``.

    The two columns are cross-tabulated with ``pd.crosstab`` and the resulting
    table of counts is passed to ``scipy.stats.chi2_contingency``.

    Args:
        df: DataFrame holding both columns.
        feature: Name of the column used for the rows of the contingency table.
        target: Name of the column used for the columns of the contingency table.

    Returns:
        A ``(chi2, p)`` tuple: the test statistic and its p-value.
    """
    observed_counts = pd.crosstab(df[feature], df[target])
    test_result = chi2_contingency(observed_counts)
    # Only the statistic and the p-value are needed; the degrees of freedom
    # and expected frequencies that follow them in the result are ignored.
    return test_result[0], test_result[1]

# Chi-square statistic and p-value of every categorical feature vs. sentiment
correlation_results = {
    col: dict(zip(('Chi2 Score', 'P-Value'),
                  chi_square_test(df_train, col, 'customer_sentiment')))
    for col in categorical_features
}

# Display most influential parameters (highest chi-square score first)
# NOTE(review): the raw chi-square statistic tends to grow with the number of
# categories, so this ranking across features is only a rough guide.
influential_params = list(correlation_results.items())
influential_params.sort(key=lambda entry: entry[1]['Chi2 Score'], reverse=True)

print("\nMost Influential Parameters on Customer Sentiment:")
for param, values in influential_params:
    print(f"{param}: Chi2 Score = {values['Chi2 Score']:.2f}, P-Value = {values['P-Value']:.5f}")

# Visualizing Correlation Scores
fig, ax = plt.subplots(figsize=(10, 10))
params = [name for name, _ in influential_params]
chi2_scores = [stats['Chi2 Score'] for _, stats in influential_params]
# seaborn >= 0.13 deprecates passing `palette` without `hue`; mapping the x
# values to `hue` and disabling the legend gives the same per-bar colouring
# without the FutureWarning.
sns.barplot(x=params, y=chi2_scores, hue=params, palette="coolwarm",
            legend=False, ax=ax)
# Slanted, right-aligned tick labels keep the long feature names readable.
plt.xticks(rotation=45, ha='right')
ax.set_title("Feature Importance Based on Chi-Square Scores")
ax.set_xlabel("Features")
ax.set_ylabel("Chi-Square Score")
plt.show()