# Import Required Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer
import xgboost as xgb


# Load the Datasets

In [None]:
# Read the four raw CSV exports; the unpacking order mirrors the path order below.
calls, customers, reason, sentiment = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/calls.csv',
        '../data/customers.csv',
        '../data/reason.csv',
        '../data/sentiment_statistics.csv',
    )
)


# Data Preprocessing

In [None]:
# Parse the call start/end timestamps so they support datetime arithmetic
for timestamp_col in ('call_start_datetime', 'call_end_datetime'):
    calls[timestamp_col] = pd.to_datetime(calls[timestamp_col])

# Call duration in seconds: end timestamp minus start timestamp
elapsed = calls['call_end_datetime'] - calls['call_start_datetime']
calls['call_duration'] = elapsed.dt.total_seconds()

# Left-join customer, reason and sentiment tables onto the calls table,
# so every call row is kept even when a lookup table has no match
merged_df = (
    calls
    .merge(customers, on='customer_id', how='left')
    .merge(reason, on='call_id', how='left')
    .merge(sentiment, on='call_id', how='left')
)

# Persist the merged table without the row index
merged_df.to_csv('../data/processed_data.csv', index=False)


# Load the Preprocessed Data

In [None]:
# Read back the merged table from disk
data = pd.read_csv('../data/processed_data.csv')

# info() writes its report itself and returns None, so this print emits "None" after it
info_result = data.info()
print(info_result)

# Summary statistics as computed by describe()
summary_stats = data.describe()
print(summary_stats)


# Convert Datetime Columns and Calculate Response Delay

In [None]:
# Parse the timestamp columns; values that cannot be parsed become NaT
for timestamp_col in ('call_start_datetime', 'agent_assigned_datetime'):
    data[timestamp_col] = pd.to_datetime(data[timestamp_col], errors='coerce')

# Derive the response delay (minutes between call start and agent assignment)
# only when the column is not already present in the data
if 'response_delay' not in data.columns:
    wait = data['agent_assigned_datetime'] - data['call_start_datetime']
    data['response_delay'] = wait.dt.total_seconds() / 60


# Calculate AHT and AST

In [None]:
# AHT is the mean of call_duration; AST is the mean of response_delay
handle_times = data['call_duration']
answer_delays = data['response_delay']
AHT, AST = handle_times.mean(), answer_delays.mean()

# Report both metrics, one per line
for metric_line in (
    f"Average Handle Time (AHT): {AHT} seconds",
    f"Average Speed to Answer (AST): {AST} minutes",
):
    print(metric_line)


# Exploratory Data Analysis (EDA)

In [None]:
# Histogram of call durations with a KDE overlay
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data['call_duration'], kde=True, ax=hist_ax)
hist_ax.set_title('Distribution of Call Durations')
hist_ax.set_xlabel('Call Duration (seconds)')
hist_ax.set_ylabel('Frequency')
plt.show()

# Plot heatmap for correlation
# Only numeric columns are correlated. The frame also holds text and datetime
# columns (e.g. primary_call_reason, call_start_datetime), and DataFrame.corr()
# raises on those in pandas >= 2.0, where numeric_only defaults to False.
numeric_data = data.select_dtypes(include='number')
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_data.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Horizontal bar chart of the target, most frequent call reason first
reason_order = data['primary_call_reason'].value_counts().index
reason_fig, reason_ax = plt.subplots(figsize=(10, 6))
sns.countplot(y='primary_call_reason', data=data, order=reason_order, ax=reason_ax)
reason_ax.set_title('Distribution of Primary Call Reason')
reason_ax.set_xlabel('Count')
reason_ax.set_ylabel('Primary Call Reason')
plt.show()


# Create New Features

In [None]:
# Create new features
# Weekday name of the call start (call_start_datetime must already be datetime-typed)
data['day_of_week'] = data['call_start_datetime'].dt.day_name()

# Bucket call duration (seconds) into ordered categories.
# include_lowest=True keeps 0-second calls in 'Very Short', and the open-ended
# top edge keeps calls longer than 1800 s in 'Long'; with a closed 1800 s edge
# both cases silently became NaN. Negative durations still map to NaN.
data['call_duration_category'] = pd.cut(
    data['call_duration'],
    bins=[0, 60, 300, 600, np.inf],
    labels=['Very Short', 'Short', 'Medium', 'Long'],
    include_lowest=True,
)

# Save the final dataset for modeling
data.to_csv('../data/feature_engineered_data.csv', index=False)
print("Feature engineering complete and dataset saved.")


# Load the Feature-Engineered Dataset

In [None]:
# Read the feature-engineered table back from disk
data = pd.read_csv('../data/feature_engineered_data.csv')

# Model inputs: the three numeric columns used as features (extend this list to add more)
feature_columns = ['call_duration', 'response_delay', 'average_sentiment']
X = data[feature_columns]

# Target: missing reasons become 'Unknown', then surrounding whitespace is stripped
raw_reason = data['primary_call_reason']
y = raw_reason.fillna('Unknown').str.strip()


# Handle Missing Values


In [None]:
# Fill missing feature values with each column's median
imputer = SimpleImputer(strategy='median')
imputer.fit(X)
X_imputed = imputer.transform(X)

# Replace the target labels with integer codes (first element of factorize's result)
factorized = pd.factorize(y)
y = factorized[0]


# Train-Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(
    X_imputed,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


# Model Training with XGBoost

In [None]:
# Fit an XGBoost classifier on the training split
model = xgb.XGBClassifier(eval_metric='mlogloss')
model.fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)

# Overall accuracy, printed to two decimals
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 summary
report_text = classification_report(y_test, y_pred)
print(report_text)


# Save the XGBoost Model


In [None]:
# Save the model
model_dir = '../models'
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists() test
os.makedirs(model_dir, exist_ok=True)
joblib.dump(model, os.path.join(model_dir, 'xgboost_model.pkl'))


# Load the Test Data and Make Predictions

In [None]:
# Load the test data
test_data = pd.read_csv('../data/test.csv')

# Load the trained model
model = joblib.load(os.path.join(model_dir, 'xgboost_model.pkl'))

# Features the model was trained on, in training column order
expected_features = ['call_duration', 'response_delay', 'average_sentiment']

# A feature absent from the test file is added as NaN rather than random noise,
# so predictions are reproducible and the imputer below can fill it with the
# training median.
missing_features = [name for name in expected_features if name not in test_data.columns]
for feature in missing_features:
    test_data[feature] = np.nan
if missing_features:
    print(f"Warning: test data lacks {missing_features}; using training medians instead.")

# Prepare the test data with all the expected features
# NOTE(review): this rebinds X_test, replacing the hold-out split from training.
X_test = test_data[expected_features]

# Apply the same median imputation the model saw during training;
# transform() returns a NumPy array, matching the training input type.
X_test_array = imputer.transform(X_test)

# Make predictions (integer class codes)
test_predictions = model.predict(X_test_array)

# The model was trained on pd.factorize codes, so map them back to call-reason
# labels. The cleaning here must mirror the one applied to the training target.
reason_labels = pd.factorize(data['primary_call_reason'].fillna('Unknown').str.strip())[1]
predicted_reasons = np.asarray(reason_labels)[np.asarray(test_predictions, dtype=int)]

# Create submission file, making sure the output directory exists first
os.makedirs('../predictions', exist_ok=True)
submission = pd.DataFrame({'call_id': test_data['call_id'], 'primary_call_reason': predicted_reasons})
submission.to_csv('../predictions/test_ankitasingh.csv', index=False)

print("Predictions made and submission file created successfully.")


# Load and Display Predictions

In [None]:
# Read the saved submission back and show its first rows
submission = pd.read_csv('../predictions/test_ankitasingh.csv')
preview = submission.head()
print(preview)
