In [12]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load the five input tables. The paths are absolute and machine-specific, so
# this step only works where the files exist at exactly these locations.
_csv_paths = (
    r"C:\Users\hp\Downloads\callsf0d4f5a.csv",
    r"C:\Users\hp\Downloads\testbc7185d.csv",
    r"C:\Users\hp\Downloads\sentiment_statisticscc1e57a.csv",
    r"C:\Users\hp\Downloads\reason18315ff.csv",
    r"C:\Users\hp\Downloads\customers2afd6ea.csv",
)
# Read in the same order as the paths above and bind each frame to its name.
calls_df, test_df, sentiment_df, reason_df, customers_df = (
    pd.read_csv(csv_path) for csv_path in _csv_paths
)

# Parse the three call timestamps. The start time is parsed strictly (a
# malformed value raises), while the agent-assignment and end times use
# errors='coerce', so unparseable values become NaT instead of raising.
calls_df['call_start_datetime'] = pd.to_datetime(calls_df['call_start_datetime'])
calls_df['agent_assigned_datetime'] = pd.to_datetime(calls_df['agent_assigned_datetime'], errors='coerce')
calls_df['call_end_datetime'] = pd.to_datetime(calls_df['call_end_datetime'], errors='coerce')

# AHT: seconds from agent assignment to call end.
# AST: seconds from call start to agent assignment.
# 'agent_assigned_datetime' was already indexed above (a missing column would
# have raised KeyError there), so no column-existence check is needed here.
# Rows with a NaT timestamp produce NaN durations.
calls_df['AHT'] = (calls_df['call_end_datetime'] - calls_df['agent_assigned_datetime']).dt.total_seconds()
calls_df['AST'] = (calls_df['agent_assigned_datetime'] - calls_df['call_start_datetime']).dt.total_seconds()

# Enrich the calls step by step. Every join is a left join, so each row of
# calls_df is kept even when the right-hand table has no matching key.
calls_sentiment = calls_df.merge(sentiment_df, on='call_id', how='left')
calls_reason = calls_sentiment.merge(reason_df, on='call_id', how='left')
calls_complete = calls_reason.merge(customers_df, on='customer_id', how='left')

# Calls without a sentiment record get the median sentiment of the joined table.
_sentiment_median = calls_complete['average_sentiment'].median()
calls_complete['average_sentiment'] = calls_complete['average_sentiment'].fillna(_sentiment_median)

print(calls_complete.columns)

# The reason table was left-joined onto the calls, so any call without a
# recorded reason has a missing primary_call_reason. Such rows carry no label
# and must not be used for supervised training; otherwise "missing" is learned
# (and later predicted) as if it were a real call reason.
# NOTE(review): presumably the unlabelled rows are the calls listed in test_df
# -- verify against the data.
labelled_mask = calls_complete['primary_call_reason'].notna()

# Encode the labelled reasons as integers; unlabelled rows keep NaN in the
# encoded column.
le_reason = LabelEncoder()
calls_complete.loc[labelled_mask, 'primary_call_reason_encoded'] = le_reason.fit_transform(
    calls_complete.loc[labelled_mask, 'primary_call_reason']
)

labelled_calls = calls_complete.loc[labelled_mask]
# Features: missing durations are imputed with 0 (sentiment was already
# median-filled when calls_complete was built).
X_all = labelled_calls[['AHT', 'AST', 'average_sentiment']].fillna(0)
# Target: cast back to int because the NaN placeholders force a float column.
y_all = labelled_calls['primary_call_reason_encoded'].astype(int)

# Hold out 20% of the labelled calls for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score the held-out split so the validation predictions are actually used.
y_pred_val = rf_model.predict(X_val)
print(f"Validation accuracy: {np.mean(y_pred_val == y_val.to_numpy()):.4f}")

# Attach the call timestamps to the test call ids. The timestamp columns were
# already converted to datetimes on calls_df, and a left merge keeps that
# dtype (unmatched ids get NaT), so they are not parsed again here.
test_df_merged = pd.merge(test_df, calls_df[['call_id', 'call_start_datetime', 'call_end_datetime', 'agent_assigned_datetime']], on='call_id', how='left')

# Same duration features as for training. The three timestamp columns were
# explicitly selected in the merge above, so they are always present.
test_df_merged['AHT'] = (test_df_merged['call_end_datetime'] - test_df_merged['agent_assigned_datetime']).dt.total_seconds()
test_df_merged['AST'] = (test_df_merged['agent_assigned_datetime'] - test_df_merged['call_start_datetime']).dt.total_seconds()

test_df_final = pd.merge(test_df_merged, sentiment_df, on='call_id', how='left')
# Impute missing sentiment with the value used for the training features (the
# median of calls_complete) rather than a statistic of the test set, so that
# features are prepared identically at training and prediction time.
# calls_complete['average_sentiment'] has already been median-filled, which
# leaves its median unchanged.
test_df_final['average_sentiment'] = test_df_final['average_sentiment'].fillna(calls_complete['average_sentiment'].median())

# Missing durations are imputed with 0, mirroring the training features.
X_test = test_df_final[['AHT', 'AST', 'average_sentiment']].fillna(0)
test_df_final['primary_call_reason_pred'] = rf_model.predict(X_test)

# Map the integer class predictions back to the original reason labels.
test_df_final['primary_call_reason'] = le_reason.inverse_transform(test_df_final['primary_call_reason_pred'])

test_df_final[['call_id', 'primary_call_reason']].to_csv("test_predictions.csv", index=False)
print("Predictions saved to 'test_predictions.csv'")

def _mean_aht_for(reason):
    """Return the mean AHT over the calls whose primary reason equals *reason*."""
    matching_calls = calls_complete[calls_complete['primary_call_reason'] == reason]
    return matching_calls['AHT'].mean()


# Overall averages and the sentiment/AHT correlation matrix.
aht_avg = calls_complete['AHT'].mean()
ast_avg = calls_complete['AST'].mean()
sentiment_aht_corr = calls_complete[['average_sentiment', 'AHT']].corr()

# value_counts() orders reasons from most to least frequent, so the two ends
# of its index are the most and least common call reasons.
common_reasons = calls_complete['primary_call_reason'].value_counts()
most_frequent_reason, least_frequent_reason = common_reasons.index[0], common_reasons.index[-1]

most_freq_aht = _mean_aht_for(most_frequent_reason)
least_freq_aht = _mean_aht_for(least_frequent_reason)

# Relative AHT gap of the most frequent reason versus the least frequent one.
percentage_diff_aht = (most_freq_aht - least_freq_aht) / least_freq_aht * 100

print(f"Average AHT: {aht_avg} seconds")
print(f"Average AST: {ast_avg} seconds")
print(f"Correlation between sentiment and AHT: {sentiment_aht_corr.iloc[0, 1]}")
print(f"Most frequent call reason: {most_frequent_reason}")
print(f"Percentage difference in AHT between most and least frequent reasons: {percentage_diff_aht}%")


Index(['call_id', 'customer_id', 'agent_id_x', 'call_start_datetime',
       'agent_assigned_datetime', 'call_end_datetime', 'call_transcript',
       'AHT', 'AST', 'agent_id_y', 'agent_tone', 'customer_tone',
       'average_sentiment', 'silence_percent_average', 'primary_call_reason',
       'customer_name', 'elite_level_code'],
      dtype='object')
Predictions saved to 'test_predictions.csv'
Average AHT: 697.0486004734717 seconds
Average AST: 437.0675393399248 seconds
Correlation between sentiment and AHT: -0.07605706195233651
Most frequent call reason: IRROPS
Percentage difference in AHT between most and least frequent reasons: 336.38405963595517%
