In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

feedback_data = pd.read_csv('/Users/aryamantepal/Documents/programs/Breakthrough Tech AI MIT/AI Studio/Customer_Sentiment_Analysis/clean-data/feedback_data_with_sentiment.csv', sep=",")
leads_data = pd.read_csv('/Users/aryamantepal/Documents/programs/Breakthrough Tech AI MIT/AI Studio/Customer_Sentiment_Analysis/Datasets/LeadsData.csv',sep ="\t" )

In [2]:
# Group by User Email and calculate average sentiment
avg_sentiment = feedback_data.groupby('Company Name').agg({
    'text_sentiment': 'mean',
    'combined_sentiment': 'mean',
    'Rating': 'mean'
}).reset_index()

In [3]:
merged_data = pd.merge(leads_data, avg_sentiment, on='Company Name', how='inner')

In [4]:
merged_data['target'] = (merged_data['Status'] == 'Closed-Won').astype(int)

In [5]:
le = LabelEncoder()
merged_data['Company_Name_Encoded'] = le.fit_transform(merged_data['Company Name'])


features = [
    'Company_Name_Encoded', 
    'text_sentiment', 
    'combined_sentiment', 
    'Rating'
]

X = merged_data[features]
y = merged_data['target']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_scaled, y_train)

y_pred = rf_classifier.predict(X_test_scaled)
y_pred_proba = rf_classifier.predict_proba(X_test_scaled)[:, 1]

In [7]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

feature_importance = pd.DataFrame({
    'feature': features,
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)
print("\nFeature Importance:")
print(feature_importance)



Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       867
           1       0.20      0.02      0.03        55

    accuracy                           0.94       922
   macro avg       0.57      0.51      0.50       922
weighted avg       0.90      0.94      0.91       922


Feature Importance:
                feature  importance
1        text_sentiment    0.277119
2    combined_sentiment    0.262637
0  Company_Name_Encoded    0.249295
3                Rating    0.210950


In [8]:
non_closed_leads = merged_data[merged_data['Status'] != 'Closed-Won']
non_closed_features = non_closed_leads[features]
non_closed_features_scaled = scaler.transform(non_closed_features)

non_closed_leads['conversion_probability'] = rf_classifier.predict_proba(non_closed_features_scaled)[:, 1]

high_potential_leads = non_closed_leads.sort_values('conversion_probability', ascending=False)

print("\nTop 10 High-Potential Leads:")
print(high_potential_leads[['Company Name', 'Status', 'User Email', 'conversion_probability']].head(10))


Top 10 High-Potential Leads:
             Company Name       Status                     User Email  \
4538                Xerox      On-hold                Reply@Xerox.com   
2456               Google      On-hold               Reply@Google.com   
2798       Kimberly-Clark  Negotiation       Reply@Kimberly-Clark.com   
4486  Whole Foods Markets        Warm   Reply@Whole Foods Markets.com   
554                Boeing         Cold               Reply@Boeing.com   
1089              Verizon    Reengaged              Reply@Verizon.com   
100          Formula Gray    Reengaged         Reply@Formula Gray.com   
2875             LancÃ´me  Closed-Lost             Reply@LancÃ´me.com   
823                 Kmart    Qualified                Reply@Kmart.com   
1780       Cascadian Farm  Negotiation       Reply@Cascadian Farm.com   

      conversion_probability  
4538                0.730000  
2456                0.730000  
2798                0.660000  
4486                0.560000  
554        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  non_closed_leads['conversion_probability'] = rf_classifier.predict_proba(non_closed_features_scaled)[:, 1]


In [9]:
plt.figure(figsize=(10, 6))
sns.histplot(high_potential_leads['conversion_probability'], kde=True)
plt.title('Distribution of Conversion Probabilities for Non-Closed Leads')
plt.xlabel('Conversion Probability')
plt.ylabel('Frequency')
plt.savefig('conversion_probability_distribution.png')
plt.close()


high_potential_leads[['Company Name', 'Status', 'User Email', 'conversion_probability']].to_csv('high_potential_leads.csv', index=False)

print("\nHigh-potential leads have been saved to 'high_potential_leads.csv'")
print("Conversion probability distribution plot saved as 'conversion_probability_distribution.png'")


High-potential leads have been saved to 'high_potential_leads.csv'
Conversion probability distribution plot saved as 'conversion_probability_distribution.png'
