<a href="https://colab.research.google.com/github/Atanu-2002/Caprae-Capital-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
!pip install pandas scikit-learn fuzzywuzzy python-Levenshtein
# --- 1. Load the Libraries ---
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import numpy as np
import re # For regular expressions in text cleaning


# Progress banner; it is printed after the imports above have already run.
print("\n--- Step 1: Loading Libraries ---")



--- Step 1: Loading Libraries ---


In [13]:
# --- 2. Load the Dataset ---
print("\n--- Step 2: Loading Dataset ---")
# NOTE(review): csv_file_path is not assigned in any cell visible above this one, so on a
# fresh kernel this line presumably raises NameError — define the path to the leads CSV
# before running. Later cells read the columns 'Company Name', 'Website', 'Industry',
# 'Employee Count', 'Revenue (MM USD)' and 'Job Title' from this frame.
leads_df = pd.read_csv(csv_file_path)
print("Dataset loaded successfully.")




--- Step 2: Loading Dataset ---
Dataset loaded successfully.


In [14]:
# --- 3. Duplicate Lead Detection & Merging ---
# This section identifies and marks duplicate entries based on company name and website.
# We'll use fuzzy matching for robustness against minor variations.
# Note: despite "Merging" in the title, no fields are combined — the first row of each
# duplicate group is kept and the remaining rows of the group are filtered out.

print("\n--- Step 3: Performing Duplicate Lead Detection ---")

def clean_string(text):
    """
    Normalise a company name or website for fuzzy comparison.

    Lowercases the value, removes standalone company-suffix words (inc, ltd,
    llc, corp, group, solutions, company, co) and then removes every character
    that is not a letter or digit, so the result contains no spaces or
    punctuation.

    Missing values (None or a float NaN) yield '' instead of the literal
    text 'none' / 'nan'.

    Args:
        text: Any value; non-strings are converted with str().

    Returns:
        The cleaned string, possibly empty.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()
    # \b restricts removal to whole words: 'tech solutions' loses 'solutions',
    # while 'techsolutions' is left intact.
    text = re.sub(r'\b(inc|ltd|llc|corp|group|solutions|company|co)\b', '', text)
    # \W plus '_' covers whitespace and punctuation, so nothing is left to strip afterwards.
    return re.sub(r'[\W_]+', '', text)

def find_duplicates(df, key_columns, threshold=85, scorer=None, cleaner=None):
    """
    Group rows whose key columns all fuzzy-match the first row of their group.

    Each row that has not been grouped yet is compared, in order, against every
    later ungrouped row. A later row joins the group when the similarity of
    EVERY column in key_columns reaches threshold. Candidates are compared with
    the group's first row only, never with the other members.

    Args:
        df: DataFrame holding the leads. It is not modified.
        key_columns: Names of the columns to compare; must not be empty.
        threshold: Minimum similarity for a column to count as matching
            (fuzzywuzzy scorers return 0-100).
        scorer: Callable taking two cleaned strings and returning a similarity.
            Defaults to fuzz.token_sort_ratio.
        cleaner: Callable applied to every key value before comparison.
            Defaults to clean_string.

    Returns:
        A list of groups. Each group is a list of integer row POSITIONS
        (usable with df.iloc, not df.loc), first occurrence first. Rows
        without any match do not appear in the result.

    Raises:
        ValueError: If key_columns is empty.
    """
    key_columns = list(key_columns)
    if not key_columns:
        # With nothing to compare, every pair of rows would trivially "match".
        raise ValueError("key_columns must contain at least one column name")
    if scorer is None:
        scorer = fuzz.token_sort_ratio
    if cleaner is None:
        cleaner = clean_string

    # Clean each key column once up front, then transpose so that rows[k] is the
    # tuple of cleaned key values for the row at position k.
    cleaned_columns = [[cleaner(value) for value in df[col]] for col in key_columns]
    rows = list(zip(*cleaned_columns))

    duplicates = []
    processed_indices = set()

    for i, anchor in enumerate(rows):
        if i in processed_indices:
            continue

        group = [i]
        for j in range(i + 1, len(rows)):
            if j in processed_indices:
                continue
            # Every key column must be similar enough, not just one of them.
            if all(scorer(a, b) >= threshold for a, b in zip(anchor, rows[j])):
                group.append(j)

        if len(group) > 1:
            duplicates.append(group)
            processed_indices.update(group)

    return duplicates

# Define columns to use for duplicate detection
key_cols_for_dupes = ['Company Name', 'Website']
duplicate_groups = find_duplicates(leads_df, key_cols_for_dupes, threshold=85)

print(f"\nFound {len(duplicate_groups)} groups of potential duplicates:")
for group in duplicate_groups:
    print(f"  Duplicate Group (indices): {group}")
    print(leads_df.iloc[group][key_cols_for_dupes]) # Display original names/websites for clarity

# Mark duplicates in the DataFrame.
# find_duplicates reports row POSITIONS, so the flag is written through .iloc; .loc would
# address index labels, which only coincide with positions on a default RangeIndex.
leads_df['is_duplicate'] = False
is_duplicate_col = leads_df.columns.get_loc('is_duplicate')
for group in duplicate_groups:
    # Keep the first entry of each group; every later entry is flagged for removal.
    leads_df.iloc[group[1:], is_duplicate_col] = True

# Create a de-duplicated DataFrame (a copy, so later column additions leave leads_df alone)
deduplicated_leads_df = leads_df.loc[~leads_df['is_duplicate']].copy()
print(f"\nOriginal leads: {len(leads_df)} rows")
print(f"De-duplicated leads: {len(deduplicated_leads_df)} rows")
print("\nFirst 5 rows of the de-duplicated dataset:")
print(deduplicated_leads_df.head())


--- Step 3: Performing Duplicate Lead Detection ---

Found 2 groups of potential duplicates:
  Duplicate Group (indices): [0, 4]
           Company Name               Website
0   Tech Solutions Inc.     techsolutions.com
4  Tech Solutions, Inc.  techsolutionsinc.com
  Duplicate Group (indices): [1, 5, 17]
                Company Name                     Website
1         Global Innovations       globalinnovations.com
5   Global Innovations Group  globalinnovationsgroup.com
17         Global Innovators       globalinnovations.com

Original leads: 19 rows
De-duplicated leads: 16 rows

First 5 rows of the de-duplicated dataset:
              Company Name                    Website        Industry  \
0      Tech Solutions Inc.          techsolutions.com        Software   
1       Global Innovations      globalinnovations.com      Consulting   
2            Alpha Digital           alphadigital.com       Marketing   
3       Beta Analytics Ltd        betaanalytics.co.uk  Data Analytics   
6

In [15]:
# --- 4. Lead Scoring/Prioritization ---
# This section uses Machine Learning to assign a "score" to each lead,
# indicating its potential value or likelihood to be a good target for Caprae Capital.
# Since we don't have historical conversion data, we'll use simple heuristics to
# create a 'target' variable for training a supervised ML model.
# NOTE(review): because the target comes from hand-written rules rather than observed
# outcomes, the resulting score can at best approximate those rules.

print("\n--- Step 4: Performing Lead Scoring/Prioritization ---")


--- Step 4: Performing Lead Scoring/Prioritization ---


In [17]:
# --- 4.1 Feature Engineering and Heuristic Labeling ---

# Heuristic function to define "good" leads for training purposes
# This simulates what Caprae Capital might consider a high-value lead
# based on industry, company size, and contact role.

# Substrings that identify a decision-maker job title. Kept at notebook scope because
# code outside assign_heuristic_label reads the same list.
decision_maker_keywords = [
    'ceo',
    'founder',
    'chief executive',
    'vp',
    'head of',
    'director',
]


def assign_heuristic_label(row):
    """
    Heuristic stand-in for a real conversion label.

    A lead is labelled 1 when its industry text contains one of the target
    terms, or when it has at least 50 employees AND a job title containing one
    of decision_maker_keywords; otherwise it is labelled 0. All text checks are
    case-insensitive substring matches.

    Args:
        row: Mapping-like record with 'Industry', 'Employee Count' and
            'Job Title' entries.

    Returns:
        1 for a good lead, 0 for a less relevant lead.
    """
    industry_text = str(row['Industry']).lower()
    in_target_industry = any(
        term in industry_text
        for term in ('software', 'fintech', 'ai/ml', 'data analytics')
    )

    has_enough_staff = row['Employee Count'] >= 50

    title_text = str(row['Job Title']).lower()
    holds_decision_role = any(kw in title_text for kw in decision_maker_keywords)

    # Industry alone is sufficient; size only counts together with a decision-maker title.
    return 1 if in_target_industry or (has_enough_staff and holds_decision_role) else 0

# Label every de-duplicated lead with the heuristic; this column is the training target.
deduplicated_leads_df['is_good_lead'] = deduplicated_leads_df.apply(assign_heuristic_label, axis=1)
print("\nHeuristic 'is_good_lead' labels assigned:")
print(deduplicated_leads_df['is_good_lead'].value_counts())

# A binary classifier needs both classes to be present. This is only a warning:
# execution continues even when a single class was produced.
if deduplicated_leads_df['is_good_lead'].unique().size < 2:
    print("\nWarning: Only one class found in 'is_good_lead'. Cannot train a binary classification model.")


# Raw inputs for the model (X) and the heuristic target (y).
features = ['Industry', 'Employee Count', 'Revenue (MM USD)', 'Job Title']
y = deduplicated_leads_df['is_good_lead']
X = deduplicated_leads_df[features].copy()

# Collapse the free-text job title into one flag: does it contain any decision-maker
# keyword? str() keeps missing or non-string titles from raising.
X['is_decision_maker'] = [
    any(kw in str(title).lower() for kw in decision_maker_keywords)
    for title in X['Job Title']
]

# One-hot encode Industry (cast to str first, so a missing value becomes its own 'nan'
# category), dropping the first level, then discard the raw job title now that the
# derived flag exists.
X = pd.get_dummies(
    X.assign(Industry=X['Industry'].astype(str)),
    columns=['Industry'],
    prefix='Industry',
    drop_first=True,
).drop(columns='Job Title')

# Turn boolean columns (the decision-maker flag and any boolean dummies) into 0/1 integers.
X = X.astype({name: int for name in X.columns if X[name].dtype == 'bool'})

# Keep numeric columns only; anything still non-numeric at this point is dropped silently.
X = X.select_dtypes(include='number')


print("\nFeatures prepared for ML model:")
print(X.head())


Heuristic 'is_good_lead' labels assigned:
is_good_lead
1    9
0    7
Name: count, dtype: int64

Features prepared for ML model:
   Employee Count  Revenue (MM USD)  is_decision_maker  Industry_Consulting  \
0              50               5.0                  1                    0   
1             150              20.0                  1                    1   
2              80              10.0                  0                    0   
3             200              30.0                  0                    0   
6              85              10.5                  0                    0   

   Industry_Data Analytics  Industry_Education  Industry_FinTech  \
0                        0                   0                 0   
1                        0                   0                 0   
2                        0                   0                 0   
3                        1                   0                 0   
6                        0                   0          

In [18]:
# --- 4.2 Model Training ---

# Hold out 20% of the leads for evaluation. stratify=y keeps the class ratio similar in
# both splits and random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print(f"\nTraining data shape: {X_train.shape}")
print(f"Testing data shape: {X_test.shape}")

# Logistic regression is a binary classifier that also exposes class probabilities.
# fit() returns the estimator itself, so the model is built and trained in one expression.
model = LogisticRegression(solver='liblinear', random_state=42).fit(X_train, y_train)

print("\nLogistic Regression model trained.")

# Sanity check on the held-out rows.
y_pred = model.predict(X_test)
print(f"Model Accuracy on test set: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report on test set:")
# zero_division=0 reports 0 for a class that never shows up in the predictions.
print(classification_report(y_test, y_pred, zero_division=0))



Training data shape: (12, 13)
Testing data shape: (4, 13)

Logistic Regression model trained.
Model Accuracy on test set: 0.75

Classification Report on test set:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.67      1.00      0.80         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4



In [19]:
# --- 4.3 Assign Lead Scores ---

# Score every de-duplicated lead, not just the test split. Column 1 of predict_proba is
# the probability of the positive class (1 = 'good lead'); it is rounded to two decimals
# for readability.
deduplicated_leads_df['Lead_Score'] = pd.Series(
    model.predict_proba(X)[:, 1],
    index=deduplicated_leads_df.index,
).round(2)


In [20]:
# --- 5. Display Final Results ---
# Rank the de-duplicated leads from highest to lowest Lead_Score and print a summary.

print("\n--- Step 5: Final Results (De-duplicated and Scored Leads) ---")
final_leads_df = (
    deduplicated_leads_df
    .sort_values(by='Lead_Score', ascending=False)
    .reset_index(drop=True)
)

print("\nTop 10 leads by predicted score:")
print(
    final_leads_df
    .loc[:, ['Company Name', 'Website', 'Industry', 'Employee Count', 'Job Title', 'is_good_lead', 'Lead_Score']]
    .head(10)
    .to_markdown(index=False)
)

# Closing summary, emitted one line per print call.
for summary_line in (
    "\n--- Summary of Improvements ---",
    "1. **Duplicate Detection:** Identified and marked (or removed) similar entries, leading to cleaner data.",
    "   - Before: {} leads".format(len(leads_df)),
    "   - After De-duplication: {} unique leads".format(len(deduplicated_leads_df)),
    "2. **Lead Scoring:** Applied a Machine Learning model to assign a 'Lead_Score' based on defined criteria, allowing for prioritization.",
    "   - Leads can now be sorted from highest potential to lowest, guiding sales efforts.",
    "\nThis prototype demonstrates how ML can transform raw scraped data into actionable insights for Caprae Capital's lead generation process, aligning with their focus on driving real-world impact.",
):
    print(summary_line)



--- Step 5: Final Results (De-duplicated and Scored Leads) ---

Top 10 leads by predicted score:
| Company Name        | Website               | Industry       |   Employee Count | Job Title              |   is_good_lead |   Lead_Score |
|:--------------------|:----------------------|:---------------|-----------------:|:-----------------------|---------------:|-------------:|
| Zeta Corp           | zetacorp.com          | Manufacturing  |             1000 | Operations Director    |              1 |         0.97 |
| Zeta Corporation    | zetacorp.net          | Manufacturing  |             1050 | Director of Operations |              1 |         0.97 |
| Global Innovations  | globalinnovations.com | Consulting     |              150 | Head of Strategy       |              1 |         0.74 |
| Beta Analytics Ltd  | betaanalytics.co.uk   | Data Analytics |              200 | Data Scientist         |              1 |         0.71 |
| Beta Analytics LLC  | beta-analytics.com    | Data Ana