In [1]:
import random
from transformers import pipeline

# Load a summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Define 15 variations of the opening phrase
opening_phrases = [
    "The primary drivers of churn are",
    "Churn occurs due to",
    "Key reasons for customer churn include",
    "Factors contributing to churn are",
    "Churn can be attributed to",
    "Customer churn is influenced by",
    "The underlying causes of churn are",
    "Reasons for churn involve",
    "Churn stems from",
    "The main causes of churn are",
    "Churn happens because of",
    "Drivers behind customer churn include",
    "Churn arises due to",
    "The reasons for churn are rooted in",
    "Key factors leading to churn are"
]

# Test inputs
test_inputs = [
    "Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)"
]

# Process each test input
for input_text in test_inputs:
    # Summarize the churn reason
    summary = summarizer(input_text, max_length=50, min_length=20, do_sample=False)[0]["summary_text"]
    
    # Choose a random opening phrase
    opening_phrase = random.choice(opening_phrases)
    
    # Create the formatted text
    formatted_text = f"{opening_phrase} {summary.rstrip('.')}".strip() + "."
    print(f"Input: {input_text}\nOutput: {formatted_text}\n")


  from .autonotebook import tqdm as notebook_tqdm





Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 50, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Input: Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: The main causes of churn are Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium. Low Engagement (Single Vehicle), Single Policy (Low Engagement).



Your max_length is set to 50, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Input: Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: Reasons for churn involve Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement).

Input: High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)
Output: The main causes of churn are High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium. Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement).



In [2]:
import random
from transformers import pipeline

# Load a summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Define 15 variations of the opening phrase
opening_phrases = [
    "The primary drivers of churn are",
    "Churn occurs due to",
    "Key reasons for customer churn include",
    "Factors contributing to churn are",
    "Churn can be attributed to",
    "Customer churn is influenced by",
    "The underlying causes of churn are",
    "Reasons for churn involve",
    "Churn stems from",
    "The main causes of churn are",
    "Churn happens because of",
    "Drivers behind customer churn include",
    "Churn arises due to",
    "The reasons for churn are rooted in",
    "Key factors leading to churn are"
]

# Test inputs
test_inputs = [
    "Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)"
]

# Process each test input
for input_text in test_inputs:
    # Summarize the churn reason
    summary = summarizer(input_text, max_length=100, min_length=20, do_sample=False)[0]["summary_text"]
    
    # Remove unnecessary periods in the middle of the summary
    summary = summary.replace('. ', ', ').rstrip('.')
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Choose a random opening phrase
    opening_phrase = random.choice(opening_phrases)
    
    # Create the formatted text
    formatted_text = f"{opening_phrase} {summary}."
    print(f"Input: {input_text}\nOutput: {formatted_text}\n")

Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Input: Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: Churn arises due to Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle) and Single Policy (Low Engagement).



Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Input: Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: Churn can be attributed to Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle) and Single Policy (Low Engagement).

Input: High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)
Output: The underlying causes of churn are High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed and Single Policy (Low Engagement).



In [3]:
import random
from transformers import pipeline

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load a summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Test input
test_inputs = [
    "Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)",
    "High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)"
]

# Process each test input
for input_text in test_inputs:
    # Summarize the churn reason
    summary = summarizer(input_text, max_length=100, min_length=20, do_sample=False)[0]["summary_text"]
    
    # Replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        summary = summary.replace(verbose, random.choice(options))
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {summary}."
    print(f"Input: {input_text}\nOutput: {formatted_text}\n")

Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)


Input: Low Customer Tenure, High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: Factors contributing to churn are limited tenure, inconsistent discounts, high premiums, low NCB, increased OD premiums. low engagement and single policies.



Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)


Input: Low Customer Tenure, Low Discount Offered, Low No Claim Bonus (NCB), High Total Premium Payable, High Own-Damage Premium, Low Engagement (Single Vehicle), Single Policy (Low Engagement)
Output: Key reasons for customer churn include limited tenure, low discounts, reduced NCB, high total premiums, expensive own-damage costs, single-vehicle focus and low engagement policies.

Input: High Discount Not Sustained, High Add-On Premium, Low No Claim Bonus (NCB), High Own-Damage Premium, Low Engagement (Single Vehicle), Multiple Claims Filed, Single Policy (Low Engagement)
Output: The primary drivers of churn are unsustained discounts, expensive add-ons, low NCB, increased OD premiums. single-vehicle focus, excessive claims and low engagement policies.



In [1]:
import pandas as pd
import random
from transformers import pipeline

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load a summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Load the dataset
data = pd.read_excel('Unique Reasons (Acctual).xlsx')

# Initialize a list to store formatted churn reasons
formatted_reasons = []

# Process each churn reason in the dataset
for input_text in data["Churn Reason"]:
    # Summarize the churn reason
    summary = summarizer(input_text, max_length=100, min_length=20, do_sample=False)[0]["summary_text"]
    
    # Replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        summary = summary.replace(verbose, random.choice(options))
    
    # Remove unnecessary periods in the middle of the summary
    summary = summary.replace('. ', ', ').strip('.')
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to",
        "Customer churn is influenced by",
        "The underlying causes of churn are",
        "Reasons for churn involve",
        "Churn stems from",
        "The main causes of churn are",
        "Churn happens because of",
        "Drivers behind customer churn include",
        "Churn arises due to",
        "The reasons for churn are rooted in",
        "Key factors leading to churn are"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {summary}."
    formatted_reasons.append(formatted_text)

# Add the formatted reasons to the dataset
data["Formatted Churn Reason"] = formatted_reasons

# Save the updated dataset to a new Excel file
data.to_excel("Formatted_Churn_Reasons_Output.xlsx", index=False)

# Print the first few rows of the updated dataset
print(data.head())

  from .autonotebook import tqdm as notebook_tqdm





Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 46. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=23)
Your max_length is set to 100, but your input_length is only 48. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=24)
Your max_length is set to 100, but your input_length is only 29. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=14)
Your

                                        Churn Reason  \
0  Low Customer Tenure, High Discount Not Sustain...   
1  Low Customer Tenure, Low Discount Offered, Low...   
2  High Discount Not Sustained, High Add-On Premi...   
3  High Discount Not Sustained, High Add-On Premi...   
4  Low Customer Tenure, High Add-On Premium, Low ...   

                              Formatted Churn Reason  
0  The reasons for churn are rooted in low tenure...  
1  Drivers behind customer churn include short te...  
2  The underlying causes of churn are inconsisten...  
3  Drivers behind customer churn include inconsis...  
4  Churn happens because of limited tenure, expen...  


In [1]:
import pandas as pd
import random

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load the dataset
data = pd.read_excel('Unique Reasons (Acctual).xlsx')

# Initialize a list to store formatted churn reasons
formatted_reasons = []

# Process each churn reason in the dataset
for input_text in data["Churn Reason"]:
    # Directly replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        input_text = input_text.replace(verbose, random.choice(options))
    
    # Add 'and' before the last item
    input_parts = input_text.split(', ')
    if len(input_parts) > 1:
        formatted_summary = ', '.join(input_parts[:-1]) + ' and ' + input_parts[-1]
    else:
        formatted_summary = input_text
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to",
        "Customer churn is influenced by",
        "The underlying causes of churn are",
        "Reasons for churn involve",
        "Churn stems from",
        "The main causes of churn are",
        "Churn happens because of",
        "Drivers behind customer churn include",
        "Churn arises due to",
        "The reasons for churn are rooted in",
        "Key factors leading to churn are"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {formatted_summary}."
    formatted_reasons.append(formatted_text)

# Add the formatted reasons to the dataset
data["Formatted Churn Reason"] = formatted_reasons

# Save the updated dataset to a new Excel file
data.to_excel("Formatted_Churn_Reasons_Output_No_Model.xlsx", index=False)

# Print the first few rows of the updated dataset
print(data.head())

                                        Churn Reason  \
0  Low Customer Tenure, High Discount Not Sustain...   
1  Low Customer Tenure, Low Discount Offered, Low...   
2  High Discount Not Sustained, High Add-On Premi...   
3  High Discount Not Sustained, High Add-On Premi...   
4  Low Customer Tenure, High Add-On Premium, Low ...   

                              Formatted Churn Reason  
0  The main causes of churn are limited tenure, u...  
1  Reasons for churn involve low tenure, reduced ...  
2  Key reasons for customer churn include inconsi...  
3  The underlying causes of churn are unsustained...  
4  The reasons for churn are rooted in limited te...  


In [6]:
import pandas as pd
import random
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load the dataset
data = pd.read_excel('Unique Reasons (Acctual).xlsx')

# Initialize a list to store formatted churn reasons
formatted_reasons = []

# Initialize TextRank summarizer
summarizer = TextRankSummarizer()

# Process each churn reason in the dataset
for input_text in data["Churn Reason"]:
    # Parse the input text
    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))
    summary_sentences = summarizer(parser.document, 1)  # Extract 1 summary sentence
    summary = " ".join(str(sentence) for sentence in summary_sentences)
    
    # Replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        summary = summary.replace(verbose, random.choice(options))
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to",
        "Customer churn is influenced by",
        "The underlying causes of churn are",
        "Reasons for churn involve",
        "Churn stems from",
        "The main causes of churn are",
        "Churn happens because of",
        "Drivers behind customer churn include",
        "Churn arises due to",
        "The reasons for churn are rooted in",
        "Key factors leading to churn are"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {summary}."
    formatted_reasons.append(formatted_text)

# Add the formatted reasons to the dataset
data["Formatted Churn Reason"] = formatted_reasons

# Save the updated dataset to a new Excel file
data.to_excel("Formatted_Churn_Reasons_Using_TextRank.xlsx", index=False)

# Print the first few rows of the updated dataset
print(data.head())

                                        Churn Reason  \
0  Low Customer Tenure, High Discount Not Sustain...   
1  Low Customer Tenure, Low Discount Offered, Low...   
2  High Discount Not Sustained, High Add-On Premi...   
3  High Discount Not Sustained, High Add-On Premi...   
4  Low Customer Tenure, High Add-On Premium, Low ...   

                              Formatted Churn Reason  
0  The reasons for churn are rooted in limited te...  
1  Customer churn is influenced by short tenure, ...  
2  Drivers behind customer churn include inconsis...  
3  Churn stems from lost discounts, high premiums...  
4  Churn occurs due to limited tenure, expensive ...  


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [7]:
import pandas as pd
import random
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load the dataset
data = pd.read_excel('Unique Reasons (Predicted).xlsx')

# Initialize a list to store formatted churn reasons
formatted_reasons = []

# Initialize TextRank summarizer
summarizer = TextRankSummarizer()

# Process each churn reason in the dataset
for input_text in data["Churn Reason"]:
    # Parse the input text
    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))
    summary_sentences = summarizer(parser.document, 1)  # Extract 1 summary sentence
    summary = " ".join(str(sentence) for sentence in summary_sentences)
    
    # Replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        summary = summary.replace(verbose, random.choice(options))
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to",
        "Customer churn is influenced by",
        "The underlying causes of churn are",
        "Reasons for churn involve",
        "Churn stems from",
        "The main causes of churn are",
        "Churn happens because of",
        "Drivers behind customer churn include",
        "Churn arises due to",
        "The reasons for churn are rooted in",
        "Key factors leading to churn are"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {summary}."
    formatted_reasons.append(formatted_text)

# Add the formatted reasons to the dataset
data["Formatted Churn Reason"] = formatted_reasons

# Save the updated dataset to a new Excel file
data.to_excel("Formatted_Churn_Reasons_Using_TextRank (Predicted).xlsx", index=False)

# Print the first few rows of the updated dataset
print(data.head())

                                        Churn Reason  \
0  High Discount Not Sustained, High Add-On Premi...   
1  High Discount Not Sustained, High Add-On Premi...   
2  High Discount Not Sustained, Low Engagement (S...   
3  Low Discount Offered, Low No Claim Bonus (NCB)...   
4  Low Discount Offered, Low No Claim Bonus (NCB)...   

                              Formatted Churn Reason  
0  Churn can be attributed to lost discounts, exp...  
1  The underlying causes of churn are lost discou...  
2  The underlying causes of churn are unsustained...  
3  Churn occurs due to reduced discounts, low NCB...  
4  Drivers behind customer churn include minimal ...  


In [8]:
import pandas as pd
import random
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Define replacements with multiple options for each verbose term
replacements = {
    "Low Customer Tenure": ["low tenure", "short tenure", "limited tenure"],
    "Low Discount Offered": ["low discounts", "minimal discounts", "reduced discounts"],
    "High Discount Not Sustained": ["unsustained discounts", "inconsistent discounts", "lost discounts"],
    "High Add-On Premium": ["high premiums", "expensive add-ons", "increased add-on costs"],
    "Low No Claim Bonus (NCB)": ["low NCB", "minimal NCB", "reduced NCB"],
    "High Total Premium Payable": ["high total premiums", "expensive premiums", "increased payable premiums"],
    "Frequent Claim Declines": ["rejected claims", "declined claims", "frequent claim rejections"],
    "High Own-Damage Premium": ["high OD premiums", "expensive own-damage costs", "increased OD premiums"],
    "High TP Premium": ["high TP premiums", "costly third-party premiums", "increased TP costs"],
    "Low Engagement (Single Vehicle)": ["low engagement", "limited engagement", "single-vehicle focus"],
    "Multiple Claims Filed": ["frequent claims", "multiple claim submissions", "excessive claims"],
    "Single Policy (Low Engagement)": ["single policies", "low engagement policies", "limited policy activity"]
}

# Load the dataset
data = pd.read_excel('Unique Reasons (Actual).xlsx')

# Initialize a list to store formatted churn reasons
formatted_reasons = []

# Initialize TextRank summarizer
summarizer = TextRankSummarizer()

# Process each churn reason in the dataset
for input_text in data["Churn Reason"]:
    # Parse the input text
    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))
    summary_sentences = summarizer(parser.document, 1)  # Extract 1 summary sentence
    summary = " ".join(str(sentence) for sentence in summary_sentences)
    
    # Replace verbose terms with randomly chosen concise terms
    for verbose, options in replacements.items():
        summary = summary.replace(verbose, random.choice(options))
    
    # Add 'and' before the last item
    summary_parts = summary.split(', ')
    if len(summary_parts) > 1:
        summary = ', '.join(summary_parts[:-1]) + ' and ' + summary_parts[-1]
    
    # Add a random opening phrase
    opening_phrases = [
        "The primary drivers of churn are",
        "Churn occurs due to",
        "Key reasons for customer churn include",
        "Factors contributing to churn are",
        "Churn can be attributed to",
        "Customer churn is influenced by",
        "The underlying causes of churn are",
        "Reasons for churn involve",
        "Churn stems from",
        "The main causes of churn are",
        "Churn happens because of",
        "Drivers behind customer churn include",
        "Churn arises due to",
        "The reasons for churn are rooted in",
        "Key factors leading to churn are"
    ]
    opening_phrase = random.choice(opening_phrases)
    
    # Create the final formatted text
    formatted_text = f"{opening_phrase} {summary}."
    formatted_reasons.append(formatted_text)

# Add the formatted reasons to the dataset
data["Formatted Churn Reason"] = formatted_reasons

# Save the updated dataset to a new Excel file
data.to_excel("Formatted_Churn_Reasons_Using_TextRank (Actual).xlsx", index=False)

# Print the first few rows of the updated dataset
print(data.head())

                                        Churn Reason  \
0  Low Customer Tenure, High Discount Not Sustain...   
1  Low Customer Tenure, Low Discount Offered, Low...   
2  High Discount Not Sustained, High Add-On Premi...   
3  High Discount Not Sustained, High Add-On Premi...   
4  Low Customer Tenure, High Add-On Premium, Low ...   

                              Formatted Churn Reason  
0  Customer churn is influenced by short tenure, ...  
1  Churn occurs due to low tenure, low discounts,...  
2  Churn can be attributed to unsustained discoun...  
3  Factors contributing to churn are inconsistent...  
4  The primary drivers of churn are limited tenur...  
