In [1]:
import pandas as pd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

data_path = "../data/Telco_customer_churn.csv"

# Load the dataframe
try:
    df = pd.read_csv(data_path)

    # Inspect the 'Churn Reason' column
    print("Unique values in 'Churn Reason' (including NaN):")
    unique_reasons = df['Churn Reason'].unique()
    print(unique_reasons)

    print("\nValue counts for 'Churn Reason':")
    print(df['Churn Reason'].value_counts(dropna=False))

    # Check relationship between 'Churn Label' and 'Churn Reason'
    print("\nCrosstab of 'Churn Label' and 'Churn Reason' (checking for NaN):")
    print(pd.crosstab(df['Churn Label'], df['Churn Reason'].isna(), margins=True))

    # See the first few rows to understand column names
    print("\nDataFrame Head:")
    print(df.head())

    print("\nDataFrame Info:")
    df.info()

except FileNotFoundError:
    print("Error: The file 'Telco_customer_churn.csv' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

Unique values in 'Churn Reason' (including NaN):
['Competitor made better offer' 'Moved' 'Competitor had better devices'
 'Competitor offered higher download speeds'
 'Competitor offered more data' 'Price too high' 'Product dissatisfaction'
 'Service dissatisfaction' 'Lack of self-service on Website'
 'Network reliability' 'Limited range of services'
 'Lack of affordable download/upload speed' 'Long distance charges'
 'Extra data charges' "Don't know" 'Poor expertise of online support'
 'Poor expertise of phone support' 'Attitude of service provider'
 'Attitude of support person' 'Deceased' nan]

Value counts for 'Churn Reason':
Churn Reason
NaN                                          5174
Attitude of support person                    192
Competitor offered higher download speeds     189
Competitor offered more data                  162
Don't know                                    154
Competitor made better offer                  140
Attitude of service provider                  135


In [2]:
import pandas as pd
import random
import numpy as np

# Load the dataframe
try:
    df = pd.read_csv(data_path)

    # Define the "bins" and review templates
    # Negative reviews (complaints) based on Churn Reason
    complaint_map = {
        'Competitor made better offer': ("Competitor", "I'm leaving because a competitor made me a much better offer. Your prices just aren't competitive."),
        'Competitor had better devices': ("Competitor", "Your equipment is outdated. I'm switching to a competitor who has better devices available."),
        'Competitor offered higher download speeds': ("Competitor", "The download speeds I was getting are too slow. I found another provider who offers much faster internet."),
        'Competitor offered more data': ("Competitor", "Your data caps are too restrictive. I'm moving to a competitor that offers more data for a similar price."),
        'Price too high': ("Price", "The monthly bill is just too high for the service provided. I'm looking for a more affordable option."),
        'Extra data charges': ("Price", "I keep getting hit with unexpected extra data charges. The billing is unclear and too expensive."),
        'Long distance charges': ("Price", "The long distance charges are outrageous. It's not worth keeping the service with these fees."),
        'Lack of affordable download/upload speed': ("Price", "I can't get the speeds I need for a reasonable price. I'm forced to look elsewhere."),
        'Product dissatisfaction': ("Service/Product", "I'm just not satisfied with the product overall. It doesn't meet my needs."),
        'Service dissatisfaction': ("Service/Product", "The service has been unreliable and I'm generally dissatisfied. It's time for a change."),
        'Network reliability': ("Service/Product", "The network is not reliable at all. I have constant dropouts and connection issues."),
        'Limited range of services': ("Service/Product", "You don't offer the full range of services I'm looking for. I need a provider that has more options."),
        'Poor expertise of online support': ("Customer Support", "The online support team had no idea how to solve my problem. Their lack of expertise was frustrating."),
        'Poor expertise of phone support': ("Customer Support", "I called support and the person I spoke to was not helpful at all. They seemed poorly trained."),
        'Attitude of service provider': ("Customer Support", "The attitude of the company and its staff is very poor. I feel like a valued customer."),
        'Attitude of support person': ("Customer Support", "The support person I spoke with was rude and unhelpful. It was a terrible customer service experience."),
        'Lack of self-service on Website': ("Customer Support", "Your website is impossible to use. I can't even manage my own account without calling in."),
        'Moved': ("Other", "I'm discontinuing service because I moved to an area you don't cover. It's a shame."),
        "Don't know": ("Other", "I'm leaving for various reasons. It's just not working out for me anymore."),
        'Deceased': ("Other", "This account is being closed because the primary holder is deceased. Please stop all billing."),
    }

    # Positive reviews (for non-churners), mapped to the *same* bins
    positive_review_templates = {
        "Competitor": [
            "I've shopped around and you still have the best offers. Really happy with the value I'm getting.",
            "Your devices and equipment are top-notch. Way better than what the competition is offering."
        ],
        "Price": [
            "The price is very fair for the service I receive. My monthly bill is predictable and affordable.",
            "I get great speeds for what I pay. Definitely feel like I'm getting a good deal."
        ],
        "Service/Product": [
            "The service is incredibly reliable. I never have issues with downtime or slow speeds.",
            "I'm very satisfied with the product. It works perfectly and does exactly what I need it to do."
        ],
        "Customer Support": [
            "I had to call support once and they were fantastic. The representative was knowledgeable and solved my issue quickly.",
            "The website is so easy to use. I can manage my account and find information without any hassle."
        ]
    }

    # Get the lists of churned and non-churned customers
    churned_df = df[df['Churn Label'] == 'Yes'].copy()
    non_churned_df = df[df['Churn Label'] == 'No'].copy()

    # Sample 27 churned and 73 non-churned
    # Use min() to avoid errors if there are fewer customers than desired
    n_churned = min(27, len(churned_df))
    n_non_churned = min(73, len(non_churned_df))

    # Ensure we get 100 total if possible, adjusting if one group is too small
    if n_churned < 27:
        n_non_churned = min(100 - n_churned, len(non_churned_df))
    elif n_non_churned < 73:
        n_churned = min(100 - n_non_churned, len(churned_df))

    churned_sample = churned_df.sample(n=n_churned, random_state=42)
    non_churned_sample = non_churned_df.sample(n=n_non_churned, random_state=42)

    # Combine the samples
    final_sample = pd.concat([churned_sample, non_churned_sample])

    # List to store the new review data
    generated_reviews = []

    # Get the categories for positive reviews
    positive_categories = list(positive_review_templates.keys())

    # Generate reviews
    for _, row in final_sample.iterrows():
        customer_id = row['CustomerID']

        if row['Churn Label'] == 'Yes':
            churn_reason = row['Churn Reason']
            # Get the category and review template from the map
            category, review = complaint_map.get(churn_reason, ("Other", "I am leaving for a reason not listed."))
            generated_reviews.append({
                "CustomerID": customer_id,
                "Generated_Review": review,
                "Review_Category": category,
                "Original_Churn_Reason": churn_reason
            })
        else:
            # It's a positive review. Pick a random category.
            category = random.choice(positive_categories)
            # Pick a random review from that category's list
            review = random.choice(positive_review_templates[category])
            generated_reviews.append({
                "CustomerID": customer_id,
                "Generated_Review": review,
                "Review_Category": category,
                "Original_Churn_Reason": "N/A (Not Churned)"
            })

    # Create the final DataFrame
    reviews_df = pd.DataFrame(generated_reviews)
    out_path = "../pipeline/customer_reviews.csv"

    # Save to CSV
    reviews_df.to_csv(out_path, index=False)

    print(f"Successfully generated {len(reviews_df)} reviews and saved to 'customer_reviews.csv'.")
    print("\nHead of the new reviews DataFrame:")
    print(reviews_df.head())
    print("\nReview Category distribution:")
    print(reviews_df['Review_Category'].value_counts())

except Exception as e:
    print(f"An error occurred during review generation: {e}")

Successfully generated 100 reviews and saved to 'customer_reviews.csv'.

Head of the new reviews DataFrame:
   CustomerID  \
0  6302-JGYRJ   
1  2320-JRSDE   
2  2332-EFBJY   
3  1624-WOIWJ   
4  9391-EOYLI   

                                                                                     Generated_Review  \
0                          I'm just not satisfied with the product overall. It doesn't meet my needs.   
1              The attitude of the company and its staff is very poor. I feel like a valued customer.   
2           Your website is impossible to use. I can't even manage my own account without calling in.   
3         Your equipment is outdated. I'm switching to a competitor who has better devices available.   
4  I'm leaving because a competitor made me a much better offer. Your prices just aren't competitive.   

    Review_Category            Original_Churn_Reason  
0   Service/Product          Product dissatisfaction  
1  Customer Support     Attitude of service prov