In [None]:
from google.colab import drive
import pandas as pd

# Mount Google Drive so the workbook stored there is reachable from the Colab VM.
drive.mount('/content/drive')

# Location of the Telco churn workbook inside the mounted drive.
file_path = '/content/drive/My Drive/Telco_customer_churn.xlsx' # Replace with the actual path to your file

# Read the Excel file into a pandas DataFrame.
# Every later cell needs `df`, so a failed read must stop the notebook here:
# a readable hint is printed and the exception is then re-raised instead of
# being swallowed (which would only surface later as a NameError on `df`).
try:
    df = pd.read_excel(file_path)
except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please make sure the file exists in your Google Drive.")
    raise
except Exception as e:
    print(f"An error occurred while reading the Excel file: {e}")
    raise
else:
    # Only reached when the read succeeded, so display problems are not
    # misreported as read errors.
    print("DataFrame loaded successfully:")
    display(df.head())

Mounted at /content/drive
DataFrame loaded successfully:


Unnamed: 0,CustomerID,Count,Country,State,City,Zip Code,Lat Long,Latitude,Longitude,Gender,...,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges,Churn Label,Churn Value,Churn Score,CLTV,Churn Reason
0,3668-QPYBK,1,United States,California,Los Angeles,90003,"33.964131, -118.272783",33.964131,-118.272783,Male,...,Month-to-month,Yes,Mailed check,53.85,108.15,Yes,1,86,3239,Competitor made better offer
1,9237-HQITU,1,United States,California,Los Angeles,90005,"34.059281, -118.30742",34.059281,-118.30742,Female,...,Month-to-month,Yes,Electronic check,70.7,151.65,Yes,1,67,2701,Moved
2,9305-CDSKC,1,United States,California,Los Angeles,90006,"34.048013, -118.293953",34.048013,-118.293953,Female,...,Month-to-month,Yes,Electronic check,99.65,820.5,Yes,1,86,5372,Moved
3,7892-POOKP,1,United States,California,Los Angeles,90010,"34.062125, -118.315709",34.062125,-118.315709,Female,...,Month-to-month,Yes,Electronic check,104.8,3046.05,Yes,1,84,5003,Moved
4,0280-XJGEX,1,United States,California,Los Angeles,90015,"34.039224, -118.266293",34.039224,-118.266293,Male,...,Month-to-month,Yes,Bank transfer (automatic),103.7,5036.3,Yes,1,89,5340,Competitor had better devices


In [None]:
# Columns removed before modelling: identifiers, the duplicated "Lat Long"
# text field, and the churn-derived fields that sit next to the target.
drop_cols = [
    "CustomerID",
    "Count",
    "Lat Long",
    "Churn Label",
    "Churn Reason",
    "Churn Score",
    "CLTV",
]

# Keep every column that is not listed; names missing from the frame are
# simply skipped, so no error is raised for them.
keep_mask = ~df.columns.isin(drop_cols)
df = df.loc[:, keep_mask].copy()

In [None]:
# Separate the label column from the feature matrix.
target_col = "Churn Value"
X = df.drop(columns=[target_col], errors="ignore")
y = df[target_col]

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score

In [None]:
# 70/30 hold-out split, stratified on the target so both parts keep the same
# class balance; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split

In [None]:
import numpy as np

In [None]:
# Group feature names by storage dtype. Note that "object" only means the
# column is not stored as numbers; it does not guarantee the column is a true
# categorical (a numeric column containing stray text also ends up here).
numeric_features = list(X.select_dtypes(include="number").columns)
categorical_features = list(X.select_dtypes(include="object").columns)

In [None]:
# Preview the training feature frame produced by the split above.
X_train

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,...,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges
5963,United States,California,Orange,92869,33.792791,-117.789749,Female,No,No,No,...,No,No,No,Yes,No,Month-to-month,No,Electronic check,80.20,384.25
596,United States,California,Oak View,93022,34.404544,-119.302118,Female,Yes,No,No,...,No,Yes,No,Yes,No,Month-to-month,Yes,Electronic check,86.85,220.95
1836,United States,California,Encinitas,92024,33.054579,-117.256650,Female,No,Yes,No,...,No,No,No,No,No,Month-to-month,Yes,Credit card (automatic),75.15,216.75
3535,United States,California,Corona,92881,33.833686,-117.513063,Female,No,No,No,...,No,Yes,Yes,Yes,Yes,One year,No,Credit card (automatic),80.55,4847.05
228,United States,California,San Bruno,94066,37.624436,-122.430661,Female,No,No,No,...,No,No,Yes,Yes,Yes,Month-to-month,Yes,Bank transfer (automatic),98.90,1120.95
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006,United States,California,Benicia,94510,38.113533,-122.119260,Male,No,No,No,...,Yes,Yes,Yes,Yes,No,One year,No,Credit card (automatic),104.50,6590.80
2964,United States,California,Chico,95928,39.681488,-121.837210,Male,No,No,No,...,No,No,No,No,No,Month-to-month,No,Mailed check,51.25,51.25
6507,United States,California,Zenia,95595,40.170357,-123.417298,Female,No,Yes,No,...,Yes,Yes,Yes,Yes,Yes,Two year,No,Electronic check,109.25,7707.70
3870,United States,California,Novato,94947,38.112166,-122.634384,Female,No,Yes,No,...,No internet service,No internet service,No internet service,No internet service,No internet service,One year,No,Credit card (automatic),20.40,482.80


In [None]:
# Look up the dtype of every column previously listed as categorical.
categorical_dtypes = X[categorical_features].dtypes

# Keep only the names whose dtype is 'object' (how pandas stores strings here).
string_categorical_features = [
    name for name, dtype in categorical_dtypes.items() if dtype == 'object'
]

print("Categorical features with string dtype:")
print(string_categorical_features)

Categorical features with string dtype:
['Country', 'State', 'City', 'Gender', 'Senior Citizen', 'Partner', 'Dependents', 'Phone Service', 'Multiple Lines', 'Internet Service', 'Online Security', 'Online Backup', 'Device Protection', 'Tech Support', 'Streaming TV', 'Streaming Movies', 'Contract', 'Paperless Billing', 'Payment Method', 'Total Charges']


In [None]:
import pandas as pd
import numpy as np

def _blank_to_nan(column):
    """Return ``column`` with empty / whitespace-only strings replaced by NaN.

    Only text-like (object or string dtype) columns are touched; any other
    dtype is returned unchanged.
    """
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if not is_text:
        return column
    cleaned = [
        np.nan if isinstance(value, str) and not value.strip() else value
        for value in column.astype(object)
    ]
    # An explicit object dtype keeps pandas from silently re-inferring the type.
    return pd.Series(cleaned, index=column.index, name=column.name, dtype=object)


def _looks_numeric(column):
    """Tell whether every non-missing value of ``column`` parses as a number."""
    if pd.api.types.is_numeric_dtype(column):
        return True
    if not pd.api.types.is_object_dtype(column):
        return False
    present = column.notna()
    if not present.any():
        # Nothing to judge from: leave the column on the category-code path.
        return False
    parsed = pd.to_numeric(column, errors='coerce')
    return bool(parsed[present].notna().all())


def convert_to_numeric_categories(df, reference=None):
    """
    Return a numeric copy of ``df``; the input frame is not modified.

    Columns that already have a numeric dtype are passed through untouched.
    For every other column, empty / whitespace-only strings are first turned
    into NaN, and then:

    * if all remaining values parse as numbers (for example amounts stored as
      text next to a few blank cells), the column is parsed with
      ``pd.to_numeric`` so the real values are kept and blanks become NaN;
    * otherwise the column is replaced by integer category codes, with
      missing values encoded as -1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    reference : pandas.DataFrame, optional
        Raw (not yet converted) frame that defines the encoding. When given,
        the numeric-vs-categorical decision and the category-to-code mapping
        of each shared column are taken from ``reference`` instead of ``df``,
        so different frames receive the same code for the same value. Values
        of ``df`` that do not occur in ``reference`` are encoded as -1.
        Columns absent from ``reference`` fall back to ``df`` itself.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every column is numeric.
    """
    df_converted = df.copy()

    for col in df_converted.columns:
        if pd.api.types.is_numeric_dtype(df_converted[col]):
            continue

        column = _blank_to_nan(df_converted[col])

        # The column that decides how to encode: the reference one if present.
        if reference is not None and col in reference.columns:
            basis = _blank_to_nan(reference[col])
        else:
            basis = column

        if _looks_numeric(basis):
            # Numbers stored as text: keep their values rather than a code.
            df_converted[col] = pd.to_numeric(column, errors='coerce')
        else:
            categories = basis.astype('category').cat.categories
            df_converted[col] = pd.Categorical(column, categories=categories).codes

    return df_converted



In [None]:
# Replace the training features with their all-numeric encoding.
X_train = convert_to_numeric_categories(X_train)

In [None]:
# Preview the training target (the "Churn Value" column selected into y).
y_train

Unnamed: 0,Churn Value
5963,0
596,1
1836,1
3535,0
228,1
...,...
5006,0
2964,0
6507,0
3870,0


In [None]:
# Show X_train again now that it has been passed through convert_to_numeric_categories.
X_train

Unnamed: 0,Country,State,City,Zip Code,Latitude,Longitude,Gender,Senior Citizen,Partner,Dependents,...,Online Backup,Device Protection,Tech Support,Streaming TV,Streaming Movies,Contract,Paperless Billing,Payment Method,Monthly Charges,Total Charges
5963,0,0,712,92869,33.792791,-117.789749,0,0,0,0,...,0,0,0,2,0,0,0,2,80.20,996
596,0,0,695,93022,34.404544,-119.302118,0,1,0,0,...,0,2,0,2,0,0,1,2,86.85,676
1836,0,0,304,92024,33.054579,-117.256650,0,0,1,0,...,0,0,0,0,0,0,1,1,75.15,663
3535,0,0,218,92881,33.833686,-117.513063,0,0,0,0,...,0,2,2,2,2,1,0,1,80.55,3778
228,0,0,862,94066,37.624436,-122.430661,0,0,0,0,...,0,0,2,2,2,0,1,0,98.90,1953
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5006,0,0,72,94510,38.113533,-122.119260,1,0,0,0,...,2,2,2,2,0,1,0,1,104.50,4310
2964,0,0,184,95928,39.681488,-121.837210,1,0,0,0,...,0,0,0,0,0,0,0,3,51.25,164
6507,0,0,1123,95595,40.170357,-123.417298,0,0,1,0,...,2,2,2,2,2,2,0,2,109.25,4539
3870,0,0,689,94947,38.112166,-122.634384,0,0,1,0,...,1,1,1,1,1,1,0,1,20.40,1161


In [None]:
!pip install tabpfn torch


Collecting tabpfn
  Downloading tabpfn-6.0.5-py3-none-any.whl.metadata (38 kB)
Collecting eval-type-backport>=0.2.2 (from tabpfn)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting tabpfn-common-utils>=0.1.8 (from tabpfn-common-utils[telemetry-interactive]>=0.1.8->tabpfn)
  Downloading tabpfn_common_utils-0.2.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kditransform>=1.2 (from tabpfn)
  Downloading kditransform-1.2.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog~=6.7 (from tabpfn-common-utils>=0.1.8->tabpfn-common-utils[telemetry-interactive]>=0.1.8->tabpfn)
  Downloading posthog-6.9.0-py3-none-any.whl.metadata (6.0 kB)
Collecting requests (from huggingface-hub<2,>=0.19.0->tabpfn)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting backoff>=1.10.0 (from posthog~=6.7->tabpfn-common-utils>=0.1.8->tabpfn-common-utils[telemetry-interactive]>=0.1.8->tabpfn)
  Downloading backoff-2.2.1-py3-none-any.whl.metadata (14 kB)
Down

In [None]:
import torch
import pandas as pd
from tabpfn import TabPFNClassifier


In [None]:
# Create the classifier (v2 model)
from tabpfn import TabPFNClassifier
import torch

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# NOTE(review): model_path still looks like a template placeholder
# ("/path/to/...") - confirm it points at a real checkpoint before relying on it.
clf = TabPFNClassifier(
    model_path="/path/to/tabpfn-v2-classifier-v2_default.ckpt",
    device=device,
)
clf.fit(X_train, y_train)



In [None]:
# Predictions for the same rows the classifier was fitted on (in-sample).
y_pred = clf.predict(X_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Training-set metrics: overall accuracy, then the per-class breakdown.
train_accuracy = accuracy_score(y_train, y_pred)
print("Accuracy:", train_accuracy)

train_report = classification_report(y_train, y_pred)
print(train_report)


Accuracy: 0.8328600405679513
              precision    recall  f1-score   support

           0       0.86      0.92      0.89      3622
           1       0.72      0.60      0.66      1308

    accuracy                           0.83      4930
   macro avg       0.79      0.76      0.77      4930
weighted avg       0.83      0.83      0.83      4930



In [None]:
# Replace the held-out features with their all-numeric encoding.
# NOTE(review): the category codes are derived from the values present in
# X_test alone, so a given category can receive a different code here than it
# did in X_train; the encoding should be shared between the two splits.
X_test = convert_to_numeric_categories(X_test)

  df_converted = df_converted.replace(r'^\s*$', np.nan, regex=True)


In [None]:
# Predictions for the held-out rows.
y_test_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Held-out metrics: overall accuracy, then the per-class breakdown.
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Accuracy:", test_accuracy)

test_report = classification_report(y_test, y_test_pred)
print(test_report)

Accuracy: 0.8116422148603881
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      1552
           1       0.67      0.58      0.62       561

    accuracy                           0.81      2113
   macro avg       0.76      0.74      0.75      2113
weighted avg       0.81      0.81      0.81      2113

