In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the competition's train and test splits from the Kaggle input directory.
DATA_DIR = "/kaggle/input/churn-detection"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# The EDA and preprocessing cells below all operate on `df`, which was never
# assigned (NameError on a fresh kernel). Explore a copy of the training data
# so that `train` itself stays raw for the training section.
df = train.copy()

# EDA (Exploratory Data Analysis)

In [None]:
# Relative frequency of each Churn label (fractions rather than raw counts).
(
    df['Churn']
    .value_counts(normalize=True)
)

In [None]:
# Box plot of tenure, one box per Churn label.
ax = sns.boxplot(data=df, x='Churn', y='tenure')
ax.set(title='Tenure vs Churn', xlabel='Churn', ylabel='Tenure')
plt.show()

In [None]:
# Customer counts per contract type, with bars split by Churn label.
ax = sns.countplot(data=df, x='Contract', hue='Churn')
ax.set_title('Churn Distribution by Contract Type')
ax.set_xlabel('Contract Type')
ax.set_ylabel('Number of Customers')
ax.legend(title='Churn')
plt.show()

In [None]:
# Box plot of monthly charges, one box per Churn label.
# Axis labels are left at the defaults seaborn derives from the column names.
ax = sns.boxplot(data=df, x='Churn', y='MonthlyCharges')
ax.set_title('Monthly Charges vs. Churn')
plt.show()

In [None]:
# Scatter of tenure against monthly charges, coloured by Churn label.
# alpha=0.6 makes the markers semi-transparent.
ax = sns.scatterplot(
    data=df,
    x='tenure',
    y='MonthlyCharges',
    hue='Churn',
    alpha=0.6,
)
ax.set(
    title='Tenure vs. Monthly Charges (Churn Highlighted)',
    xlabel='Tenure',
    ylabel='Monthly Charges ($)',
)
ax.legend(title='Churn')
plt.show()

In [None]:
# Box plot of monthly charges, one box per contract type.
ax = sns.boxplot(data=df, x='Contract', y='MonthlyCharges')
ax.set(
    title='Monthly Charges by Contract Type',
    xlabel='Contract Type',
    ylabel='Monthly Charges ($)',
)
plt.show()

In [None]:
# Cross-tabulate StreamingMovies against Churn. normalize='index' makes each
# StreamingMovies row sum to 1; multiplying by 100 turns that into percentages.
pd.crosstab(
    index=df['StreamingMovies'],
    columns=df['Churn'],
    normalize='index',
).mul(100)

# Preprocessing

In [None]:
# Convert TotalCharges to numeric; entries that cannot be parsed become NaN
# (errors='coerce') and are then replaced with 0.
total_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
df['TotalCharges'] = total_charges.fillna(0)

In [None]:
# Yes/No columns encoded as 1/0. Any value other than 'Yes' or 'No' maps to NaN,
# so re-running this cell on already-encoded data blanks the columns.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
yes_no = {'Yes': 1, 'No': 0}
for col in binary_cols:
    df[col] = df[col].map(yes_no)

In [None]:
# Ordinal encoding of the contract type: 0, 1, 2 in the order listed.
contract_map = dict(zip(('Month-to-month', 'One year', 'Two year'), range(3)))
df['Contract'] = df['Contract'].map(contract_map)

In [None]:
# Categorical columns to one-hot encode. drop_first=True omits the first
# level of each column from the generated dummy columns.
cat_cols = [
    'gender',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaymentMethod',
]
df = pd.get_dummies(data=df, columns=cat_cols, drop_first=True)

In [None]:
# Engineered ratio features. The +1 in each denominator avoids dividing by
# zero when MonthlyCharges or tenure is 0.
df['tenure_to_charge_ratio'] = df['tenure'].div(df['MonthlyCharges'].add(1))
df['avg_charge_per_month'] = df['TotalCharges'].div(df['tenure'].add(1))

# Training

In [2]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

In [None]:
def preprocess(frame):
    """Return a feature-engineered copy of a raw customer frame.

    `preprocess` was called here but never defined anywhere in the notebook
    (NameError). This definition applies the same steps, in the same order,
    as the step-by-step cells of the Preprocessing section.

    Relies on the module-level `binary_cols`, `contract_map` and `cat_cols`
    defined in the Preprocessing section.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw rows containing at least TotalCharges, tenure, MonthlyCharges,
        Contract, and every column named in `binary_cols` and `cat_cols`.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = frame.copy()
    # Unparseable TotalCharges entries become NaN and are then set to 0.
    out['TotalCharges'] = pd.to_numeric(out['TotalCharges'], errors='coerce').fillna(0)
    out[binary_cols] = out[binary_cols].apply(lambda col: col.map({'Yes': 1, 'No': 0}))
    out['Contract'] = out['Contract'].map(contract_map)
    out = pd.get_dummies(out, columns=cat_cols, drop_first=True)
    # +1 in the denominators avoids division by zero.
    out['tenure_to_charge_ratio'] = out['tenure'] / (out['MonthlyCharges'] + 1)
    out['avg_charge_per_month'] = out['TotalCharges'] / (out['tenure'] + 1)
    return out


# Feature matrix from the raw training rows (target column removed first).
X = preprocess(train.drop('Churn', axis=1))
# Binary target: 'Yes' -> 1, 'No' -> 0.
y = train['Churn'].map({'Yes': 1, 'No': 0})

In [None]:
# Weight for the positive class: number of 0 labels divided by number of 1 labels.
n_negative = int((y == 0).sum())
n_positive = int((y == 1).sum())
scale_pos_weight = n_negative / n_positive

xgb_params = dict(
    scale_pos_weight=scale_pos_weight,
    eval_metric='logloss',
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    learning_rate=0.1,
    n_estimators=200,
    random_state=42,
)
xgb = XGBClassifier(**xgb_params)

# Fit on the full training matrix; no validation split is held out here.
xgb.fit(X, y)

In [None]:
    test['TotalCharges'] = pd.to_numeric(test['TotalCharges'], errors='coerce').fillna(0)
    test[binary_cols] = test[binary_cols].apply(lambda x: x.map({'Yes': 1, 'No': 0}))


    test['Contract'] = test['Contract'].map(contract_map)
    test = pd.get_dummies(test, columns=cat_cols, drop_first=True)


    test['tenure_to_charge_ratio'] = test['tenure'] / (test['MonthlyCharges'] + 1)
    test['avg_charge_per_month'] = test['TotalCharges'] / (test['tenure'] + 1)

In [None]:
# Give the test frame exactly the training columns, in the training order.
# The test frame is one-hot encoded on its own, so a category that never
# appears in it leaves the matching dummy column missing, and a strict
# `test[X.columns]` selection would raise KeyError. reindex() creates any
# missing column filled with 0 ("category not present") and drops extras.
# NOTE(review): columns not in X.columns are dropped from `test` here, and the
# submission cell reads test['id'] - confirm 'id' is among X.columns.
test = test.reindex(columns=X.columns, fill_value=0)

# Probability of the positive class (label 1), thresholded at 0.5.
test_probas = xgb.predict_proba(test)[:, 1]
test_preds = (test_probas >= 0.5).astype(int)

# Submission

In [None]:
# Convert the 0/1 predictions back to 'Yes'/'No' labels and write the
# submission file, without the DataFrame index.
churn_labels = np.where(test_preds == 1, 'Yes', 'No')
submission = pd.DataFrame({'id': test['id'], 'Churn': churn_labels})
submission.to_csv('sub.csv', index=False)