In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Preprocessing for the telecom churn dataset: load, clean, encode, split, scale.

# Load the raw data
dataset = pd.read_csv('/content/telecom_churn.csv')

# 1. Drop columns that are not used as features
dataset = dataset.drop(['customer_id', 'pincode'], axis=1)

# 2. Replace 'date_of_registration' with the number of whole days since registration.
#    NOTE(review): the reference date is the day the notebook is executed, so the
#    raw values of this feature change from run to run — consider pinning a fixed
#    reference date if exact reproducibility matters.
dataset['date_of_registration'] = pd.to_datetime(dataset['date_of_registration'])
today = pd.to_datetime("today")
dataset['days_since_registration'] = (today - dataset['date_of_registration']).dt.days
dataset = dataset.drop('date_of_registration', axis=1)

# 3. Clamp negative 'data_used' values to 0 (vectorized; missing values stay missing)
dataset['data_used'] = dataset['data_used'].clip(lower=0)

# 4. One-hot encode categorical columns.
#    drop_first=True removes the first category of each column, so a row whose
#    dummy columns are all False belongs to that dropped (baseline) category.
dataset = pd.get_dummies(dataset, columns=['telecom_partner', 'gender', 'state', 'city'], drop_first=True)

# 5. Separate features and target
X = dataset.drop('churn', axis=1)
y = dataset['churn']

# 6. Split into train/test.
#    stratify=y keeps the churn / non-churn ratio the same in both splits, which
#    matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 7. Scale features: fit the scaler on the training split only, then apply the
#    same transformation to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Fit a random forest on the scaled training split and score it on the held-out split.
# - random_state pins the forest's internal randomness so results are reproducible
#   (same seed value as the train/test split).
# - class_weight="balanced" weights classes inversely to their frequency; the
#   recorded run of the unweighted model predicted the majority class for every
#   test row. NOTE(review): this may not be sufficient on its own — verify the
#   confusion matrix after re-running.
# - n_jobs=-1 trains the trees on all available cores.
model = RandomForestClassifier(random_state=42, class_weight="balanced", n_jobs=-1)
model.fit(X_train_scaled, y_train)

# Hard class predictions for the test split
y_pred = model.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7991624068485558
Confusion Matrix:
 [[38928     0]
 [ 9783     0]]


In [None]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Detailed evaluation of the fitted model on the held-out split.
print(confusion_matrix(y_test, y_pred))

# zero_division=0 reports 0.0 for precision/F1 of a class that is never predicted,
# instead of emitting an undefined-metric warning for each average.
print(classification_report(y_test, y_pred, zero_division=0))

# ROC AUC uses the predicted probability of the second class in model.classes_
# (column 1 of predict_proba), not the hard predictions.
print("ROC AUC Score:", roc_auc_score(y_test, model.predict_proba(X_test_scaled)[:, 1]))

[[38928     0]
 [ 9783     0]]
              precision    recall  f1-score   support

           0       0.80      1.00      0.89     38928
           1       0.00      0.00      0.00      9783

    accuracy                           0.80     48711
   macro avg       0.40      0.50      0.44     48711
weighted avg       0.64      0.80      0.71     48711



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ROC AUC Score: 0.5030008484777293


In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class on the training split only (the test split is
# left untouched). random_state makes the synthetic samples reproducible.
# NOTE(review): X_train_res / y_train_res are not used by any later cell visible
# here — the model would need to be refit on them for the resampling to matter.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

In [None]:
# Read the churn data that already carries model predictions, then preview
# its first five rows.
df = pd.read_csv(filepath_or_buffer='/content/churn_data_with_predictions.csv')
df.head(n=5)

Unnamed: 0,age,num_dependents,estimated_salary,calls_made,sms_sent,data_used,churn,days_since_registration,telecom_partner_BSNL,telecom_partner_Reliance Jio,...,state_Tripura,state_Uttar Pradesh,state_Uttarakhand,state_West Bengal,city_Chennai,city_Delhi,city_Hyderabad,city_Kolkata,city_Mumbai,churn_prediction
0,25,4,124962,44,45,0,0,1959,False,True,...,False,False,False,False,False,False,False,True,False,0
1,55,2,130556,62,39,5973,0,1959,False,True,...,False,False,False,False,False,False,False,False,True,0
2,57,0,148828,49,24,193,1,1959,False,False,...,False,False,False,False,False,True,False,False,False,1
3,46,1,38722,80,25,9377,1,1959,True,False,...,False,False,False,False,False,False,False,True,False,1
4,26,2,55098,78,15,1393,0,1959,True,False,...,True,False,False,False,False,True,False,False,False,0


In [None]:
import pandas as pd

# Rebuild readable categorical columns from their one-hot dummy columns and
# export the result for Power BI.

# Load your dataset
df = pd.read_csv("/content/churn_data_with_predictions.csv")  # Replace with actual filename

# Label assigned to rows whose dummy columns are ALL False. The dummy column set
# matches an encoding made with drop_first=True, which removes one (baseline)
# category per column; its name cannot be recovered from the dummy columns.
# TODO: replace these placeholders with the real baseline category names.
BASELINE_LABELS = {
    "telecom_partner": "baseline_telecom_partner",
    "state": "baseline_state",
    "city": "baseline_city",
}


def _reverse_one_hot(frame, column_name, baseline_label):
    """Collapse the dummy columns named ``<column_name>_<category>`` into labels.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing the dummy columns.
    column_name : str
        Name of the original categorical column; dummy columns are those whose
        name starts with ``column_name + "_"``.
    baseline_label : str
        Label for rows in which every dummy column is False. Without this,
        ``idxmax`` would return the first dummy column for such rows and
        silently mislabel them.

    Returns
    -------
    tuple[pandas.Series, list[str]]
        The reconstructed labels (indexed like ``frame``) and the list of dummy
        column names that were used.
    """
    prefix = f"{column_name}_"
    dummy_columns = [col for col in frame.columns if col.startswith(prefix)]
    dummies = frame[dummy_columns].astype(bool)
    # Slice the prefix off instead of str.replace so only the leading prefix is
    # removed, never a later occurrence inside a category name.
    labels = dummies.idxmax(axis=1).str[len(prefix):]
    # Keep idxmax's answer only where at least one dummy is set.
    labels = labels.where(dummies.any(axis=1), baseline_label)
    return labels, dummy_columns


# Reverse one-hot encoding for telecom_partner
df["telecom_partner"], telecom_partner_columns = _reverse_one_hot(
    df, "telecom_partner", BASELINE_LABELS["telecom_partner"]
)

# Reverse one-hot encoding for state
df["state"], state_columns = _reverse_one_hot(df, "state", BASELINE_LABELS["state"])

# Reverse one-hot encoding for city
df["city"], city_columns = _reverse_one_hot(df, "city", BASELINE_LABELS["city"])

# Drop the one-hot encoded columns (optional)
df = df.drop(columns=telecom_partner_columns + state_columns + city_columns)

# Move the target column to the end of the frame
df["churn"] = df.pop("churn")

# Save to new file for Power BI import.
# NOTE(review): the filename looks like a typo of "telecom_churn_project.csv";
# it is kept unchanged because external consumers may already reference it.
df.to_csv("telecom_chudrn_project.csv", index=False)



In [None]:
# Reload the exported file and preview it. read_csv() requires a path; calling
# it with no argument raises TypeError.
# NOTE(review): the path below is the file written by the export step, whose
# columns match the output recorded for this cell — confirm it is the intended file.
dat = pd.read_csv("telecom_chudrn_project.csv")
print(dat.head())

   age  num_dependents  estimated_salary  calls_made  sms_sent  data_used  \
0   25               4            124962          44        45          0   
1   55               2            130556          62        39       5973   
2   57               0            148828          49        24        193   
3   46               1             38722          80        25       9377   
4   26               2             55098          78        15       1393   

   days_since_registration  gender_M  churn_prediction telecom_partner  \
0                     1959     False                 0    Reliance Jio   
1                     1959     False                 0    Reliance Jio   
2                     1959     False                 1        Vodafone   
3                     1959      True                 1            BSNL   
4                     1959     False                 0            BSNL   

               state     city  churn  
0          Karnataka  Kolkata      0  
1            M