<a href="https://colab.research.google.com/github/E22MCAG0044/Deep-face-/blob/main/Internship_project_churan_pridiction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [2]:
# Load the dataset
df = pd.read_csv('/content/customer_churn_large_dataset.csv')
df

Unnamed: 0,CustomerID,Name,Age,Gender,Location,Subscription_Length_Months,Monthly_Bill,Total_Usage_GB,Churn
0,1,Customer_1,63,Male,Los Angeles,17,73.36,236,0.0
1,2,Customer_2,62,Female,New York,1,48.76,172,0.0
2,3,Customer_3,24,Female,Los Angeles,5,85.47,460,0.0
3,4,Customer_4,36,Female,Miami,3,97.94,297,1.0
4,5,Customer_5,46,Female,Miami,19,58.14,266,0.0
...,...,...,...,...,...,...,...,...,...
59772,59773,Customer_59773,34,Male,Chicago,12,78.73,54,1.0
59773,59774,Customer_59774,20,Female,New York,7,40.27,425,0.0
59774,59775,Customer_59775,70,Male,Miami,9,47.53,196,1.0
59775,59776,Customer_59776,18,Male,New York,24,34.94,134,0.0


In [3]:
# Task 1: Data Preprocessing

# Handle missing data (assuming we want to impute missing values with the mean)
df.fillna(df.mean(), inplace=True)

  df.fillna(df.mean(), inplace=True)


In [4]:
# Separate categorical columns and numerical columns
categorical_cols = ['CustomerID', 'Age', 'Gender','Location','Subscription_Length_Months','Monthly_Bill','Total_Usage_GB']
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['Churn']]
numerical_cols


['Name']

In [5]:
# One-hot encode categorical variables
encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
encoded_categorical_cols = encoder.fit_transform(df[categorical_cols])

In [6]:
# Get the names of the encoded features
encoded_feature_names = encoder.get_feature_names_out(categorical_cols)

In [7]:
# Create a DataFrame for the encoded categorical columns with appropriate column names
encoded_categorical_df = pd.DataFrame(encoded_categorical_cols, columns=encoded_feature_names)


In [8]:
# Debugging: Print shapes of dataframes
print("Numerical Data Shape:", df[numerical_cols].shape)
print("Encoded Categorical Data Shape:", encoded_categorical_df.shape)


Numerical Data Shape: (59777, 1)
Encoded Categorical Data Shape: (59777, 67310)


In [None]:
# Combine the numerical and encoded categorical data
X = pd.concat([df[numerical_cols], encoded_categorical_df], axis=1)
y = df['Churn']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Calculate mean and standard deviation for each feature in the training set
mean = X_train.mean(axis=0)
std_dev = X_train.std(axis=0)

# Apply feature scaling manually
X_train_scaled = (X_train - mean) / std_dev
X_test_scaled = (X_test - mean) / std_dev

In [None]:
# Task 3: Model Building

# Choose an appropriate machine learning algorithm (Random Forest Classifier in this example)
model = RandomForestClassifier(random_state=42)
model.fit(X_train_scaled, y_train)

# Train and validate the selected model on the training dataset
y_pred = model.predict(X_test_scaled)

In [None]:
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")

In [None]:
# Task 4: Model Optimization
# Fine-tune the model parameters using GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

best_model = grid_search.best_estimator_
best_model.fit(X_train_scaled, y_train)
best_y_pred = best_model.predict(X_test_scaled)

In [None]:
# Evaluate the best model
best_accuracy = accuracy_score(y_test, best_y_pred)
best_precision = precision_score(y_test, best_y_pred)
best_recall = recall_score(y_test, best_y_pred)
best_f1 = f1_score(y_test, best_y_pred)

print("Best Model Evaluation:")
print(f"Accuracy: {best_accuracy}")
print(f"Precision: {best_precision}")
print(f"Recall: {best_recall}")
print(f"F1-Score: {best_f1}")

In [None]:

# Task 5: Model Deployment (not implemented here as it requires deployment infrastructure)

# Additional tasks as needed for deployment

# Visualizations (you can add more visualizations as needed)
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
plt.imshow(conf_matrix, cmap='Blues', interpolation='nearest')
plt.title('Confusion Matrix')
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks([0, 1], ['Not Churn', 'Churn'])
plt.yticks([0, 1], ['Not Churn', 'Churn'])
plt.show()