In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
file_path = 'Telecom_customer churn.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# 1. Drop columns with more than 30% missing values
missing_values = data.isnull().mean() * 100
columns_to_drop = missing_values[missing_values > 30].index
data_cleaned = data.drop(columns=columns_to_drop)

# 2. Impute missing values
# Separate numerical and categorical columns
num_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data_cleaned.select_dtypes(include=['object']).columns

# Impute numerical columns with median and categorical columns with mode
data_cleaned[num_cols] = data_cleaned[num_cols].fillna(data_cleaned[num_cols].median())
data_cleaned[cat_cols] = data_cleaned[cat_cols].fillna(data_cleaned[cat_cols].mode().iloc[0])

# 3. Encode categorical variables
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    data_cleaned[col] = label_encoders[col].fit_transform(data_cleaned[col])

# 4. Separate features and target variable
X = data_cleaned.drop(columns=['churn', 'Customer_ID'])
y = data_cleaned['churn']

# Select top features
selector = SelectKBest(f_classif, k=20)  # Adjust k based on analysis
X_selected = selector.fit_transform(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model from Grid Search
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test_scaled)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


NameError: name 'SelectKBest' is not defined

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# Load and clean dataset
file_path = 'Telecom_customer churn.csv' # Replace with your file path
data = pd.read_csv(file_path)

# Drop columns with more than 30% missing values
missing_values = data.isnull().mean() * 100
columns_to_drop = missing_values[missing_values > 30].index
data_cleaned = data.drop(columns=columns_to_drop)

# Impute missing values
num_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[num_cols] = data_cleaned[num_cols].fillna(data_cleaned[num_cols].median())
data_cleaned[cat_cols] = data_cleaned[cat_cols].fillna(data_cleaned[cat_cols].mode().iloc[0])

# Encode categorical variables
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    data_cleaned[col] = label_encoders[col].fit_transform(data_cleaned[col])

# Separate features and target
X = data_cleaned.drop(columns=['churn', 'Customer_ID'])
y = data_cleaned['churn']

# Select top features
selector = SelectKBest(f_classif, k=20)  # Adjust k based on analysis
X_selected = selector.fit_transform(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

# Best model from Grid Search
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test_scaled)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

# Load and clean dataset
file_path = 'Telecom_customer churn.csv' # Replace with your file path
data = pd.read_csv(file_path)

# Drop columns with more than 30% missing values
missing_values = data.isnull().mean() * 100
columns_to_drop = missing_values[missing_values > 30].index
data_cleaned = data.drop(columns=columns_to_drop)



In [7]:
# Impute missing values
num_cols = data_cleaned.select_dtypes(include=['float64', 'int64']).columns
cat_cols = data_cleaned.select_dtypes(include=['object']).columns
data_cleaned[num_cols] = data_cleaned[num_cols].fillna(data_cleaned[num_cols].median())
data_cleaned[cat_cols] = data_cleaned[cat_cols].fillna(data_cleaned[cat_cols].mode().iloc[0])

In [8]:
# Encode categorical variables
label_encoders = {}
for col in cat_cols:
    label_encoders[col] = LabelEncoder()
    data_cleaned[col] = label_encoders[col].fit_transform(data_cleaned[col])

In [9]:
# Separate features and target
X = data_cleaned.drop(columns=['churn', 'Customer_ID'])
y = data_cleaned['churn']

# Select top features
selector = SelectKBest(f_classif, k=20)  # Adjust k based on analysis
X_selected = selector.fit_transform(X, y)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42, stratify=y)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyperparameter tuning for Random Forest
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

In [None]:
# Best model from Grid Search
best_model = grid_search.best_estimator_

# Evaluate the model
y_pred = best_model.predict(X_test_scaled)
print("Best Parameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))