In [None]:
def setup_environment():
    try:
        import pandas as pd
        import imblearn
        from imblearn.over_sampling import SMOTE
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.feature_selection import SelectFromModel
        from sklearn.cluster import KMeans
        import matplotlib.pyplot as plt
        from sklearn.model_selection import train_test_split

    except ImportError as e:
        print(f"Installing missing package: {e.name}")
        !pip install {e.name}
        print("Package installed successfully.")

# Calling the setup function to ensure required packages are installed
setup_environment()

In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,classification_report,confusion_matrix,roc_auc_score




unlabeled_data = pd.read_csv('features-train_split-part_00.csv')
unlabeled_data.columns = ['id'] + [f'feature{i}' for i in range(1, len(unlabeled_data.columns))]
labeled_data = pd.read_csv('clients-train_split.csv')

unlabeled_data.head()

merged_data = pd.merge(unlabeled_data, labeled_data, on='id', how='inner')
merged_data['churner'] = merged_data['churner'].astype(int)
merged_data

In [None]:
correlation_result = merged_data.corr()['churner'].sort_values()
columns_to_drop = correlation_result[correlation_result.isna() | (correlation_result.abs() < 0.01)].index
data = merged_data.drop(columns=columns_to_drop)
data

In [None]:

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, Y__train)

feature_importances = pd.Series(rf_classifier.feature_importances_, index=X_train.columns)

top_features = feature_importances.nlargest(100).index
important_data=merged_data[top_features]
important_data


intersection = set(important_data).intersection(data)
data = merged_data[intersection]


In [None]:
# Assuming merged_data is your DataFrame
X = data.drop(['id','churner'], axis=1)  # Features
Y = data['churner']  # Target variable
data["churner"] = data["churner"].astype(int)
data.head()

In [None]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
test_data = pd.read_csv('features-test_split-part_00.csv')
test_data.columns = ['id'] + [f'feature{i}' for i in range(1, len(unlabeled_data.columns))]
X_test = data.drop(['id','churner'], axis=1)
Y_test = data['churner']
X_test

In [None]:
X_train = data.drop(['id','churner'], axis=1)
Y_train = data['churner']
X_train

In [18]:
from imblearn.over_sampling import ADASYN

#Apply ADASYN to handle imbalanced data
adasyn = ADASYN()
X_DownSampled_train, Y_DownSampled_train = adasyn.fit_resample(X_train, Y_train)

from imblearn.over_sampling import RandomOverSampler

# Apply RandomOverSampler to handle imbalanced data
#ros = RandomOverSampler()
#X_DownSampled_train, Y_DownSampled_train = ros.fit_resample(X_train, y_train)
#X_DownSampled_train

In [None]:
   models = {
    "LogisticRegression": LogisticRegression(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "GradientBoostingClassifier": GradientBoostingClassifier(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "KNeighborsClassifier": KNeighborsClassifier(), 
}
model_list = []
for i in range(len(list(models))):
    modell = list(models.values())[i]
    modell.fit(X_DownSampled_train,Y_DownSampled_train)

    y_train_preiction = modell.predict(X_DownSampled_train)
    y_test_prediction = modell.predict(X_test)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    

    print('Model performance for Training set')
    print("- accuracy score of train: {:.4f}".format(accuracy_score(Y_DownSampled_train,y_train_preiction)))
    print("- f1_score: {:.4f}".format(f1_score(Y_DownSampled_train,y_train_preiction)))
    print("- roc_auc_score : {:.4f}".format(roc_auc_score(Y_DownSampled_train,y_train_preiction)))
    print("- confusion_matrix : ")
    print(confusion_matrix(Y_DownSampled_train,y_train_preiction))
    print("- classification report : ")
    print(classification_report(Y_DownSampled_train,y_train_preiction))
    
    
    print("----------------------------------------------")

    print('Model performance for Testing set')
    print("- accuracy score of train: {:.4f}".format(accuracy_score(Y_test,y_test_prediction)))
    print("- f1_score: {:.4f}".format(f1_score(Y_test,y_test_prediction)))
    print("- roc_auc_score : {:.4f}".format(roc_auc_score(Y_test,y_test_prediction)))
    print("- confusion_matrix : ")
    print(confusion_matrix(Y_test,y_test_prediction))
    print("- classification report : ")
    print(classification_report(Y_test,y_test_prediction))

    print("="*30)
    print("\n")

LogisticRegression
Model performance for Training set
- accuracy score of train: 0.6282
- f1_score: 0.7287
- roc_auc_score : 0.6288
- confusion_matrix : 
[[ 35676 102741]
 [    17 137976]]
- classification report : 
              precision    recall  f1-score   support

           0       1.00      0.26      0.41    138417
           1       0.57      1.00      0.73    137993

    accuracy                           0.63    276410
   macro avg       0.79      0.63      0.57    276410
weighted avg       0.79      0.63      0.57    276410

----------------------------------------------
Model performance for Testing set
- accuracy score of train: 0.2668
- f1_score: 0.0321
- roc_auc_score : 0.6287
- confusion_matrix : 
[[ 44605 128410]
 [     1   2129]]
- classification report : 
              precision    recall  f1-score   support

           0       1.00      0.26      0.41    173015
           1       0.02      1.00      0.03      2130

    accuracy                           0.27    175