In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score

In [11]:
def preprocessing(dataset):
    """Read a customer CSV and convert it into a numeric feature matrix.

    The categorical columns are one-hot encoded, the "binary" columns are
    ordinal-encoded, and the two charge columns are passed through. A few
    derived columns are then built:

    * ``AutomaticPayment`` - sum of the 'Bank transfer (automatic)' and
      'Credit card (automatic)' indicators.
    * ``Months`` - ``TotalCharges / MonthlyCharges``.
    * ``AddServices`` - ordinal code of ``PhoneService`` plus the
      ``OnlineSecurity`` indicator.

    The columns 'gender', 'No internet service', 'PhoneService',
    'OnlineSecurity', 'Credit card (automatic)' and 'Mailed check' are dropped
    from the final matrix.

    Note: the encoders are refit on every call, so the set and order of
    one-hot columns depends on the levels present in ``dataset``.

    Parameters
    ----------
    dataset : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    df : numpy.ndarray
        2-D array of the engineered features, one row per CSV row.
    labels : numpy.ndarray or None
        Label-encoded ``Churn`` column when the CSV has one, otherwise ``None``.

    Raises
    ------
    KeyError
        If a one-hot column needed for the derived features is absent, i.e.
        the corresponding level does not occur in ``dataset``.
    """
    data = pd.read_csv(dataset)

    categorical_features = ['InternetService', 'PaymentMethod', 'OnlineSecurity']
    numerical_features = ['MonthlyCharges', 'TotalCharges']
    binary_features = ['gender', 'Partner', 'PhoneService', 'Contract']

    # Keep the encoder in its default (sparse) mode and densify explicitly.
    # The `sparse=` keyword was renamed to `sparse_output=` and later removed,
    # so passing either one ties the notebook to a scikit-learn version range.
    onehot = OneHotEncoder()
    cat_values = onehot.fit_transform(data[categorical_features]).toarray()
    bin_values = OrdinalEncoder().fit_transform(data[binary_features])
    num_values = data[numerical_features].to_numpy()

    # Name the one-hot columns by (feature, level) instead of by fixed position,
    # so a file that lacks one level cannot silently relabel the wrong column.
    # 'No' occurs in two features and therefore needs disambiguating.
    # NOTE(review): assumes the positive OnlineSecurity level is spelled 'Yes'
    # (it was the last, i.e. highest-sorting, level in the positional version)
    # - confirm against the CSV.
    renamed_levels = {
        ('InternetService', 'No'): 'NoInternet',
        ('OnlineSecurity', 'No'): 'NoOnlineSecurity',
        ('OnlineSecurity', 'Yes'): 'OnlineSecurity',
    }
    # Iterating `categories_` directly avoids wrapping a ragged list of arrays
    # in np.array(), which raises ValueError on recent NumPy releases.
    onehot_names = [
        renamed_levels.get((feature, level), level)
        for feature, levels in zip(categorical_features, onehot.categories_)
        for level in levels
    ]

    frame = pd.DataFrame(
        np.hstack((bin_values, cat_values, num_values)),
        columns=binary_features + onehot_names + numerical_features,
    )

    needed = [
        'Bank transfer (automatic)',
        'Credit card (automatic)',
        'Mailed check',
        'No internet service',
        'OnlineSecurity',
    ]
    missing = [name for name in needed if name not in frame.columns]
    if missing:
        raise KeyError(
            f"one-hot columns {missing} are missing: these levels do not occur in {dataset!r}"
        )

    noisy_columns = [
        'gender',
        'No internet service',
        'PhoneService',
        'OnlineSecurity',
        'Credit card (automatic)',
        'Mailed check',
    ]

    # Must be computed before its two inputs are dropped below.
    add_services = frame['PhoneService'] + frame['OnlineSecurity']

    # Updated in place (then renamed) so the columns keep their positions.
    frame['Bank transfer (automatic)'] = (
        frame['Bank transfer (automatic)'] + frame['Credit card (automatic)']
    )
    frame['TotalCharges'] = frame['TotalCharges'] / frame['MonthlyCharges']

    frame = frame.rename(columns={
        'Bank transfer (automatic)': 'AutomaticPayment',
        'TotalCharges': 'Months',
    })
    frame = frame.drop(columns=noisy_columns)
    frame['AddServices'] = add_services

    # Encode the target straight from the raw data; there is no need to park
    # it in the feature frame and pull it back out.
    if 'Churn' in data.columns:
        labels = LabelEncoder().fit_transform(data['Churn'])
    else:
        labels = None

    return frame.to_numpy(), labels

In [2]:
def algo(dataset, train_dataset="BSDS_November.csv"):
    """Fit the gradient-boosted classifier and predict labels for ``dataset``.

    The model is retrained from ``train_dataset`` on every call; nothing is
    cached between calls.

    Parameters
    ----------
    dataset : str or path-like or file-like
        CSV to predict on; passed to ``preprocessing``.
    train_dataset : str or path-like or file-like, optional
        Labelled CSV used for training. Defaults to the file the notebook
        originally hard-coded.

    Returns
    -------
    numpy.ndarray
        Predicted class for each row of ``dataset``.

    Raises
    ------
    ValueError
        If ``train_dataset`` has no ``Churn`` column, or if the two files
        produce feature matrices of different width.
    """
    # Imported locally because the visible import cell does not bring this
    # name into scope, so the function failed with NameError on a fresh kernel.
    from xgboost import XGBClassifier

    train_features, train_labels = preprocessing(train_dataset)
    if train_labels is None:
        raise ValueError(
            f"training file {train_dataset!r} has no 'Churn' column to learn from"
        )

    model = XGBClassifier(
        n_estimators=44,
        max_depth=4,
        min_child_weight=1e-05,
        gamma=3,
        learning_rate=0.08367510204081632,
    )
    model.fit(train_features, train_labels)

    features, _ = preprocessing(dataset)
    # preprocessing() refits its encoders per file, so a file with a missing
    # or extra category level yields a different number of columns.
    if features.shape[1] != train_features.shape[1]:
        raise ValueError(
            f"feature count mismatch: trained on {train_features.shape[1]} columns, "
            f"got {features.shape[1]} for {dataset!r}"
        )

    return model.predict(features)