In [None]:
%matplotlib inline
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures

from data_cleaning_helpers import clean_dataframe
from visualization_helpers import plot_log_histograms, plot_nonzero_hist_log_values

In [None]:
# Load the cell2cell training CSV into a DataFrame.
df = pd.read_csv('../data/telecom/cell2celltrain.csv')

# NOTE(review): the return value is discarded, so this presumably cleans df
# in place -- confirm against data_cleaning_helpers.clean_dataframe.
clean_dataframe(df)

# Print the per-column dtype / non-null summary of the frame.
df.info()

# Further data exploration and feature engineering

In [None]:
# With a basic picture of the data distribution in hand, take a closer look at
# a handful of columns.
# Columns handed to plot_log_histograms below.
LOG_COLUMNS = [
    'MonthlyRevenue',
    'MonthlyMinutes',
    'DroppedCalls',
    'UnansweredCalls',
    'PeakCallsInOut',
]


# Presumably draws one log-scaled histogram per listed column -- confirm
# against visualization_helpers.plot_log_histograms.
plot_log_histograms(df, LOG_COLUMNS)

# We can observe that a large share of the values is concentrated at zero.

In [None]:
# Columns handed to plot_nonzero_hist_log_values below.
ZERO_IMBALANCE_COLUMNS = [
    'DirectorAssistedCalls',
    'OverageMinutes',
    'RoamingCalls',
    'DroppedCalls',
    'ReceivedCalls',
]

# Presumably plots log-scaled histograms of only the non-zero values of each
# listed column -- confirm against
# visualization_helpers.plot_nonzero_hist_log_values.
plot_nonzero_hist_log_values(df, ZERO_IMBALANCE_COLUMNS)

# Multivariate Analysis

In [None]:
# Pair plots of thematically related column groups, coloured by 'Churn'.
# Each plot is rendered and then saved as a PNG under PLOTS_DIR.
PLOTS_DIR = Path("../plots")
# savefig does not create missing directories, so make sure the target exists
# (keeps the cell working on a fresh checkout).
PLOTS_DIR.mkdir(parents=True, exist_ok=True)

USER_BEHAVIOUR = [
    'Churn',
    'CurrentEquipmentDays',
    'MonthlyMinutes',
    'MonthsInService',
    'PercChangeMinutes',
]

USER_DATA = [
    'Churn',
    'AgeHH1',
    'AgeHH2',
    'HasCreditCard',
    'CreditRating',
    'AdjustmentsToCreditRating',
]

COSTS_DATA = [
    'Churn',
    'MonthlyRevenue',
    'TotalRecurringCharge',
    'PercChangeRevenues',
    'RoamingCalls',
    'OverageMinutes',
]

CARE_SERVICE = [
    'Churn',
    'RetentionCalls',
    'CustomerCareCalls',
    'DirectorAssistedCalls',
    'RespondsToMailOffers',
    'ReferralsMadeBySubscriber',
    'RetentionOffersAccepted',
]

# Output file name -> columns to plot. Insertion order is the plotting order.
PAIRPLOT_GROUPS = {
    "user_behaviour.png": USER_BEHAVIOUR,
    "user_data.png": USER_DATA,
    "cost_data.png": COSTS_DATA,
    "care_service.png": CARE_SERVICE,
}

for file_name, group_columns in PAIRPLOT_GROUPS.items():
    # 'Churn' is part of every group because it is used as the hue variable.
    fig = sns.pairplot(df[group_columns], hue='Churn')
    fig.savefig(PLOTS_DIR / file_name)

In [None]:
def display_correlation(df):
    """Show a heatmap of the pairwise correlations between the columns of ``df``.

    Object-dtype columns are replaced by integer label codes on a copy of
    the frame, so the caller's ``df`` is left untouched.

    Args:
        df: DataFrame whose columns should be correlated.
    """
    encoded = df.copy()
    encoder = LabelEncoder()
    object_columns = df.select_dtypes(include='object').columns
    for name in object_columns:
        # fit_transform refits the encoder, so reusing one instance per column is fine.
        encoded[name] = encoder.fit_transform(encoded[name])

    correlation_matrix = encoded.corr()
    plt.figure(figsize=(20, 10))
    sns.heatmap(correlation_matrix)
    plt.show()

In [None]:
# Draw the correlation heatmap for the full frame.
display_correlation(df)

# Feature engineering

In [None]:
def square_features(df, columns):
    """Build degree-2 polynomial features from selected columns of ``df``.

    Uses ``PolynomialFeatures(degree=2, include_bias=False)``, so the result
    contains the original columns, their squares and their pairwise
    products, but no constant column.

    Args:
        df: Source DataFrame.
        columns: List of column names in ``df`` to expand.

    Returns:
        A ``(df_poly, poly_columns)`` tuple: ``df_poly`` is a DataFrame of the
        generated features that shares ``df``'s index, and ``poly_columns``
        is the array of generated feature names.
    """
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_data = poly.fit_transform(df[columns])
    poly_columns = poly.get_feature_names_out(columns)
    # Reuse the caller's index: with the default RangeIndex, a later
    # axis=1 concat/join would align on labels and misplace rows (padding
    # with NaN) whenever df's index is not 0..n-1, e.g. after dropping rows.
    df_poly = pd.DataFrame(poly_data, columns=poly_columns, index=df.index)
    return df_poly, poly_columns


def square_and_concat_features(df, columns):
    """Append degree-2 polynomial features of ``columns`` to ``df``.

    Only generated columns whose names are not already present in ``df`` are
    appended. The polynomial expansion re-emits the input columns
    themselves, and appending those again would leave ``df`` with duplicate
    column names. Skipping existing names also makes repeated calls with the
    same arguments leave the set of columns unchanged.

    Args:
        df: Source DataFrame.
        columns: List of column names in ``df`` to expand.

    Returns:
        A ``(df, poly_columns)`` tuple: a new DataFrame with the additional
        feature columns, and the names of all generated features (including
        those that already existed in ``df``).
    """
    df_poly, poly_columns = square_features(df, columns)
    # Skip names df already has (at least the degree-1 terms, i.e. `columns`).
    new_columns = [name for name in df_poly.columns if name not in df.columns]
    df = pd.concat([df, df_poly[new_columns]], axis=1)
    return df, poly_columns

# Columns to expand with polynomial terms.
columns = ['MonthlyMinutes', 'TotalRecurringCharge']
# Rebind df to the frame returned with the generated feature columns appended;
# poly_columns holds the names of the generated features.
df, poly_columns = square_and_concat_features(df, columns)