## Import dependencies

In [3]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport


# Read dataset

In [4]:
# Read the training CSV. `df` is an independent copy of `train`, so changes
# made to `df` do not modify `train`.
train = pd.read_csv('../data/cell2celltrain.csv')
df = train.copy()

# Print column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51047 entries, 0 to 51046
Data columns (total 58 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   CustomerID                 51047 non-null  int64  
 1   Churn                      51047 non-null  object 
 2   MonthlyRevenue             50891 non-null  float64
 3   MonthlyMinutes             50891 non-null  float64
 4   TotalRecurringCharge       50891 non-null  float64
 5   DirectorAssistedCalls      50891 non-null  float64
 6   OverageMinutes             50891 non-null  float64
 7   RoamingCalls               50891 non-null  float64
 8   PercChangeMinutes          50680 non-null  float64
 9   PercChangeRevenues         50680 non-null  float64
 10  DroppedCalls               51047 non-null  float64
 11  BlockedCalls               51047 non-null  float64
 12  UnansweredCalls            51047 non-null  float64
 13  CustomerCareCalls          51047 non-null  flo

## Handle missing numerical values by imputing the mean

In [5]:
# Column names intended for handle_missing_values (missing-flag + mean fill).
# NOTE(review): none of these names appear among the columns listed by
# df.info() above (that listing is truncated, so this may be incomplete) --
# confirm they exist in this dataset before enabling the loop over this list.
num_columns_missing = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder'
]

def handle_missing_values(column_name, frame=None):
    """Flag and mean-impute the missing values of one column.

    Adds a ``<column_name>_missing`` indicator column (1 where the value is
    null, 0 otherwise) and then replaces the nulls in ``column_name`` with the
    column mean. The frame is modified in place.

    Note that calling this again on an already imputed column recomputes the
    indicator from the filled data, i.e. resets it to all zeros.

    Parameters
    ----------
    column_name : str
        Name of the column to impute.
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    """
    target = df if frame is None else frame
    column = target[column_name]

    # Record missingness before imputing; afterwards it cannot be recovered.
    target[f"{column_name}_missing"] = column.isnull().astype(int)

    mean_value = column.mean()

    # Keep integer columns integer-valued. The NaN guard prevents round(nan)
    # from raising when the column has no values to average.
    if column.dtype == 'int64' and pd.notna(mean_value):
        mean_value = round(mean_value)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `frame[column_name]`: that chained in-place form is deprecated and
    # does not update the frame when pandas Copy-on-Write is active.
    target[column_name] = column.fillna(mean_value)


# for column in num_columns_missing:
#     handle_missing_values(column)

## YData Profiling Report

## Autoviz Report

## SWEETVIZ Report

## Data Analysis

In [None]:
# Print column names, non-null counts and dtypes of the working frame.
df.info()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Number of missing values per column.
df.isnull().sum()

## Simple visualization

## Distinguish columns based on their cardinality

In [None]:
# Sub-frame holding only the object-dtype columns. Iterating over it yields
# the column labels, which is how it is used below.
categorical_columns = df.select_dtypes(include='object')

# Columns with at most 20 distinct values are shown as counts (bar charts);
# numeric columns with more distinct values are shown as histograms.
# Object columns with more than 20 distinct values end up in neither list.
counter_columns = [
    label for label in categorical_columns if df[label].nunique() <= 20
]
histogram_columns = []

for column in df.columns:
    if df[column].dtype != 'object':
        is_low_cardinality = df[column].nunique() <= 20
        (counter_columns if is_low_cardinality else histogram_columns).append(column)

In [None]:
import seaborn as sns

def plot_log_histograms(df, columns):
    """Draw one histogram per column of the shifted natural log of its values.

    For each column the NaN values are dropped, the remaining values are
    shifted so that the smallest one becomes 1, and the natural logarithm of
    the shifted values is plotted in 20 bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for column in columns:
        # Drop NaN values.
        observed = df[column].dropna()
        # Subtract the minimum and add 1 so the log argument is never below 1.
        shifted_log = np.log(observed - observed.min() + 1)

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.hist(shifted_log, bins=20, edgecolor='black')
        ax.set_title(f'Log Histogram for {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Frequency')
        plt.show()
        
        
def plot_histograms(df, columns):
    """Draw a 20-bin histogram for each of the given columns.

    NOTE(review): a function with this name is defined again in a later cell
    of this notebook; consider keeping a single definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for name in columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        df[name].plot.hist(bins=20, edgecolor='black', ax=ax)
        ax.set_title(f'Histogram for {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        plt.show()


def plot_bar_charts(df, columns):
    """Draw a value-count bar chart for each column with at most 20 distinct values.

    Columns with more than 20 distinct values are skipped without a message.

    NOTE(review): a function with this name is defined again in a later cell
    of this notebook; consider keeping a single definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the candidate columns.
    """
    # Lazy filter: each column is checked right before it is plotted.
    plottable = (name for name in columns if df[name].nunique() <= 20)

    for name in plottable:
        fig, ax = plt.subplots(figsize=(8, 6))
        df[name].value_counts().plot.bar(ax=ax)
        ax.set_title(f'Bar Chart for {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Count')
        plt.show()
        
def plot_nonzero_values(df, columns):
    """Draw a 20-bin histogram of each column after removing the zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for name in columns:
        series = df[name]
        # Keep only the entries that are not equal to 0.
        without_zeros = series[series != 0]

        fig, ax = plt.subplots(figsize=(8, 6))
        without_zeros.plot.hist(bins=20, edgecolor='black', ax=ax)
        ax.set_title(f'Non-zero-values for {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        plt.show()
        

def plot_by_positive_negative(df, columns, churn_column):
    """Plot stacked histograms of each column, split by sign and coloured by churn.

    For every column up to two figures are drawn: one for the rows where the
    value is strictly positive and one for the rows where it is zero or
    negative. Rows where the value is NaN belong to neither group. A group
    with no rows is reported with a message instead of an empty figure.

    The legend entries are the actual values of ``churn_column`` as generated
    by seaborn. They are not overridden with hard-coded labels, because
    ``legend(labels=[...])`` assigns labels to artists purely by draw order
    and can therefore attach a label to the wrong class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the numeric columns to plot.
    churn_column : str
        Name of the column used for the hue split.
    """
    # One fixed hue order so the same churn value gets the same colour in
    # every figure, even when a subset lacks one of the classes.
    hue_order = list(df[churn_column].dropna().unique())

    for column in columns:
        groups = (
            ('Positive', df[df[column] > 0]),
            # `<= 0` includes zeros, hence "Non-positive" rather than "Negative".
            ('Non-positive', df[df[column] <= 0]),
        )

        for group_name, subset in groups:
            if subset.empty:
                print(f'{column}: no {group_name.lower()} values to plot')
                continue

            fig, ax = plt.subplots(figsize=(10, 6))
            sns.histplot(
                subset,
                x=column,
                hue=churn_column,
                hue_order=hue_order,
                multiple="stack",
                bins=20,
                edgecolor='black',
                ax=ax,
            )
            ax.set_title(f'{column} {group_name} Values by Churn')
            ax.set_xlabel(column)
            ax.set_ylabel('Count')

            # Keep seaborn's own handles and labels; only set the title.
            legend = ax.get_legend()
            if legend is not None:
                legend.set_title('Churn')

            plt.show()

In [None]:
# Hand-picked column groups. Only CURRENT_COLUMNS is used by the calls in
# this cell; the other lists are not referenced here.
HISTOGRAM_COLUMNS = ['TotalRecurringCharge']

COUNTER_COLUMNS = []

# "Fajne" is Polish for "nice" -- presumably columns whose log histogram
# turned out to be informative.
FAJNE_LOG_COLUMNS = ['MonthlyRevenue', 'MonthlyMinutes']

# "Usuniecie zer" is Polish for "removal of zeros" -- presumably columns to
# be inspected with their zeros removed.
USUNIECIE_ZER = []

# Columns currently under inspection.
CURRENT_COLUMNS = ['DirectorAssistedCalls']


# Draw every view of the current columns, with a separator line printed
# after each step.
plot_histograms(df, CURRENT_COLUMNS)
print("==============================")
plot_log_histograms(df, CURRENT_COLUMNS)
print("==============================")
# Disabled step (LOG_COLUMNS is not defined in this cell):
# plot_bar_charts(df, LOG_COLUMNS)
print("==============================")
plot_nonzero_values(df, CURRENT_COLUMNS)
print("==============================")
plot_by_positive_negative(df, CURRENT_COLUMNS, 'Churn')
print("==============================")


# Helper functions for visualization

In [None]:
def plot_pie_charts(df, columns):
    """Draw a pie chart of the value counts of each given column.

    Every slice is annotated with its share as a percentage with one decimal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for name in columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        df[name].value_counts().plot.pie(autopct='%1.1f%%', ax=ax)
        ax.set_title(f'Pie Chart for {name}')
        # Clear the y-axis label.
        ax.set_ylabel('')
        plt.show()


# Iterating over the `categorical_columns` frame yields its column labels.
plot_pie_charts(df, categorical_columns)

In [None]:
def plot_bar_charts(df, columns):
    """Draw a value-count bar chart for each column with at most 20 distinct values.

    Columns with more than 20 distinct values are skipped without a message.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; consider keeping a single definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the candidate columns.
    """
    for name in columns:
        if df[name].nunique() <= 20:
            fig, ax = plt.subplots(figsize=(8, 6))
            df[name].value_counts().plot.bar(ax=ax)
            ax.set_title(f'Bar Chart for {name}')
            ax.set_xlabel(name)
            ax.set_ylabel('Count')
            plt.show()


plot_bar_charts(df, counter_columns)

In [None]:
def plot_histograms(df, columns):
    """Draw a 20-bin histogram for each of the given columns.

    NOTE(review): a function with this name is also defined in an earlier
    cell of this notebook; consider keeping a single definition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for name in columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        df[name].plot.hist(bins=20, edgecolor='black', ax=ax)
        ax.set_title(f'Histogram for {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Frequency')
        plt.show()


plot_histograms(df, histogram_columns)

In [None]:
def plot_bar_and_column_charts(df, columns):
    """Draw the value-count bar charts of the given columns side by side.

    All charts share one figure with a single row of subplots, one subplot
    per column. Nothing is drawn when ``columns`` is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Column names to plot. A DataFrame is accepted as well; iterating it
        yields its column labels.
    """
    # Materialise the labels before counting them: when `columns` is a
    # DataFrame (as `categorical_columns` is), len() would return the number
    # of rows and request one subplot per row.
    column_names = list(columns)
    if not column_names:
        return

    # squeeze=False keeps `axes` two-dimensional, so indexing also works when
    # there is only a single column.
    fig, axes = plt.subplots(
        nrows=1, ncols=len(column_names), figsize=(12, 6), squeeze=False
    )

    for ax, column in zip(axes[0], column_names):
        df[column].value_counts().plot(kind='bar', ax=ax)
        ax.set_title(f'Bar Chart for {column}')
        ax.set_xlabel(column)
        ax.set_ylabel('Count')

    fig.tight_layout()
    plt.show()


plot_bar_and_column_charts(df, categorical_columns)

In [None]:
import seaborn as sns

def plot_stripplot(df, columns):
    """Draw a seaborn strip plot of each given column along the x-axis.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the data.
    columns : iterable of str
        Names of the columns to plot.
    """
    for name in columns:
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.stripplot(x=df[name], ax=ax)
        ax.set_title(f'Strip Plot for {name}')
        ax.set_xlabel(name)
        ax.set_ylabel('Count')
        plt.show()


plot_stripplot(df, histogram_columns)

In [None]:
# Keep the column index of the working frame in `columns` and print it.
columns = df.columns
print(columns)

In [None]:
# churn = 'Churn'
# 
# colors = {0: 'blue', 1: 'red'}
# 
# %matplotlib inline
# 
# for column1 in df.columns:
#     for column2 in df.columns:
#         if column1 == column2 or column1 == churn or column2 == churn:
#             continue
#        
#         plt.figure(figsize=(10, 6))
#         plt.scatter(df[column1], df[column2], color=df[churn].map(colors))
#         plt.xlabel(column1)
#         plt.ylabel(column2)
#         plt.title('Wykres z kolorowaniem na podstawie churn')
#         plt.legend(labels=['churn=0', 'churn=1'])
#         plt.grid(True)
#         plt.show() 
#         

## Investigating data

In [17]:
# Unique CustomerID values of the rows with at least one customer-care call,
# wrapped in a single-column DataFrame.
# NOTE(review): the variable name says "director", but the filter uses
# 'CustomerCareCalls' rather than 'DirectorAssistedCalls' -- confirm which
# one was intended.
clients_who_had_call_with_director = pd.DataFrame(
    df.loc[df['CustomerCareCalls'] > 0, 'CustomerID'].unique()
)
clients_who_had_call_with_director



Unnamed: 0,0
0,3000022
1,3000030
2,3000042
3,3000058
4,3000062
...,...
23022,3399898
23023,3399910
23024,3399922
23025,3399978
