<a href="https://colab.research.google.com/github/Bits-Deep-Analytics/PFA/blob/main/EDA.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open this notebook in Colaboratory"/></a>

### Read and Merge

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import scipy.stats as st
import seaborn as sns
from scipy.stats import ttest_ind
from sklearn.linear_model import LinearRegression
from statsmodels import robust

# Load the two published splits of the churn dataset and stack them into a single frame.
test = pd.read_csv('https://raw.githubusercontent.com/Bits-Deep-Analytics/PFA/main/churn-bigml-20.csv')
train = pd.read_csv('https://raw.githubusercontent.com/Bits-Deep-Analytics/PFA/main/churn-bigml-80.csv')
complete = [train, test]
# Each CSV is read with its own default 0..n-1 index, so a plain concat would leave
# duplicate row labels in `data`; ignore_index=True renumbers the rows 0..N-1.
data = pd.concat(complete, ignore_index=True)
# A bare expression in the middle of a cell is discarded, so print the preview explicitly.
print(data.head(3))
print(data.columns)
print(data['Churn'].value_counts())

### Statistical analysis & Visualization

In [None]:
# Evening charge vs. day charge, one colour per Churn value.
sns.set_style("whitegrid")
charge_grid = sns.FacetGrid(data, hue='Churn', height=4)
charge_grid.map(plt.scatter, 'Total eve charge', 'Total day charge')
charge_grid.add_legend()
plt.show()

In [None]:
# Interactive 3-D scatter of day / evening / night minutes, coloured by Churn.
fig = px.scatter_3d(
    data,
    x='Total day minutes',
    y='Total eve minutes',
    z='Total night minutes',
    color='Churn',
    width=900,
    height=900,
)
fig.show()

In [None]:
# Pairwise scatter / KDE plots for a positional selection of four columns, split by Churn.
sns.set_style('whitegrid')
# NOTE(review): positions 6, 9, 12 and 19 are presumably three usage columns plus
# 'Churn' (the pairplot needs 'Churn' for its hue) — confirm against data.columns.
pair_columns = [6, 9, 12, 19]
ss = data.iloc[:, pair_columns]
sns.pairplot(ss, hue='Churn', diag_kind='kde')
plt.show()

In [None]:
# 50-bin histogram of 'Total day minutes', one overlaid layer per Churn value.
day_minutes_grid = sns.FacetGrid(data, hue='Churn', height=8)
day_minutes_grid.map(sns.histplot, 'Total day minutes', bins=50)
day_minutes_grid.add_legend()
plt.show()

In [None]:
# Empirical PDF and CDF of 'Total day minutes' built from a 50-bin histogram.
f, x = np.histogram(data['Total day minutes'], bins=50, density=False)
pdf = f / f.sum()      # bin counts -> fraction of rows falling in each bin
cdf = pdf.cumsum()     # running total of those fractions
right_edges = x[1:]    # x holds the bin edges; plot each bin at its right edge
plt.plot(right_edges, pdf)
plt.plot(right_edges, cdf)
plt.show()

In [None]:
# Summary statistics for 'Total day minutes': centre, spread and selected percentiles.
# `robust` (statsmodels) is imported in the notebook's top import cell.
day_minutes = data['Total day minutes']
print(np.mean(day_minutes))
print(np.median(day_minutes))
print(np.std(day_minutes))
# arange(0, 100, 25) yields 0, 25, 50, 75 — the 100th percentile is not included.
print(np.percentile(day_minutes, np.arange(0, 100, 25)))
print(np.percentile(day_minutes, 90))
# Median absolute deviation as computed by statsmodels.
print(robust.mad(day_minutes))


### Draw diagrams

In [None]:
# Customer counts by international-plan status and churn, in both orientations.
plan_col = data['International plan']
churn_col = data['Churn']
ct_ci_ch = pd.crosstab(plan_col, churn_col)  # rows: plan, columns: churn
ct_ci_ip = pd.crosstab(churn_col, plan_col)  # rows: churn, columns: plan
print(ct_ci_ch, "\n")
# Churn-by-plan counts again, this time with the row/column totals appended.
ct_ci_m = pd.crosstab(churn_col, plan_col, margins=True)
print(ct_ci_m)

In [None]:
# Stacked bar chart: one bar per Churn value, segmented by international plan.
ct_ci_ip.plot.bar(stacked=True)
plt.show()

In [None]:
# Grouped (side-by-side) bars: one group per international-plan value, one bar per Churn value.
ct_ci_ch.plot.bar(stacked=False)
plt.show()

In [None]:
# Grouped (side-by-side) bars: one group per Churn value, one bar per international-plan value.
ct_ci_ip.plot.bar(stacked=False)
plt.show()

In [None]:
# Plan-by-churn table normalised per column: within each Churn value, the share of
# customers with / without an international plan. Printed transposed (churn as rows).
ct_ci_nr = pd.crosstab(data['International plan'], data['Churn'],
                       normalize='columns')
print(ct_ci_nr.transpose(), "\n")
# Churn-by-plan table normalised per column: within each plan value, the share of
# customers for each Churn value.
ct_ci_nc = pd.crosstab(data['Churn'], data['International plan'],
                       normalize='columns')
print(ct_ci_nc)

In [None]:
# One pie per column of ct_ci_nr, drawn as separate subplots without a legend.
ct_ci_nr.plot.pie(subplots=True, legend=False)
plt.show()

In [None]:
# One pie per column of ct_ci_nc, drawn as separate subplots without a legend.
ct_ci_nc.plot.pie(subplots=True, legend=False)
plt.show()

In [None]:
# Histogram of customer service calls (matplotlib's default binning) on an 8x8 figure.
service_calls = data['Customer service calls']
hist_ax = plt.figure(figsize=(8, 8)).gca()
hist_ax.hist(service_calls)
plt.show()

In [None]:
# Distribution of evening minutes for each Churn value, as violins.
sns.violinplot(data=data, x='Churn', y='Total eve minutes')
plt.show()

In [None]:
# Distribution of evening minutes for each Churn value, as box plots.
sns.boxplot(data=data, x='Churn', y='Total eve minutes')
plt.show()

In [None]:
# Joint 2-D histogram of evening vs. night minutes with marginal histograms.
sns.jointplot(
    x='Total eve minutes',
    y='Total night minutes',
    data=data,
    kind="hist",
)
plt.show()

In [None]:
# Stacked 100-bin count histogram of evening and day minutes on shared bins.
minute_series = [data['Total eve minutes'], data['Total day minutes']]
plt.figure(figsize=(8, 8))
plt.hist(minute_series, bins=100, stacked=True, density=False)
plt.show()

In [None]:
# Relative-frequency histogram of customer service calls over integer bin edges 0..10.
counts, bins = np.histogram(data['Customer service calls'], bins=range(11))
factor = 1 / counts.sum()  # scales the counts so the bar heights sum to 1
plt.figure(figsize=(8, 8))
# Re-draw the already-binned data: one sample at each left edge, weighted by its share.
plt.hist(bins[:-1], bins, weights=counts * factor)
plt.show()

In [None]:
# Customer service calls, stacked by Churn value.
# (multiple="fill" instead would normalise every bin to a height of 1.)
sns.set(rc={"figure.figsize": (8, 8)})
sns.histplot(
    data=data,
    x='Customer service calls',
    hue='Churn',
    multiple="stack",
    legend=True,
)
plt.show()

In [None]:
# Customer service calls by Churn value, drawn with multiple="fill" so each bin
# shows proportions rather than counts.
sns.set(rc={"figure.figsize": (8, 8)})
sns.histplot(
    data=data,
    x='Customer service calls',
    hue='Churn',
    multiple="fill",
    legend=True,
)
plt.show()

In [None]:
# Compare the number of international calls made by churners and non-churners.
# `ttest_ind` (scipy.stats) is imported in the notebook's top import cell.
churners = data[data['Churn'] == True]
non_churners = data[data['Churn'] == False]
c_ic = churners['Total intl calls']
nc_ic = non_churners['Total intl calls']
# Independent two-sample t-test. With scipy's default equal_var=True this is the
# pooled-variance Student's test; pass equal_var=False for Welch's variant.
t_stat, p_value = ttest_ind(c_ic, nc_ic)
print("T-statistic value: ", t_stat)
print("P-Value: ", p_value)

In [None]:
# Evening minutes vs. day minutes, points coloured by Churn value.
sns.set(rc={"figure.figsize": (8, 8)})
g = sns.scatterplot(
    data=data,
    x='Total eve minutes',
    y='Total day minutes',
    hue='Churn',
    s=15,
);

In [None]:
# Day minutes vs. customer service calls, points coloured by Churn value.
sns.set(rc={"figure.figsize": (8, 8)})
g = sns.scatterplot(
    data=data,
    x='Total day minutes',
    y='Customer service calls',
    hue='Churn',
    s=15,
);

In [None]:
# Scatter matrix of the columns at positions 6, 7 and 8.
# NOTE(review): presumably the three 'Total day ...' columns — confirm against data.columns.
matrix_columns = data.iloc[:, [6, 7, 8]]
px.scatter_matrix(matrix_columns, width=550, height=450)

In [None]:
# Pearson correlation (statistic and p-value) for each pair of the three day-usage columns.
mins = data['Total day minutes']
calls = data['Total day calls']
charge = data['Total day charge']

for first, second in ((mins, calls), (mins, charge), (calls, charge)):
    print(st.pearsonr(first, second))

In [None]:
# Regress 'Total eve minutes' on 'Total day minutes' two ways: scipy's linregress
# supplies the line that is plotted, scikit-learn supplies the printed summary.
x = data['Total day minutes']
y = data['Total eve minutes']

# scipy fit on the 1-D Series; slope and intercept are used by myfunc below.
slope, intercept, r, p, std_err = st.linregress(x, y)

# scikit-learn expects a 2-D feature matrix, hence the reshape to a single column.
x = np.array(x).reshape((-1, 1))
y = np.array(y)
# Fit once (an earlier fit on a separate instance was immediately overwritten).
model = LinearRegression().fit(x, y)
r_sq = model.score(x, y)
print(f"coefficient of determination: {r_sq}")
print(f"intercept: {model.intercept_}")
print(f"slope: {model.coef_}")
y_pred = model.predict([[5]])
print(f"predicted response:\n{y_pred}")


def myfunc(x):
    """Return the value of the scipy-fitted line at ``x`` (scalar or NumPy array)."""
    return slope * x + intercept


# Evaluate the fitted line for the whole column in one vectorised call instead of
# mapping the function over the rows of the 2-D array.
mymodel = myfunc(x)
plt.figure(figsize=(8, 8))
plt.scatter(x, y, s=5)
plt.plot(x, mymodel)
plt.show()
