## Preprocessing

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

### Import Data

In [2]:
df = pd.read_csv('tele_clean.csv')
df.drop(columns = ['customerID', 'gender'], inplace = True)
df.head()

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,PhoneService,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,Churn,ExtraCharges
0,No,Yes,No,1,No,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No,0.0
1,No,No,No,34,OneLine,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,No,-46.8
2,No,No,No,2,OneLine,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,Yes,0.45
3,No,No,No,45,No,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,No,-62.75
4,No,No,No,2,OneLine,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,Yes,10.25


In [3]:
df.dtypes

SeniorCitizen        object
Partner              object
Dependents           object
tenure                int64
PhoneService         object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
Churn                object
ExtraCharges        float64
dtype: object

## OneHotEncoding

In [4]:
y = df[['Churn']]
X = df.drop(columns = 'Churn')
X = pd.get_dummies(X, columns = ['SeniorCitizen', 'Partner',
                                'Dependents', 'PhoneService', 'InternetService',
                                'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                'TechSupport', 'StreamingTV', 'StreamingMovies',
                                'Contract', 'PaperlessBilling', 'PaymentMethod'], drop_first = True)
print(X.shape)
print(X.columns)

(7043, 28)
Index(['tenure', 'MonthlyCharges', 'ExtraCharges', 'SeniorCitizen_Yes',
       'Partner_Yes', 'Dependents_Yes', 'PhoneService_No',
       'PhoneService_OneLine', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No internet service',
       'OnlineSecurity_Yes', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No internet service',
       'DeviceProtection_Yes', 'TechSupport_No internet service',
       'TechSupport_Yes', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No internet service', 'StreamingMovies_Yes',
       'Contract_One year', 'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Credit card (automatic)',
       'PaymentMethod_Electronic check', 'PaymentMethod_Mailed check'],
      dtype='object')


### Train Test Split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123, stratify = y)

### Standardizing

In [6]:
num_cols = df.select_dtypes(include = ['int64', 'float64']).columns
sc = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = sc.transform(X_train[num_cols], copy = False)
X_test[num_cols] = sc.transform(X_test[num_cols], copy = False)
X_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set 

Unnamed: 0,tenure,MonthlyCharges,ExtraCharges,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_No,PhoneService_OneLine,InternetService_Fiber optic,InternetService_No,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
1487,0.964787,1.402679,-0.995859,0,0,0,0,0,1,0,...,0,1,0,1,1,0,1,0,1,0
1485,-0.416288,1.066588,-0.668443,1,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,1,0
4731,-0.538148,-1.489029,-0.309703,0,1,1,0,1,0,1,...,1,0,1,0,1,0,0,0,0,1
5987,-1.269305,-1.474055,-0.005408,0,1,1,0,1,0,1,...,1,0,1,0,0,1,0,0,0,1
5058,-0.091329,-0.527345,1.079762,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,1


In [7]:
X_test.head()

Unnamed: 0,tenure,MonthlyCharges,ExtraCharges,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_No,PhoneService_OneLine,InternetService_Fiber optic,InternetService_No,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
3871,-1.188066,1.227978,-0.099381,0,0,0,0,0,1,0,...,0,1,0,1,0,0,1,0,0,0
6975,-0.335048,-0.009899,0.207151,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2649,1.289746,-1.31932,-0.232138,0,1,0,0,0,0,1,...,1,0,1,0,0,1,0,1,0,0
4061,-0.741247,-0.484086,0.115415,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4076,1.533465,1.154771,-0.154572,0,1,0,0,0,1,0,...,0,1,0,0,0,1,1,1,0,0


### Oversampling

In [14]:
from collections import Counter
print('Original dataset shape:' + str(y_train.shape))
sm = SMOTE(random_state=123)
X_res, y_res = sm.fit_resample(X_train, y_train)
print('Resampled dataset shape:' + str(y_res.shape))

Original dataset shape:(5282, 1)
Resampled dataset shape:(7760, 1)


In [15]:
train = pd.concat([X_res, y_res], axis = 1)
train.to_csv('train.csv', header = True, index = False)
test = pd.concat([X_test, y_test], axis = 1)
test.to_csv('test.csv', header = True, index = False)