## Importing the necessary Libraries 

In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder , OneHotEncoder
from sklearn.ensemble import RandomForestClassifier 
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import RandomForestClassifier 
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Load the churn dataset from disk and preview its first rows
DATA_PATH = 'FinalChurn.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,voice.messages,intl.mins,intl.calls,intl.charge,day.mins,day.calls,day.charge,eve.mins,eve.calls,eve.charge,night.mins,night.calls,night.charge,customer.calls,churn,voice.plan,intl.plan
0,25,10.0,3,2.7,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,1,0,1,0
1,26,13.7,3,3.7,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,1,0,1,0
2,0,12.2,5,3.29,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,0,0,0,0
3,0,6.6,7,1.78,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,2,0,0,1
4,0,10.1,3,2.73,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,3,0,0,1


In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(5000, 17)

In [4]:
# Summary statistics, transposed so that each feature becomes a row
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
voice.messages,5000.0,7.7552,13.546393,0.0,0.0,0.0,17.0,52.0
intl.mins,5000.0,10.26178,2.761396,0.0,8.5,10.3,12.0,20.0
intl.calls,5000.0,4.4352,2.456788,0.0,3.0,4.0,6.0,20.0
intl.charge,5000.0,2.771196,0.745514,0.0,2.3,2.78,3.24,5.4
day.mins,5000.0,180.2889,53.894699,0.0,143.7,180.1,216.2,351.5
day.calls,5000.0,100.0294,19.831197,0.0,87.0,100.0,113.0,165.0
day.charge,5000.0,30.653501,9.159936,0.0,24.43,30.625,36.75,59.76
eve.mins,5000.0,200.580326,50.433135,0.0,166.6,200.8,233.9,363.7
eve.calls,5000.0,100.191,19.826496,0.0,87.0,100.0,114.0,170.0
eve.charge,5000.0,17.054322,4.296843,0.0,14.14,17.09,19.9,30.91


In [5]:
# Class distribution of the target column
df.churn.value_counts()

0    4293
1     707
Name: churn, dtype: int64

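    0  -> Not Churned  
    1  -> Churned

The classes are imbalanced: 4293 customers (about 86%) did not churn and only 707 (about 14%) churned.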

In [6]:
# Separate the target column (Y) from the predictors (X)
Y = df['churn']
X = df.drop(columns='churn')

In [7]:
# Print the feature matrix as plain text (long frames are shown truncated)
print(X)

      voice.messages  intl.mins  intl.calls  intl.charge  day.mins  day.calls  \
0                 25       10.0           3         2.70     265.1        110   
1                 26       13.7           3         3.70     161.6        123   
2                  0       12.2           5         3.29     243.4        114   
3                  0        6.6           7         1.78     299.4         71   
4                  0       10.1           3         2.73     166.7        113   
...              ...        ...         ...          ...       ...        ...   
4995              40        9.9           5         2.67     235.7        127   
4996               0       14.7           2         3.97     184.2         90   
4997               0       13.6           4         3.67     140.6         89   
4998               0        8.5           6         2.30     188.8         67   
4999              34        9.3          16         2.51     129.4        102   

      day.charge  eve.mins 

In [8]:
# Print the target Series (the churn labels)
print(Y)

0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    1
4997    0
4998    0
4999    0
Name: churn, Length: 5000, dtype: int64


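Splitting the data and performing oversampling. The data is split first and only the training portion is oversampled, so the test set keeps the original class balance.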

In [9]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the churn
# ratio the same in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
    stratify=Y,
)

In [10]:
# Balance the classes by randomly duplicating minority-class rows.
# Only the training split is resampled; the test set is left untouched.
# A fixed random_state makes the resampled training set identical on every
# run, so downstream scores can be reproduced after Restart & Run All.
RS = RandomOverSampler(random_state=2)
X_train_rs, Y_train_rs = RS.fit_resample(X_train, Y_train)

In [11]:
# Feature-matrix shapes: full data, oversampled training set, test set
print(X.shape, X_train_rs.shape, X_test.shape)

(5000, 16) (6868, 16) (1000, 16)


In [12]:
# Target shapes: full data, oversampled training set, test set
print(Y.shape,Y_train_rs.shape,Y_test.shape)

(5000,) (6868,) (1000,)


Training the Model 

In [13]:
# Random forest with default hyperparameters. The fixed random_state pins the
# bootstrap samples and per-split feature choices, so the fitted model and the
# scores reported below are repeatable from run to run.
RFC = RandomForestClassifier(random_state=2)

In [14]:
# Train the random forest classifier on the oversampled training data.
# NOTE(review): scikit-learn's fit() returns the estimator itself, so `model`
# should be the same object as `RFC` — confirm before relying on either name.
model=RFC.fit(X_train_rs,Y_train_rs)

Model Evaluation 

In [15]:
# Accuracy on the (oversampled) training data.
# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground
# truth goes first. Accuracy happens to be symmetric, but keeping the documented
# order avoids silent errors if the metric is later swapped for an asymmetric
# one such as precision or recall.
X_train_prediction = RFC.predict(X_train_rs)
training_data_accuracy = accuracy_score(Y_train_rs, X_train_prediction)

In [16]:
# Report the accuracy measured on the training data
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  1.0


In [17]:
# Accuracy on the held-out test data (never seen during training or resampling).
# accuracy_score is documented as accuracy_score(y_true, y_pred): the ground
# truth goes first, the model's predictions second.
X_test_prediction = RFC.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [18]:
# Report the accuracy measured on the held-out test set
# (the label previously said "training data" by mistake).
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the training data :  0.952


In [19]:
# One hand-written customer record; values are listed in the column order of X
input_data = (30,10,3,2.5,250,120,35.5,195,100,16,220,95,10,4,1,1)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# The model was fitted on a DataFrame, so hand it a one-row DataFrame with the
# same column names. This avoids scikit-learn's "X does not have valid feature
# names" warning, and the DataFrame constructor raises if the record does not
# have exactly one value per training column.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = RFC.predict(input_frame)
print(prediction)

if prediction[0] == 0:
    print('The person has not churned')
else:
    print('The person has churned')

[0]
The person has not churned




Saving the Trained Model  

In [20]:
import pickle 

In [21]:
# Serialize the fitted classifier to disk.
# The context manager closes the file handle (flushing the pickle to disk)
# even if dump() raises, instead of leaking an open handle.
filename = 'trained_model.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(RFC, model_file)

In [22]:
# Loading the saved model.
# The context manager closes the file handle once the model is read.
# SECURITY: pickle.load can execute arbitrary code; only unpickle files that
# were produced by this notebook or come from a trusted source.
with open('trained_model.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [23]:
# Same hand-written customer record as before, now scored by the reloaded
# model to check that the pickle round trip preserved it.
input_data = (30,10,3,2.5,250,120,35.5,195,100,16,220,95,10,4,1,1)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Prefer the column names stored on the fitted model (available in recent
# scikit-learn releases); fall back to the training frame's columns otherwise.
# Passing a named one-row DataFrame avoids the feature-name warning and makes
# a wrong number of values fail loudly.
feature_names = getattr(loaded_model, 'feature_names_in_', X.columns)
input_frame = pd.DataFrame(input_data_reshaped, columns=feature_names)

prediction = loaded_model.predict(input_frame)
print(prediction)

if prediction[0] == 0:
    print('The person has not churned')
else:
    print('The person has churned')

[0]
The person has not churned


