# Machine Learning Project

## 1. Data description

We have a data about user behavior before subscription and after 3 weeks. We need to detect users who churned.

We have following features:  
* type of subscription
* locale of interface
* have news subscription
* number of books added to library
* number of finished books
* use or not promocode
* number of following shelves
* number of friends
* number of quotes
* number of likes
* length of sessions
* from what chanel subscriptions was buying
* e.t.c.

In [50]:
import pandas as pd
import numpy as np

df = pd.read_csv('churned_data.csv', index_col='user_id')
columns = df.columns
print('Number of features = {}'.format(len(columns)))
print('Number of samples = {}'.format(df.shape[0]))
# for i in columns:
#     print(i)

Number of features = 100
Number of samples = 21516


In [203]:
df.describe()

Unnamed: 0,kind,locale,subscription_news,added_to_lib_0_week,added_to_lib_1_week,added_to_lib_2_week,added_to_lib_3_week,finished_books_0_week,finished_books_1_week,finished_books_2_week,...,avg_pop_dyn_21,avg_pop_dyn_31,avg_pop_dyn_32,count_session_per_day_1,count_session_per_day_2,count_session_per_day_3,AndroidInApp,Braintree,InApp,churned
count,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,...,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0,21516.0
mean,0.261154,0.962307,0.969929,8.8872,5.854155,1.735546,1.375813,0.660578,0.649191,0.291318,...,116.034436,108.47143,286.8802,2.253295,1.408744,1.133262,0.103272,0.126975,0.769753,0.5
std,0.439274,0.190457,0.170786,32.212863,19.669411,8.193867,10.443027,5.896316,1.686822,1.413622,...,2101.691247,2084.790122,3769.629078,1.259408,1.479215,1.381644,0.304321,0.332953,0.421001,0.500012
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.999998,-0.999998,-0.999998,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.998555,-0.99881,0.0,1.333333,0.0,0.0,0.0,0.0,1.0,0.0
50%,0.0,1.0,1.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.916085,-0.968868,0.0,2.0,1.0,1.0,0.0,0.0,1.0,0.5
75%,1.0,1.0,1.0,6.0,5.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,3.0,2.166667,2.0,0.0,0.0,1.0,1.0
max,1.0,1.0,1.0,1480.0,870.0,425.0,1091.0,326.0,51.0,153.0,...,203030.0,176179.0,357155.0,9.857143,23.5,11.166667,1.0,1.0,1.0,1.0


In [105]:
# avg_session - median_session
df = pd.read_csv('churned_data.csv', index_col='user_id')
df['sessions_skewness_1'] = df['avg_sessions_1'] - df['median_sessions_1']
df['sessions_skewness_2'] = df['avg_sessions_2'] - df['median_sessions_2']
df['sessions_skewness_3'] = df['avg_sessions_3'] - df['median_sessions_3']

## 1. PySpark MLlib

In [1]:
from pyspark.mllib.classification import LabeledPoint

In [2]:
def parse_point(line):
    values = [float(x) for x in line.split(',')]
    return LabeledPoint(values[0], values[1:])

In [3]:
parsed_data = sc.textFile("churned_data_spark.csv").map(parse_point)

In [4]:
def TP_TN_FP_FN(prediction_and_labels):
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    TP = prediction_and_labels.filter(lambda (x, y): x == 1 and x == y).count()
    TN = prediction_and_labels.filter(lambda (x, y): x == 0 and x == y).count()
    FP = prediction_and_labels.filter(lambda (x, y): x == 1 and x != y).count()
    FN = prediction_and_labels.filter(lambda (x, y): x == 0 and x != y).count()
    
    return [TP, TN, FP, FN]

In [70]:
def confusion_matrix(arr):
    print('{} --- {}'.format(arr[0], arr[2]))
    print('{} --- {}'.format(arr[3], arr[1]))

### SVM Algorithm

In [6]:
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [7]:
from pyspark.mllib.classification import SVMWithSGD, SVMModel

create 5 folds cross-validation

In [72]:
accuracy = []
for _ in xrange(5):
    tr_rdd, test_rdd = parsed_data.randomSplit([0.75, 0.25])
    SVM_model = SVMWithSGD.train(tr_rdd, iterations=100, step=1, regParam=0.01, miniBatchFraction=0.5)
    prediction_label = test_rdd.map(lambda p: (float(SVM_model.predict(p.features)), p.label))
    stat_arr = TP_TN_FP_FN(prediction_label)
    accuracy.append((stat_arr[0] + stat_arr[1]) / float(sum(stat_arr)))

In [73]:
print(accuracy)

[0.4928050831620258, 0.502851885924563, 0.49953139643861294, 0.5447636700648749, 0.4987724268177526]


In [74]:
print(np.mean(accuracy))

0.507744892482


In [75]:
confusion_matrix(stat_arr)

14 --- 5
2649 --- 2627


### Logistic Regression

In [178]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS, LogisticRegressionModel

In [179]:
accuracy = []
for _ in xrange(5):
    tr_rdd, test_rdd = parsed_data.randomSplit([0.75, 0.25])
    LR_model = LogisticRegressionWithLBFGS.train(tr_rdd)
    prediction_label = test_rdd.map(lambda p: (float(LR_model.predict(p.features)), p.label))
    stat_arr = TP_TN_FP_FN(prediction_label)
    accuracy.append((stat_arr[0] + stat_arr[1]) / float(sum(stat_arr)))

In [180]:
print(accuracy)

[0.6052290005562767, 0.5983712752174718, 0.609220241311003, 0.6026022304832713, 0.6130930489150423]


In [181]:
print(np.mean(accuracy))

0.605703159297


In [182]:
confusion_matrix(stat_arr)

1860 --- 1236
868 --- 1474


## 2. Using XGBoost

In [56]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score
from sklearn.cross_validation import train_test_split

In [57]:
df = pd.read_csv('churned_data.csv', index_col='user_id')
X_data = df.drop('churned', axis=1).values
y_data = df['churned'].values

In [58]:
accuracy = []
precision = []
recall = []

In [59]:
xgb_model = xgb.XGBClassifier(max_depth=12,
                              learning_rate=0.001,
                              n_estimators=50,
                              subsample=0.6,
                              colsample_bytree=1.0,
                              nthread=4)

In [60]:
for _ in xrange(5):
    X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.25)
    xgb_model.fit(X_train, y_train)
    y_test_pred = xgb_model.predict(X_test)
    accuracy.append(accuracy_score(y_test, y_test_pred))
    precision.append(precision_score(y_test, y_test_pred))
    recall.append(recall_score(y_test, y_test_pred))

In [61]:
print(accuracy)
print(precision)
print(recall)
print('Mean accuracy on 5-folds cross validation = {}'.format(np.mean(accuracy)))
print('Mean precision on 5-folds cross validation = {}'.format(np.mean(precision)))
print('Mean recall on 5-folds cross validation = {}'.format(np.mean(recall)))

[0.77579475738984938, 0.77876928797174194, 0.78750697155605132, 0.78304517568321252, 0.77858337981037362]
[0.73196537351715296, 0.73774586325320013, 0.73944954128440366, 0.73099596649084708, 0.73800623052959502]
[0.86053524312099505, 0.87099152230003685, 0.89258028792912514, 0.88704819277108438, 0.87127620448694376]
Mean accuracy on 5-folds cross validation = 0.780739914482
Mean precision on 5-folds cross validation = 0.735632595015
Mean recall on 5-folds cross validation = 0.876486290122


In [62]:
print(confusion_matrix(y_test, y_test_pred))

[[1819  841]
 [ 350 2369]]


## 3. Theanets

In [16]:
import theanets
import pandas as pd
import numpy as np

In [17]:
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [18]:
df = pd.read_csv('churned_data.csv', index_col='user_id')
X_data = df.drop('churned', axis=1).values
y_data = df['churned'].values

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size=0.25)
print(X_train.shape[1])
y_train = y_train.astype(np.int32)
y_test = y_test.astype(np.int32)
y_validation = y_validation.astype(np.int32)

99


In [22]:
hidden = 50
simple_net = theanets.Classifier(layers=[99, hidden, 30, (2, 'softmax')])
simple_net.train(
    (X_train, y_train),
    (X_validation, y_validation),
    algo='nag',
    learning_rate=1e-1,
    momentum=0.8
)

y_predicted = simple_net.predict(X_test)
print('Confusion matrix')
print(confusion_matrix(y_predicted, y_test))
print('Accuracy score = {}'.format(accuracy_score(y_predicted, y_test)))

Confusion matrix
[[2155 2146]
 [   3    0]]
Accuracy score = 0.500697026022
