# Logistic Regression

In this notebook, we apply Logistic Regression to our data and we try to predict 'churn'.

In [1]:
# All imports will be here:
import pandas as pd
import numpy as np
from utils import import_and_transform
from utils import evaluate_model
from utils import aggregate
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, precision_recall_curve)
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


We apply the 'import_and_transform' function on our dataset in order to get it into the preffered shape.

In [2]:
raw_data = pd.read_parquet("Data/train.parquet")

In [3]:
user_data = import_and_transform(raw_data)

Using Dataframe


In [4]:
user_data = aggregate(user_data)

Processed 19140 users
Churn rate: 22.31%


In [5]:
user_data.head(1)

Unnamed: 0,userId,gender,registration,level,num_sessions,max_item_in_session,ts_min,ts_max,avg_session_length_seconds,num_songs_played,unique_artists,total_length,days_active,membership_length,churned
0,1000025,1,2018-07-10 09:30:08,1,17,486,2018-10-02 08:59:29,2018-10-18 20:33:05,49322.882294,1662,1162,417296.59169,16,100,1


In [6]:
features_to_drop = [
    'registration',
    'ts_min',
    'ts_max',
]
user_data.set_index('userId', inplace=True)
user_data.drop(columns = features_to_drop, inplace = True)

In [12]:
user_data.head(1)

Unnamed: 0_level_0,gender,level,num_sessions,max_item_in_session,avg_session_length_seconds,num_songs_played,unique_artists,total_length,days_active,membership_length,churned
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1000025,1,1,17,486,49322.882294,1662,1162,417296.59169,16,100,1


In [13]:
# SPlit in train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    user_data.drop(columns = ['churned']),
    user_data['churned'],
    test_size = 0.2,
    random_state = 42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [14]:
log_reg = LogisticRegression(class_weight='balanced')

log_reg.fit(X_train_scaled, y_train)
y_pred = log_reg.predict(X_test_scaled)
accuracy = accuracy_score(y_pred, y_test)

print(accuracy)

0.7789968652037618


In [23]:
test_data = import_and_transform("Data/test.parquet")
test_data = aggregate(test_data)
test_data.set_index('userId', inplace=True)
test_data.drop(columns = features_to_drop, inplace = True)

Importing parquet file
Processed 2904 users
Churn rate: 0.00%


In [24]:
test_data.drop(columns = ['churned'], inplace = True)

In [25]:
test_data_scaled = pd.DataFrame(
    scaler.transform(test_data),
    index=test_data.index,
    columns=test_data.columns
)

In [26]:
evaluate_model(log_reg, test_data)

Submission saved to submission.csv




In [28]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

grid_search = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=1000),
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1
)
grid_search.fit(X_train, y_train)
print(f"Best params: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

Best params: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best CV score: 0.8419




In [29]:
evaluate_model(grid_search.best_estimator_, test_data)

Submission saved to submission.csv
