# Churn prediction 25/26 - Project for Data Science and AI for Business
## Authors: Andreea Patarlageanu and Martin Lau

In [1]:
# We will put all imports here
import pandas as pd
import numpy as np


We will start with an exploratory data analysis in order to better understand the data and format it efficiently for later.

First, let's read the data and understand what is it about.

In [2]:
df = pd.read_parquet("Data/train.parquet")

In [3]:
df.shape

(17499636, 19)

In [35]:
df.head(5)

Unnamed: 0,userId,firstName,lastName,gender,registration,sessionId,itemInSession,ts,time,level,auth,location,userAgent,page,method,status,song,artist,length
0,1749042,Shlok,Johnson,1,2018-08-08 13:22:21,22683,278,1538352001000,2018-10-01 00:00:01,1,1,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,1,200,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,524.32934
992,1749042,Shlok,Johnson,1,2018-08-08 13:22:21,22683,279,1538352525000,2018-10-01 00:08:45,1,1,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,1,200,Monster (Album Version),Skillet,178.02404
1360,1749042,Shlok,Johnson,1,2018-08-08 13:22:21,22683,280,1538352703000,2018-10-01 00:11:43,1,1,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,1,200,Seven Nation Army,The White Stripes,232.61995
1825,1749042,Shlok,Johnson,1,2018-08-08 13:22:21,22683,281,1538352935000,2018-10-01 00:15:35,1,1,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,1,200,Under The Bridge (Album Version),Red Hot Chili Peppers,265.50812
2366,1749042,Shlok,Johnson,1,2018-08-08 13:22:21,22683,282,1538353200000,2018-10-01 00:20:00,1,1,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,1,200,Circlesong 6,Bobby McFerrin,471.69261


In [5]:
print(df.columns)
print()
print(df.dtypes)

Index(['status', 'gender', 'firstName', 'level', 'lastName', 'userId', 'ts',
       'auth', 'page', 'sessionId', 'location', 'itemInSession', 'userAgent',
       'method', 'length', 'song', 'artist', 'time', 'registration'],
      dtype='object')

status                    int64
gender                   object
firstName                object
level                    object
lastName                 object
userId                   object
ts                        int64
auth                     object
page                     object
sessionId                 int64
location                 object
itemInSession             int64
userAgent                object
method                   object
length                  float64
song                     object
artist                   object
time             datetime64[us]
registration     datetime64[us]
dtype: object


We see that the order of the columns is not good. Moreover, some types can be changed for a better model comprehension.

First, let's reorder the columns like following:
- *User identifiers*
- *session information*
- *timestamps*
- *user context*
- *action data*
- *content listened to*

In [6]:
order_columns = [
    'userId',
    'firstName', 
    'lastName',
    'gender',
    'registration',
    
    'sessionId',
    'itemInSession',
    
    'ts',
    'time',
    
    'level',
    'auth',
    'location',
    'userAgent',
    
    'page',
    'method',
    'status',
    
    'song',
    'artist',
    'length'
]

In [7]:
df = df[order_columns]

Now, let's inspect a bit more the content of the columns that we think it would make sense to change the type:

In [8]:
print(df['status'].unique())
print(df['level'].unique())
print(df['page'].unique())
print(df['method'].unique())
print(df['auth'].unique())
print(df['page'].unique())

[200 307 404]
['paid' 'free']
['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']
['PUT' 'GET']
['Logged In' 'Cancelled']
['NextSong' 'Downgrade' 'Help' 'Home' 'Thumbs Up' 'Add Friend'
 'Thumbs Down' 'Add to Playlist' 'Logout' 'About' 'Settings'
 'Save Settings' 'Cancel' 'Cancellation Confirmation' 'Submit Downgrade'
 'Roll Advert' 'Upgrade' 'Error' 'Submit Upgrade']


Now, let's change some of the columns in the proper types:

- gender should be binary: 0 for 'F' and 1 for 'M'
- the level, meaning the level of the subscription, can be free or paid so we set: 0 for 'free' and 1 for 'paid'
- the page action shows the actions of the users/pages visited. Since there are many categories, we will leave that column's type to be 'object'
- the method (HTTP request methods) we set to be 0 for GET (retrieving/viewing data - reading) and 1 for PUT (sending/updating data - writing, like saving a song)
- for 'auth', let's set 0 for 'Cancelled' and 1 for 'Logged In'

In [36]:
df["gender"] = df["gender"].map({'F':0, 'M':1})
df["level"] = df["level"].map({'free' : 0, 'paid': 1})
df["method"] = df["method"].map({'GET' : 0, 'PUT' : 1 } )
df["auth"] = df["auth"].map({'Cancelled' : 0, 'Logged In' : 1 } )

Moreover, we see that 'userId''s type is object. Let's see if there are values in this column which cannot be converted to int:

In [37]:
try:
    df['userId'].astype(int)
    print("Only integers in userId")
except Exception as e:
    print(f"Some non integer values in userId: {e}")

Only integers in userId


Good! Then we can convert the column 'userId' to int:

In [38]:
df['userId'] = df['userId'].astype(int)

Let's go again through the types of the columns:

In [39]:
df.dtypes

userId                    int64
firstName                object
lastName                 object
gender                  float64
registration     datetime64[us]
sessionId                 int64
itemInSession             int64
ts                        int64
time             datetime64[us]
level                   float64
auth                    float64
location                 object
userAgent                object
page                     object
method                  float64
status                    int64
song                     object
artist                   object
length                  float64
dtype: object

In [40]:
df.head()

Unnamed: 0,userId,firstName,lastName,gender,registration,sessionId,itemInSession,ts,time,level,auth,location,userAgent,page,method,status,song,artist,length
0,1749042,Shlok,Johnson,,2018-08-08 13:22:21,22683,278,1538352001000,2018-10-01 00:00:01,,,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,,200,Ich mache einen Spiegel - Dream Part 4,Popol Vuh,524.32934
992,1749042,Shlok,Johnson,,2018-08-08 13:22:21,22683,279,1538352525000,2018-10-01 00:08:45,,,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,,200,Monster (Album Version),Skillet,178.02404
1360,1749042,Shlok,Johnson,,2018-08-08 13:22:21,22683,280,1538352703000,2018-10-01 00:11:43,,,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,,200,Seven Nation Army,The White Stripes,232.61995
1825,1749042,Shlok,Johnson,,2018-08-08 13:22:21,22683,281,1538352935000,2018-10-01 00:15:35,,,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,,200,Under The Bridge (Album Version),Red Hot Chili Peppers,265.50812
2366,1749042,Shlok,Johnson,,2018-08-08 13:22:21,22683,282,1538353200000,2018-10-01 00:20:00,,,"Dallas-Fort Worth-Arlington, TX","""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebK...",NextSong,,200,Circlesong 6,Bobby McFerrin,471.69261


Good! Now, let's start inspecting the missing values.

In [41]:
df.isnull().sum()

userId                  0
firstName               0
lastName                0
gender           17499636
registration            0
sessionId               0
itemInSession           0
ts                      0
time                    0
level            17499636
auth             17499636
location                0
userAgent               0
page                    0
method           17499636
status                  0
song              3208203
artist            3208203
length            3208203
dtype: int64

We see that the columns 'song', 'artist' and 'length' have the same number of missing values. This might indicate the possibility of having the missing values in the same lines for all 3 columns. Let's check that:

In [42]:
song_missing = df['song'].isnull()
artist_missing = df['artist'].isnull()
length_missing = df['length'].isnull()

# Let's check if they all occur in the same rows:
print("Are all missing in the same rows? Answer: ", (song_missing == artist_missing).all() and (artist_missing == length_missing).all() )

Are all missing in the same rows? Answer:  True


So we see that all the missing values of the 3 columns appear in the same lines. Let's check what percentage of the data all these lines represent:

In [43]:
rows_missing_values = song_missing.sum()
percentage = rows_missing_values / df.shape[0] * 100
print(f"Percentage of rows with missing values: {percentage:.2f}%" )

Percentage of rows with missing values: 18.33%


#TODO: should we delete or keep??????????????????

Now, we will just make sure that the values in 'registration' and 'time' make sense (not too long ago, or inthe future):

In [44]:
print("Range of dates in the registration column:")
print(f"Earliest registration: {df['registration'].min()}")
print(f"Latest registration: {df['registration'].max()}")

print()

print("Range of dates in the time column:")
print(f"Earliest time: {df['time'].min()}")
print(f"Latest time: {df['time'].max()}")

Range of dates in the registration column:
Earliest registration: 2017-10-14 22:05:25
Latest registration: 2018-11-19 23:34:34

Range of dates in the time column:
Earliest time: 2018-10-01 00:00:01
Latest time: 2018-11-20 00:00:00


We see that the earliest time was almost one year after the earliest registration.

#TODO: what do we do in this case?????????????????????????????????????

It may also be helpful to see what the example submission looks like, to see what is expected.

In [50]:
target = pd.read_csv("Data/example_submission.csv")

In [51]:
target.head(10)

Unnamed: 0,id,target
0,1128274,0
1,1782451,1
2,1611542,1
3,1241663,0
4,1653104,1
5,1486416,1
6,1442381,1
7,1117734,1
8,1212570,1
9,1755776,1


This lets us see what we need to predict, and 

In [52]:
df_test = pd.read_parquet("Data/test.parquet")

True

In [55]:
len(df_test['userId'].unique())

2904