In [27]:
import os
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import tensorflow as tf
import tqdm

In [21]:
# The datasets are too large -- do not push them to GitHub!
# Download the archives to the local machine and place them in `mypath`.
mypath = "data"
# listdir() returns entries in arbitrary order, so sort for a reproducible
# listing. Keep only regular files ending in ".7z" so that stray files in the
# directory are never handed to the 7z extractor further down.
files = sorted(
    mypath + '/' + f
    for f in listdir(mypath)
    if isfile(join(mypath, f)) and f.endswith(".7z")
)


Create a `data` directory and move these 5 files into it.

The `files` list built from `listdir(mypath)` should then contain:

data/members_v3.csv.7z 

data/sample_submission_zero.csv.7z

data/train.csv.7z

data/transactions.csv.7z

data/user_logs.csv.7z

In [22]:
# Show the archive paths collected in `files` as a sanity check.
files

['data/members_v3.csv.7z',
 'data/sample_submission_zero.csv.7z',
 'data/train.csv.7z',
 'data/transactions.csv.7z',
 'data/user_logs.csv.7z']

In [42]:
# Make sure the extraction target directory exists.
# isdir() confirms that an existing "unpacked_data" entry really is a directory
# (a name lookup in listdir() would also match a regular file), and
# exist_ok=True keeps the creation safe when the cell is re-run.
if os.path.isdir("./unpacked_data"):
    print("Dictionary is already created")
else:
    os.makedirs("./unpacked_data", exist_ok=True)

Dictionary is already created


In [29]:
# Unpack every archive listed in `files` into `extract_path`.
# This is slow: the recorded run below took about 24 minutes for 5 archives.
import py7zr


extract_path = "unpacked_data"
for file in tqdm.tqdm(files):
    # The context manager closes the archive even if extraction fails.
    with py7zr.SevenZipFile(file, mode='r') as z:
        # extractall() writes to disk and returns None, so nothing is kept.
        z.extractall(path=extract_path)

100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [23:38<00:00, 283.70s/it]


In [33]:
# Read the three unpacked CSV files, in the same order as the names on the
# left-hand side: user logs, members, transactions.
user_data, member_dat, transactions_dat = (
    pd.read_csv(csv_path)
    for csv_path in (
        "unpacked_data/user_logs.csv",
        "unpacked_data/members_v3.csv",
        'unpacked_data/transactions.csv',
    )
)

In [34]:
'''
Helper functions that downcast column dtypes according to each column's value range, to save memory.

For integers:
For example, int8 can store values from -128 to 127;
if every value in a column falls within that range,
we change the column's data type to int8.
The same check is repeated with int16 and int32 for all integer columns.

For floats:
We change all float columns to float32.
Source: https://www.kaggle.com/jeru666/did-you-think-of-these-features/notebook
'''

def change_datatype(df):
    """Downcast the integer columns of ``df`` in place to the narrowest signed dtype.

    Every column returned by ``df.select_dtypes(include=['int'])`` is cast to
    the first of int8, int16, int32 whose range contains both the column
    minimum and maximum; if none fits, the column is cast to int64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the selected columns are reassigned on this object.

    Returns
    -------
    None
    """
    int_cols = list(df.select_dtypes(include=['int']).columns)
    for col in int_cols:
        # Scan the column once for its extremes instead of re-scanning it
        # (and building a boolean mask) in every branch.
        col_min = df[col].min()
        col_max = df[col].max()

        # int64 is the fallback whenever no narrower range check succeeds.
        target = np.int64
        for candidate in (np.int8, np.int16, np.int32):
            bounds = np.iinfo(candidate)
            if bounds.min <= col_min and col_max <= bounds.max:
                target = candidate
                break

        df[col] = df[col].astype(target)
            
def change_datatype_float(df):
    """Cast every float column of ``df`` to float32, reassigning the columns on ``df``.

    Columns are picked with ``df.select_dtypes(include=['float'])``.
    Nothing is returned.
    """
    for name in df.select_dtypes(include=['float']).columns.tolist():
        df[name] = df[name].astype(np.float32)

In [None]:
# Memory used by the transactions frame (index included), in MB, before downcasting.
mem = transactions_dat.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

# Both helpers modify the frame in place: integers first, then floats.
change_datatype(transactions_dat)
change_datatype_float(transactions_dat)

# Same measurement after downcasting, for comparison with the figure above.
mem = transactions_dat.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

In [None]:
# Memory used by the members frame (index included), in MB, before downcasting.
mem = member_dat.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

# Both helpers modify the frame in place: integers first, then floats.
change_datatype(member_dat)
change_datatype_float(member_dat)

# Same measurement after downcasting, for comparison with the figure above.
mem = member_dat.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

In [None]:
# Memory used by the user-logs frame (index included), in MB, before downcasting.
mem = user_data.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

# Both helpers modify the frame in place: integers first, then floats.
change_datatype(user_data)
change_datatype_float(user_data)

# Same measurement after downcasting, for comparison with the figure above.
mem = user_data.memory_usage(index=True).sum()
print(mem/ 1024**2," MB")

### Merge data and explore correlation

In [2]:
# Load the training table.
# NOTE(review): this reads from '142A_data/' while the other CSVs in this
# notebook are read from 'unpacked_data/' -- confirm which location holds train.csv.
train_dat = pd.read_csv('142A_data/train.csv')

In [None]:
# Join the training table with each source table separately on the shared
# `msno` key. The system crashes if everything is merged into a single frame,
# so three independent frames are kept instead.
train1 = train_dat.merge(user_data, on='msno')
train2 = train_dat.merge(member_dat, on='msno')
train3 = train_dat.merge(transactions_dat, on='msno')

In [None]:
# Encode gender as '1' (female) / '0' (male).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `train2['gender']` acts on an intermediate Series and, with pandas
# copy-on-write, leaves `train2` itself unchanged.
train2['gender'] = train2['gender'].replace({'female': '1', 'male': '0'})

# Only the last expression of a cell is rendered, so show all three previews
# explicitly. `display` is provided by the IPython/Jupyter kernel.
display(train1.head(), train2.head(), train3.head())

In [None]:
# Pairwise correlations of the numeric columns only. DataFrame.corr raises in
# pandas >= 2.0 when the frame still holds non-numeric columns (presumably the
# `msno` key here), so they are filtered out first.
corr = train1.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Pairwise correlations of the numeric columns only. DataFrame.corr raises in
# pandas >= 2.0 when the frame still holds non-numeric columns (e.g. the
# string-valued `gender` column), so they are filtered out first.
corr = train2.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# Pairwise correlations of the numeric columns only. DataFrame.corr raises in
# pandas >= 2.0 when the frame still holds non-numeric columns (presumably the
# `msno` key here), so they are filtered out first.
corr = train3.select_dtypes(include='number').corr()
corr.style.background_gradient(cmap='coolwarm')

## Logistic Regression Model

In [None]:
# Import package
from sklearn.linear_model import LogisticRegression

In [None]:
# Features: every column from position 2 onward.
X = train1.iloc[:, 2:]
# scikit-learn expects a 1-D target; a single-column DataFrame triggers a
# DataConversionWarning, so the label is selected as a Series.
y = train1['is_churn']
lr = LogisticRegression(random_state=0).fit(X, y)
# Accuracy on the same rows the model was fitted on (not a held-out estimate).
lr.score(X, y) # Output: 0.9442647435443151

In [None]:
# LogisticRegression cannot be fitted on missing values, and this merge is
# suspected to contain nulls, so rows with any missing value are dropped first.
# NOTE(review): if this discards many rows, imputing the gaps may be preferable.
train2_complete = train2.dropna()

# Features: every column from position 2 onward.
X = train2_complete.iloc[:, 2:]
# scikit-learn expects a 1-D target, so the label is selected as a Series.
y = train2_complete['is_churn']
lr = LogisticRegression(random_state=0).fit(X, y)
# Accuracy on the same rows the model was fitted on (not a held-out estimate).
lr.score(X, y)

In [None]:
# Features: every column from position 2 onward.
X = train3.iloc[:, 2:]
# scikit-learn expects a 1-D target; a single-column DataFrame triggers a
# DataConversionWarning, so the label is selected as a Series.
y = train3['is_churn']
lr = LogisticRegression(random_state=0).fit(X, y)
# Accuracy on the same rows the model was fitted on (not a held-out estimate).
lr.score(X, y) # Output: 0.9568365792473885