In [1]:
import pandas as pd

In [2]:
# Read the three competition files (training set, test set, sample submission)
# from the local dataset/ folder in a single unpacking assignment.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        r'dataset/train.csv',
        r'dataset/test.csv',
        r'dataset/sample_submission.csv',
    )
)

In [3]:
# Names of the object-dtype (string) columns of train that have fewer than
# 10 distinct values; these are the "low cardinality" features used below.
low_car = [
    col
    for col in train.columns
    if train[col].dtypes == 'O' and train[col].nunique() < 10
]

In [4]:
def make_csv(predict, name, ids=None):
    """Write a submission CSV pairing customer ids with predicted scores.

    Parameters
    ----------
    predict : sequence
        Predicted ``churn_risk_score`` values, one per id and in the same order.
    name : str or path-like
        Destination path of the CSV file.
    ids : sequence, optional
        Customer ids to write in the ``customer_id`` column. When omitted, the
        ``customer_id`` column of the notebook-level ``test`` frame is used,
        so existing two-argument calls behave exactly as before.

    The file is written without the index column.
    """
    if ids is None:
        # Original behaviour: take the ids from the global test frame.
        ids = test['customer_id']

    df = pd.DataFrame()
    df['customer_id'] = ids
    df['churn_risk_score'] = predict

    df.to_csv(name, index=False)

### We will try to predict using only the low-cardinality features

In [5]:
# Independent copies of the low-cardinality columns for each split, so the
# cleaning below does not touch the raw train/test frames.
train_low, test_low = train[low_car].copy(), test[low_car].copy()

In [6]:
# Missing-value count per column of the training features.
train_low.isnull().sum()

gender                             0
region_category                 5428
membership_category                0
joined_through_referral            0
preferred_offer_types            288
medium_of_operation                0
internet_option                    0
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
dtype: int64

In [7]:
# Distinct values (including NaN) present in region_category.
train_low['region_category'].unique()

array(['Village', 'City', 'Town', nan], dtype=object)

In [8]:
# Most frequent region_category in the training features.
train_low['region_category'].mode()

0    Town
dtype: object

In [9]:
# Impute missing region_category with the training mode ("Town").
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on the attribute-accessed Series: that form is
# chained assignment, which warns in pandas 2.x and leaves train_low
# unchanged when Copy-on-Write is enabled.
train_low["region_category"] = train_low["region_category"].fillna("Town")

In [10]:
# Re-check the per-column missing counts after filling region_category.
train_low.isnull().sum()

gender                            0
region_category                   0
membership_category               0
joined_through_referral           0
preferred_offer_types           288
medium_of_operation               0
internet_option                   0
used_special_discount             0
offer_application_preference      0
past_complaint                    0
complaint_status                  0
feedback                          0
dtype: int64

In [11]:
# Distinct values (including NaN) present in preferred_offer_types.
train_low['preferred_offer_types'].unique()

array(['Gift Vouchers/Coupons', 'Credit/Debit Card Offers',
       'Without Offers', nan], dtype=object)

In [12]:
# Most frequent preferred_offer_types in the training features.
train_low['preferred_offer_types'].mode()

0    Gift Vouchers/Coupons
dtype: object

In [13]:
# Impute missing preferred_offer_types with the training mode
# ("Gift Vouchers/Coupons"). Assigned back explicitly because
# fillna(..., inplace=True) on an attribute-accessed column is chained
# assignment and does not update train_low under pandas Copy-on-Write.
train_low["preferred_offer_types"] = train_low["preferred_offer_types"].fillna("Gift Vouchers/Coupons")

In [14]:
# Confirm that no missing values remain in the training features.
train_low.isnull().sum()

gender                          0
region_category                 0
membership_category             0
joined_through_referral         0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
dtype: int64

### Fill the missing values in test_low

In [15]:
# Missing-value count per column of the test features.
test_low.isnull().sum()

gender                             0
region_category                 2948
membership_category                0
joined_through_referral            0
preferred_offer_types            159
medium_of_operation                0
internet_option                    0
used_special_discount              0
offer_application_preference       0
past_complaint                     0
complaint_status                   0
feedback                           0
dtype: int64

In [16]:
# Most frequent region_category in the test features.
test_low['region_category'].mode()

0    Town
dtype: object

In [17]:
# Impute missing region_category in the test features with "Town", the same
# value used for the training features. Assigned back explicitly because
# fillna(..., inplace=True) on an attribute-accessed column is chained
# assignment and does not update test_low under pandas Copy-on-Write.
test_low["region_category"] = test_low["region_category"].fillna("Town")

In [18]:
# Most frequent preferred_offer_types in the test features.
test_low['preferred_offer_types'].mode()

0    Without Offers
dtype: object

In [19]:
# Impute with the TRAINING mode ("Gift Vouchers/Coupons"), not the test-set
# mode ("Without Offers"): the training rows were imputed with the training
# mode, so test rows must go through the same rule for the fitted models to
# see consistently prepared features.
# Assigned back explicitly because fillna(..., inplace=True) on an
# attribute-accessed column is chained assignment and does not update
# test_low under pandas Copy-on-Write.
test_low["preferred_offer_types"] = test_low["preferred_offer_types"].fillna("Gift Vouchers/Coupons")

In [20]:
# Confirm that no missing values remain in the test features.
test_low.isnull().sum()

gender                          0
region_category                 0
membership_category             0
joined_through_referral         0
preferred_offer_types           0
medium_of_operation             0
internet_option                 0
used_special_discount           0
offer_application_preference    0
past_complaint                  0
complaint_status                0
feedback                        0
dtype: int64

In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode into fresh copies so the imputed string frames stay available.
train_low_fin, test_low_fin = train_low.copy(), test_low.copy()

lbl = LabelEncoder()

for col in low_car:
    # Learn the category -> integer mapping from the training column only,
    # then apply that one mapping to both splits. A category that appears
    # only in the test column makes transform() raise.
    lbl.fit(train_low[col])
    train_low_fin[col] = lbl.transform(train_low[col])
    test_low_fin[col] = lbl.transform(test_low[col])

In [22]:
# Display the label-encoded training features.
train_low_fin

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,0,2,3,1,1,0,2,1,1,0,1,4
1,0,0,4,0,1,2,1,1,0,1,2,5
2,0,1,2,2,1,2,2,0,1,1,3,3
3,1,0,2,2,1,2,1,0,1,1,4,3
4,0,0,2,1,0,3,1,0,1,1,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...
36987,0,1,0,1,0,2,2,0,1,1,0,0
36988,0,1,0,1,2,3,2,1,0,0,1,1
36989,0,1,0,2,1,2,2,0,1,1,4,3
36990,1,2,3,1,1,3,1,1,1,0,1,0


In [23]:
# Display the label-encoded test features.
test_low_fin

Unnamed: 0,gender,region_category,membership_category,joined_through_referral,preferred_offer_types,medium_of_operation,internet_option,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,0,2,4,1,2,3,2,1,0,0,1,2
1,1,2,1,1,2,2,0,1,0,0,1,3
2,0,1,5,1,1,1,1,1,0,1,3,0
3,1,1,5,2,0,0,0,1,0,1,0,7
4,0,1,2,2,2,3,1,0,1,1,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...
19914,1,1,1,1,2,2,0,0,1,0,1,2
19915,0,2,2,2,2,2,0,0,1,0,1,0
19916,1,1,2,1,0,2,1,0,1,1,4,0
19917,0,0,5,2,0,3,0,0,1,0,1,3


In [24]:
# Target vector: an independent copy of the churn_risk_score column of train.
y = train['churn_risk_score'].copy()

In [25]:
# Display the target series.
y

0        2
1        1
2        5
3        5
4        5
        ..
36987    4
36988    5
36989    4
36990    3
36991    2
Name: churn_risk_score, Length: 36992, dtype: int64

### try model

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest regressor (fixed seed) on the label-encoded
# low-cardinality features; the integer churn score is used as a continuous
# regression target.
# NOTE(review): y contains -1 values (see the value_counts cell further down);
# confirm whether -1 is a real class or a sentinel that should be excluded.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(train_low_fin, y)

RandomForestRegressor(random_state=42)

In [27]:
# Continuous score predictions of the random forest for the encoded test features.
low_pred = rf.predict(test_low_fin)

In [28]:
# Display the raw (unrounded) predictions.
low_pred

array([2.91      , 3.26354762, 2.16571429, ..., 4.77      , 3.70516667,
       2.62      ])

In [29]:
# Quick check of how the built-in round() turns a float into an integer score,
# first on a literal and then on the first model prediction.
# NOTE: for Python floats, round() sends exact ties to the nearest even
# integer (round(2.5) == 2), so it is not "round half up".
a = 2.51
print(round(a))
print(round(low_pred[0]))

3
3


In [30]:
# Round every continuous prediction to the nearest integer score.
pred1 = [round(score) for score in low_pred]

In [31]:
#make_csv(pred1, "submission1.csv")

In [32]:
# Load the first submission back for inspection. The cell that writes
# 'submission1.csv' is commented out, so on a clean checkout the file may not
# exist; in that case write it from pred1 first instead of failing.
# NOTE(review): an existing file is read as-is and may predate the current pred1.
try:
    first_sub = pd.read_csv('submission1.csv')
except FileNotFoundError:
    make_csv(pred1, 'submission1.csv')
    first_sub = pd.read_csv('submission1.csv')

In [33]:
# Display the submission that was read back from disk.
first_sub

Unnamed: 0,customer_id,churn_risk_score
0,fffe43004900440031003700300030003400,3
1,fffe43004900440031003900370037003300,3
2,fffe43004900440034003800360037003000,2
3,fffe43004900440036003200370033003400,4
4,fffe43004900440035003000370031003900,4
...,...,...
19914,fffe43004900440035003600330037003800,4
19915,fffe43004900440032003900370037003100,5
19916,fffe43004900440036003100310036003700,5
19917,fffe43004900440034003200330033003600,4


In [34]:
# Display the raw test frame.
# NOTE(review): the rendered output includes a Name column; consider showing
# test.head() without it before sharing the notebook.
test

Unnamed: 0,customer_id,Name,age,gender,security_no,region_category,membership_category,joining_date,joined_through_referral,referral_id,...,days_since_last_login,avg_time_spent,avg_transaction_value,avg_frequency_login_days,points_in_wallet,used_special_discount,offer_application_preference,past_complaint,complaint_status,feedback
0,fffe43004900440031003700300030003400,Alethia Meints,50,F,OQJ1XAY,Village,Premium Membership,2015-11-02,No,xxxxxxxx,...,12,386.26,40721.44,7.0,733.830000,Yes,No,No,Not Applicable,Poor Product Quality
1,fffe43004900440031003900370037003300,Ming Lopez,41,M,OUQRPKO,Village,Gold Membership,2016-03-01,No,xxxxxxxx,...,11,37.80,9644.40,9.0,726.000000,Yes,No,No,Not Applicable,Poor Website
2,fffe43004900440034003800360037003000,Carina Flannigan,31,F,02J2RE7,Town,Silver Membership,2017-03-03,No,xxxxxxxx,...,18,215.36,3693.25,21.0,713.780000,Yes,No,Yes,Solved in Follow-up,No reason specified
3,fffe43004900440036003200370033003400,Kyung Wanner,64,M,5YEQIF1,Town,Silver Membership,2017-08-18,Yes,CID8941,...,-999,44.57,36809.56,11.0,744.970000,Yes,No,Yes,No Information Available,Too many ads
4,fffe43004900440035003000370031003900,Enola Gatto,16,F,100RYB5,Town,No Membership,2015-05-05,Yes,CID5690,...,6,349.88,40675.86,8.0,299.048351,No,Yes,Yes,Solved in Follow-up,Poor Website
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19914,fffe43004900440035003600330037003800,Kraig Peele,12,M,2V0HA0O,,Gold Membership,2015-01-25,No,xxxxxxxx,...,16,103.57,46279.35,18.0,708.120000,No,Yes,No,Not Applicable,Poor Product Quality
19915,fffe43004900440032003900370037003100,Damaris Sabol,40,F,VJGQD6Q,Village,No Membership,2017-12-31,Yes,CID45490,...,21,63.19,23466.26,Error,574.340000,No,Yes,No,Not Applicable,No reason specified
19916,fffe43004900440036003100310036003700,Loura Huckstep,55,M,ADE7LWA,Town,No Membership,2015-09-09,No,xxxxxxxx,...,18,68.72,17903.75,24.0,564.300000,No,Yes,Yes,Unsolved,No reason specified
19917,fffe43004900440034003200330033003600,Sharita Clubb,17,F,A35KUBS,City,Silver Membership,2016-04-17,Yes,CID37167,...,3,119.54,14057.09,22.0,606.340000,No,Yes,No,Not Applicable,Poor Website


In [36]:
# How often each churn_risk_score value occurs in the training target.
train['churn_risk_score'].value_counts()

 3    10424
 4    10185
 5     9827
 2     2741
 1     2652
-1     1163
Name: churn_risk_score, dtype: int64

In [37]:
# How often each churn_risk_score value occurs in the first submission,
# for comparison with the training distribution.
first_sub['churn_risk_score'].value_counts()

3    6643
4    5407
5    4147
1    2124
2    1530
0      68
Name: churn_risk_score, dtype: int64

## Try a neural network

In [42]:
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron classifier with two hidden layers (5 and 2 units)
# and a fixed seed; it is only constructed here and fitted in a later cell.
clf = MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1)

In [40]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training features only, then apply
# the same fitted scaler to both splits.
scl = StandardScaler()
scl.fit(train_low_fin)

train_low_scl, test_low_scl = scl.transform(train_low_fin), scl.transform(test_low_fin)

In [43]:
# Train the MLP classifier on the standardized training features.
clf.fit(train_low_scl, y)

MLPClassifier(hidden_layer_sizes=(5, 2), random_state=1)

In [45]:
import joblib as jb
# File name used to persist the fitted MLP classifier.
name = 'nn_clf.sav'
# The dump is disabled, so this cell does not (re)write the file; the load in
# a later cell relies on a file saved by an earlier run.
#jb.dump(clf, name)

['nn_clf.sav']

In [46]:
# Reload the persisted network. The jb.dump call that creates the file is
# commented out, so on a clean checkout the file may be missing; in that case
# persist the freshly fitted clf and use it directly instead of failing.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts produced by this notebook.
# NOTE(review): an existing file is loaded as-is and may predate the current clf.
try:
    nn_clf = jb.load(name)
except FileNotFoundError:
    jb.dump(clf, name)
    nn_clf = clf

In [47]:
# Class predictions of the reloaded MLP for the standardized test features.
pred2 = nn_clf.predict(test_low_scl)

In [48]:
# Display the MLP predictions.
pred2

array([3, 4, 3, ..., 5, 3, 2], dtype=int64)

In [49]:
#make_csv(pred2, 'submission2.csv')

### try random forest with scaled data

In [50]:
# Same random forest configuration as rf (100 trees, seed 42), this time
# fitted on the standardized features instead of the raw label codes.
rf2 = RandomForestRegressor(n_estimators=100, random_state=42)
rf2.fit(train_low_scl, y)

RandomForestRegressor(random_state=42)

In [51]:
# Continuous score predictions of rf2 for the standardized test features.
pred3 = rf2.predict(test_low_scl)

In [52]:
# Display the raw (unrounded) rf2 predictions.
pred3

array([2.91      , 3.26354762, 2.39238095, ..., 4.77      , 3.70516667,
       2.62      ])

In [53]:
# Round every rf2 prediction to the nearest integer score.
pred31 = [round(score) for score in pred3]

In [55]:
#make_csv(pred31, 'submission3.csv')

### Try SGD Classification

In [56]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with stochastic gradient descent.
# random_state is fixed so repeated runs give the same model, matching the
# seeded random forest and MLP models elsewhere in this notebook; without it
# the data shuffling differs from run to run.
sgd = SGDClassifier(max_iter=1000, random_state=42)

In [58]:
# Train the SGD classifier on the standardized training features.
sgd.fit(train_low_scl, y)

SGDClassifier()

In [59]:
# Class predictions of the SGD classifier for the standardized test features.
pred4 = sgd.predict(test_low_scl)

In [60]:
# Display the SGD classifier predictions.
pred4

array([3, 5, 3, ..., 3, 3, 1], dtype=int64)

In [61]:
#make_csv(pred4, 'submission4.csv')