In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the lead-scoring dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('course_lead_scoring.csv')

In [3]:
# Number of missing values in each column.
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
# Preview the first five rows.
df.head(5)

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In [4]:
# Text-valued feature columns; a missing entry becomes the literal category 'NA'.
categorical = [
    'lead_source',
    'industry',
    'employment_status',
    'location',
]
df[categorical] = df[categorical].fillna(value='NA')

In [5]:
# Numeric feature columns; a missing entry is replaced by 0.
numerical = [
    'number_of_courses_viewed',
    'annual_income',
    'interaction_count',
    'lead_score',
]
df[numerical] = df[numerical].fillna(value=0)

In [8]:
# Most frequent value(s) of the `industry` column; `mode()` returns a Series
# because several values can tie for first place.
df['industry'].mode()

0    retail
Name: industry, dtype: object

In [9]:
# Pairwise correlation matrix of the numerical feature columns.
df[numerical].corr(numeric_only=True)

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


In [6]:
# Hold out 20% of the rows for testing, then take 25% of the remaining 80%
# for validation (0.25 * 0.8 = 0.2), giving a 60/20/20 train/val/test partition.
# The fixed random_state makes both shuffles reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [7]:
# Renumber the rows of every split from 0, discarding the shuffled original index.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [8]:
# Pull the `converted` target out of each split as a NumPy array.
y_train, y_val, y_test = (
    part.converted.values for part in (df_train, df_val, df_test)
)

In [9]:
# Remove the target column in place so it cannot be picked up as a feature.
# Not idempotent: running this cell a second time raises KeyError because the
# column is already gone, and the y_* arrays must have been extracted first.
del df_train['converted']
del df_val['converted']
del df_test['converted']

In [10]:
def mutual_info_churn_score(series):
    """Return the mutual information between ``series`` and the training target.

    The target is read from the module-level ``y_train``, so ``series`` must
    have the same length as that array. Despite "churn" in the name, the
    score is computed against whatever ``y_train`` currently holds.
    """
    score = mutual_info_score(series, y_train)
    return score

In [11]:
# Mutual information between each categorical feature and the training target,
# displayed from the highest score to the lowest.
mi = df_train[categorical].apply(mutual_info_churn_score)
mi.sort_values(ascending=False)

lead_source          0.035396
employment_status    0.012938
industry             0.011575
location             0.004464
dtype: float64

In [12]:
# Turn each row into a {feature: value} record for the vectorizer.
train_dict = df_train[categorical + numerical].to_dict(orient='records')
val_dict = df_val[categorical + numerical].to_dict(orient='records')

# Learn the feature layout on the training records only, then reuse that
# fitted layout to encode the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [13]:
# Logistic regression with the liblinear solver; the seed is fixed so that
# repeated fits give the same coefficients.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)


In [14]:
# Fit the classifier on the vectorized training features.
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [29]:
# Learned weights of the fitted model: one coefficient per column of X_train.
model.coef_

array([[-1.77843867e-05, -1.47154423e-02,  3.39095225e-02,
         2.66248432e-03,  1.15238518e-02, -1.02527697e-01,
        -2.48510995e-02,  4.93604222e-02, -2.01258344e-02,
        -1.34214865e-02, -3.00232200e-03, -9.25991830e-03,
        -3.17957304e-02, -1.60513114e-02,  3.11339155e-01,
         5.12012528e-02,  2.01511698e-02, -1.20346284e-02,
        -1.16021521e-02, -1.15251880e-01,  7.95303436e-02,
        -2.99401329e-02,  3.95843295e-03, -1.14296944e-02,
        -1.12457415e-02, -5.59987025e-03,  8.26402635e-03,
         5.58598769e-03, -3.33967159e-02, -2.52837052e-02,
         4.53752887e-01]])

In [15]:
# Keep only the second column of predict_proba: the predicted probability of
# the second class (label 1 when the target is coded 0/1) for each validation row.
y_pred = model.predict_proba(X_val)[:,1]

In [16]:
# Hard decision at a 0.5 probability threshold. The variable name mentions
# "churn", but the target being predicted in this notebook is `converted`.
churn_decision = (y_pred >= 0.5)

In [17]:
# Fraction of validation rows whose thresholded prediction matches the true label.
accuracy_score(y_val, churn_decision)

0.6996587030716723

In [28]:
# Accuracy of the full model on the *training* data (not the validation set).
# `original_acc` is the baseline that the feature-elimination loop subtracts
# from each reduced model's accuracy.
y_train_pred = model.predict_proba(X_train)[:,1]
churn_train = (y_train_pred >= 0.5)
original_acc = accuracy_score(y_train, churn_train)
original_acc

0.7385844748858448

In [26]:
def feature_eliminate(column, df_train, y_train, categorical, numerical,
                      df_eval=None, y_eval=None):
    """Retrain the logistic regression without one feature and report accuracy.

    Parameters
    ----------
    column : str
        Name of the feature to leave out. It must be a column of ``df_train``
        (otherwise ``DataFrame.drop`` raises ``KeyError``) and may belong to
        either ``categorical`` or ``numerical``.
    df_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training target, aligned row-for-row with ``df_train``.
    categorical, numerical : list of str
        Feature column names. The lists passed in are not modified.
    df_eval, y_eval : optional
        Held-out features and target to score on (for example the validation
        split). When both are omitted the accuracy is computed on the training
        data, which is what this function has always returned.

    Returns
    -------
    float
        Accuracy of the reduced model at a 0.5 probability threshold. The
        value is also printed as ``<column> :  <accuracy>``.

    Raises
    ------
    ValueError
        If only one of ``df_eval`` / ``y_eval`` is given.
    """
    if (df_eval is None) != (y_eval is None):
        raise ValueError('df_eval and y_eval must be provided together')

    # Dropping first makes a misspelled feature name fail loudly.
    df_new_train = df_train.drop(columns=[column])
    features = [name for name in categorical + numerical if name != column]

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_new_train[features].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    if df_eval is None:
        # Default: score on the data the model was fitted on.
        X_eval, y_true = X_train, y_train
    else:
        # Encode the held-out rows with the layout learned from training data.
        X_eval = dv.transform(df_eval[features].to_dict(orient='records'))
        y_true = y_eval

    y_pred = model.predict_proba(X_eval)[:, 1]
    decision = (y_pred >= 0.5)
    acc = accuracy_score(y_true, decision)
    print(column, ': ', acc)
    return acc

In [31]:
# For each candidate feature, retrain without it and record how far the
# accuracy moves from the full-model baseline `original_acc`.
# A positive difference means accuracy went up after removing the feature.
diff_list = []
for column in ['industry', 'employment_status', 'lead_score']:
    acc = feature_eliminate(column, df_train, y_train, categorical, numerical)
    diff = acc - original_acc
    diff_list.append(diff)
    print(diff)

industry :  0.7408675799086758
0.0022831050228310223
employment_status :  0.7351598173515982
-0.003424657534246589
lead_score :  0.7420091324200914
0.003424657534246589


In [33]:
# Difference with the smallest absolute value, i.e. the removal that moved
# accuracy the least. This yields the signed difference, not the feature name.
min(diff_list, key=abs)

0.0022831050228310223

In [None]:
def model_reg(c, df_train, y_train, df_eval=None, y_eval=None):
    """Train a logistic regression with regularization parameter ``c`` and report accuracy.

    Parameters
    ----------
    c : float
        Value passed to ``LogisticRegression`` as ``C``.
    df_train : pandas.DataFrame
        Training features; must contain the eight feature columns listed in
        the function body.
    y_train : array-like
        Training target, aligned row-for-row with ``df_train``.
    df_eval, y_eval : optional
        Held-out features and target to score on (for example the validation
        split). When both are omitted the accuracy is computed on the training
        data, which is what this function has always returned. Pass a held-out
        split when comparing values of ``c``, since training accuracy does not
        measure generalization.

    Returns
    -------
    float
        Accuracy at a 0.5 probability threshold. The value is also printed as
        ``<c> :  <accuracy>``.

    Raises
    ------
    ValueError
        If only one of ``df_eval`` / ``y_eval`` is given.
    """
    if (df_eval is None) != (y_eval is None):
        raise ValueError('df_eval and y_eval must be provided together')

    categorical = ['lead_source', 'industry', 'employment_status', 'location']
    numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']
    features = categorical + numerical

    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(df_train[features].to_dict(orient='records'))

    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    if df_eval is None:
        # Default: score on the data the model was fitted on.
        X_eval, y_true = X_train, y_train
    else:
        # Encode the held-out rows with the layout learned from training data.
        X_eval = dv.transform(df_eval[features].to_dict(orient='records'))
        y_true = y_eval

    y_pred = model.predict_proba(X_eval)[:, 1]
    decision = (y_pred >= 0.5)
    acc = accuracy_score(y_true, decision)
    # Label the line with the value of `c` being tried. The previous code
    # printed `column`, which is not defined in this function.
    print(c, ': ', acc)
    return acc