# 03-classification hw

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [129]:
#data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv'
#!wget $data

In [130]:
df = pd.read_csv("course_lead_scoring.csv")
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


In this dataset our desired target for classification task will be `converted` variable - has the client signed up to the platform or not. 

## Data preparation

* Check if the missing values are presented in the features.
* If there are missing values:
    * For caterogiral features, replace them with 'NA'
    * For numerical features, replace with with 0.0 

In [131]:
#Clean column names
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.columns

Index(['lead_source', 'industry', 'number_of_courses_viewed', 'annual_income',
       'employment_status', 'location', 'interaction_count', 'lead_score',
       'converted'],
      dtype='object')

In [132]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [133]:
categorical = []
numerical = []

for col in df.columns:
    print(col)
    print(df[col].dtype)  # pandas dtype
    check_type = df[col].dtype
    missing = df[col].isnull().sum()
    if check_type == object:
        categorical.append(col)
        if missing > 0: 
            df[col] = df[col].fillna('NA')
            print('Column with missing values and string dtype')
    else:
        numerical.append(col)
        if missing > 0:
            df[col] = df[col].fillna(0)
            print('Column with missing values and numeric dtype')
    print('---')

lead_source
object
Column with missing values and string dtype
---
industry
object
Column with missing values and string dtype
---
number_of_courses_viewed
int64
---
annual_income
float64
Column with missing values and numeric dtype
---
employment_status
object
Column with missing values and string dtype
---
location
object
Column with missing values and string dtype
---
interaction_count
int64
---
lead_score
float64
---
converted
int64
---


In [134]:
df.isnull().sum()

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

#### Question 1

In [135]:
df.industry.value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

#### Question 2
Correlation matrix for the numerical values

In [136]:
corr_matrix = round(df[numerical].corr(),3)
print(corr_matrix)

                          number_of_courses_viewed  annual_income  \
number_of_courses_viewed                     1.000          0.010   
annual_income                                0.010          1.000   
interaction_count                           -0.024          0.027   
lead_score                                  -0.005          0.016   
converted                                    0.436          0.053   

                          interaction_count  lead_score  converted  
number_of_courses_viewed             -0.024      -0.005      0.436  
annual_income                         0.027       0.016      0.053  
interaction_count                     1.000       0.010      0.375  
lead_score                            0.010       1.000      0.194  
converted                             0.375       0.194      1.000  


### Split the data

In [137]:
from sklearn.model_selection import train_test_split

In [138]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [139]:
len(df_train), len(df_val), len(df_test)

(876, 293, 293)

In [140]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [141]:
y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

#### Mutual information score

In [142]:
from sklearn.metrics import mutual_info_score

In [143]:
for col in categorical:
    print(col, mutual_info_score(df_train[col], y_train))

lead_source 0.03539624379726594
industry 0.011574521435657112
employment_status 0.012937677269442782
location 0.004464157884038034


#### One hot encoding

In [144]:
from sklearn.feature_extraction import DictVectorizer

In [145]:
dv = DictVectorizer(sparse=False)
numerical.remove('converted')
train_dict = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dict)

### Model training
#### Question 4

In [146]:
from sklearn.linear_model import LogisticRegression

In [155]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [156]:
y_pred = model.predict(X_val)
accuracy_all = (y_val == y_pred).mean()
accuracy_all

np.float64(0.7064846416382252)

#### Question 5

In [149]:
features = categorical + numerical

for f in features:
    removed = features.copy()
    removed.remove(f)
    train_dict = df_train[removed].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    val_dict = df_val[removed].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    accuracy_removed = (y_val == y_pred).mean()
    print(f'Removed {f}, accuracy difference: {abs(accuracy_all - accuracy_removed)}')

Removed lead_source, accuracy difference: 0.0034129692832765013
Removed industry, accuracy difference: 0.0
Removed employment_status, accuracy difference: 0.0034129692832763903
Removed location, accuracy difference: 0.010238907849829393
Removed number_of_courses_viewed, accuracy difference: 0.14334470989761094
Removed annual_income, accuracy difference: 0.15358361774744034
Removed interaction_count, accuracy difference: 0.14334470989761094
Removed lead_score, accuracy difference: 0.0068259385665528916


#### Question 6

In [150]:
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='lbfgs', C=c, max_iter=1000, random_state=42)
    model.fit(X_train, y_train) 
    y_pred = model.predict(X_val)
    print (c, (y_val == y_pred).mean())


0.01 0.7952218430034129
0.1 0.8088737201365188
1 0.8191126279863481
10 0.825938566552901
100 0.8225255972696246


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
