In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
# Load the course lead-scoring dataset from its raw GitHub URL.
DATA_URL = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/course_lead_scoring.csv"
df = pd.read_csv(DATA_URL)

# Preview the first rows to sanity-check the parsed columns.
df.head()

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.8,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1


### Data preparation
##### Check whether there are missing values in the features.
##### If there are missing values:
##### For categorical features, replace them with 'NA'
##### For numerical features, replace them with 0.0

In [3]:
# Count the missing values in each column before any imputation.
df.isna().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [4]:
# Inspect the dtype of every column (object columns vs. numeric columns).
df.dtypes


lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

In [5]:
# Split the columns by dtype so each group gets the right placeholder.
# 'string' is listed next to 'object' so that text columns are still selected
# when pandas stores them with its dedicated string dtype instead of object.
categorical_cols = df.select_dtypes(include=['object', 'string']).columns
# NOTE: 'number' also matches the integer target column `converted`.
numerical_cols = df.select_dtypes(include=['number']).columns

# Impute: 'NA' for missing categorical values, 0.0 for missing numerical ones.
df[categorical_cols] = df[categorical_cols].fillna('NA')
df[numerical_cols] = df[numerical_cols].fillna(0.0)

In [6]:
# Re-count missing values per column after the fill step above.
df.isna().sum()


lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64

In [7]:
# Re-inspect the column dtypes after filling the missing values.
df.dtypes

lead_source                  object
industry                     object
number_of_courses_viewed      int64
annual_income               float64
employment_status            object
location                     object
interaction_count             int64
lead_score                  float64
converted                     int64
dtype: object

##### What is the most frequent observation (mode) for the column industry?

In [8]:
# mode() returns a Series of the most frequent value(s); take its first entry.
industry_modes = df['industry'].mode()
industry_modes.iloc[0]

'retail'

#### Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

##### What are the two features that have the biggest correlation?

##### interaction_count and lead_score
##### number_of_courses_viewed and lead_score
##### number_of_courses_viewed and interaction_count
##### annual_income and interaction_count
##### Only consider the pairs above when answering this question.

In [9]:
# Pairwise correlation between the numerical features only: the target
# column is dropped first and non-numeric columns are skipped by corr().
correlation_matrix = (
    df
    .drop('converted', axis=1)
    .corr(numeric_only=True)
)
correlation_matrix

Unnamed: 0,number_of_courses_viewed,annual_income,interaction_count,lead_score
number_of_courses_viewed,1.0,0.00977,-0.023565,-0.004879
annual_income,0.00977,1.0,0.027036,0.01561
interaction_count,-0.023565,0.027036,1.0,0.009888
lead_score,-0.004879,0.01561,0.009888,1.0


#### Split the data
##### Split your data in train/val/test sets with 60%/20%/20% distribution.
##### Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
##### Make sure that the target value y is not in your dataframe.

In [10]:
SEED = 42  # one seed shared by both splits so the partition is reproducible

# Step 1: hold out 40% of the rows; the remaining 60% is the training set.
df_train, df_temp = train_test_split(df, test_size=0.4, random_state=SEED)

# Step 2: halve the hold-out into validation and test (20% of the data each).
df_val, df_test = train_test_split(df_temp, test_size=0.5, random_state=SEED)



In [12]:
# Row counts of the train / validation / test partitions, in that order.
tuple(len(part) for part in (df_train, df_val, df_test))


(877, 292, 293)

In [13]:
# Target vector for each split. The values are looked up in the original `df`
# through each split's row index, so this cell stays re-runnable even after
# `converted` has been removed from the split frames (reading
# df_train['converted'] at that point would raise KeyError).
# Assumes `df` still carries the unique row index it had when it was split.
y_train = df.loc[df_train.index, 'converted']
y_val = df.loc[df_val.index, 'converted']
y_test = df.loc[df_test.index, 'converted']


In [None]:
# Remove the target from the feature dataframes.
# drop(..., errors='ignore') keeps this cell safe to re-run: a second
# `del df_train['converted']` would raise KeyError because the column is
# already gone.
df_train = df_train.drop(columns=['converted'], errors='ignore')
df_val = df_val.drop(columns=['converted'], errors='ignore')
df_test = df_test.drop(columns=['converted'], errors='ignore')

# The target must not be present in any of the feature frames.
assert all(
    'converted' not in part.columns for part in (df_train, df_val, df_test)
), "target column still present in a feature dataframe"
