# Importing the libraries 

In [1]:
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd 

# Load Data

In [2]:
dataset=pd.read_csv('Social_Network_Ads.csv') 
dataset.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000,No
1,15810944,Male,35.0,20000,No
2,15668575,Female,,43000,No
3,15603246,Female,27.0,57000,No
4,15804002,Male,19.0,76000,No


In [3]:
dataset.describe()

Unnamed: 0,User ID,Age,EstimatedSalary
count,400.0,399.0,400.0
mean,15691540.0,37.684211,69742.5
std,71658.32,10.479726,34096.960282
min,15566690.0,18.0,15000.0
25%,15626760.0,30.0,43000.0
50%,15694340.0,37.0,70000.0
75%,15750360.0,46.0,88000.0
max,15815240.0,60.0,150000.0


In [4]:
x = dataset.iloc[:, 1:-1].values  # get all row data expect from the first and last column
y = dataset.iloc[:, -1].values  # get the last column depentant variable data for all rows

In [5]:
print(x[:10,:])

[['Male' 19.0 19000]
 ['Male' 35.0 20000]
 ['Female' nan 43000]
 ['Female' 27.0 57000]
 ['Male' 19.0 76000]
 ['Male' 27.0 58000]
 ['Female' 27.0 84000]
 ['Female' 32.0 150000]
 ['Male' 25.0 33000]
 ['Female' 35.0 65000]]


In [6]:
print(y[:10])

['No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No']


## Replacing missing data with mean 

In [7]:
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan,strategy='mean') 
# fit method will look for missing values and calulate mean for 1st and 2nd columns 
imputer.fit(x[:,1:3]) 
# transform method will replace missign value with calulated value in 1st(age) and 2nd(salary) columns
x[:,1:3]=imputer.transform(x[:,1:3])

In [8]:
print(x[:10,:])

[['Male' 19.0 19000.0]
 ['Male' 35.0 20000.0]
 ['Female' 37.68421052631579 43000.0]
 ['Female' 27.0 57000.0]
 ['Male' 19.0 76000.0]
 ['Male' 27.0 58000.0]
 ['Female' 27.0 84000.0]
 ['Female' 32.0 150000.0]
 ['Male' 25.0 33000.0]
 ['Female' 35.0 65000.0]]


## Encoding categorical data

One hot encoding turn country columns to 3(as there are 3 countires).It created binary vector for each country so that there is no order between these countries. Final output will be - France vector 1,0,0 Spain 0,1,0 and Germany 0,0,1


In [9]:
#encoding independent variable 

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder',OneHotEncoder(),[0])],remainder='passthrough') # Apply tranformation to change country to binary vector  Remainder keep age and salary 
x = np.array(ct.fit_transform(x)) #fitting and transforming together and force output to be numpy array 
print(x[:10,:])

[[0.0 1.0 19.0 19000.0]
 [0.0 1.0 35.0 20000.0]
 [1.0 0.0 37.68421052631579 43000.0]
 [1.0 0.0 27.0 57000.0]
 [0.0 1.0 19.0 76000.0]
 [0.0 1.0 27.0 58000.0]
 [1.0 0.0 27.0 84000.0]
 [1.0 0.0 32.0 150000.0]
 [0.0 1.0 25.0 33000.0]
 [1.0 0.0 35.0 65000.0]]


In [11]:
## Encoding dependent variable 

In [12]:
# use LabelEncoder to replace purchased (dependent variable) with 0 and 1 
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
y= le.fit_transform(y)
print(y[:10])

[0 0 0 0 0 0 0 1 0 0]


# Splitting the dataset into training and test set  

In [13]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state = 0) # func returns train and test data. It takes dataset and then split size test_size =0.3 means 30% data is for test and rest for training and random_state 


In [14]:
print(x_train[:10,:])

[[0.0 1.0 26.0 15000.0]
 [0.0 1.0 60.0 102000.0]
 [1.0 0.0 38.0 112000.0]
 [0.0 1.0 40.0 107000.0]
 [1.0 0.0 42.0 53000.0]
 [0.0 1.0 35.0 59000.0]
 [0.0 1.0 48.0 41000.0]
 [1.0 0.0 48.0 134000.0]
 [1.0 0.0 38.0 113000.0]
 [0.0 1.0 29.0 148000.0]]


In [15]:
print(x_test[:10,:])

[[0.0 1.0 30.0 87000.0]
 [1.0 0.0 38.0 50000.0]
 [0.0 1.0 35.0 75000.0]
 [1.0 0.0 30.0 79000.0]
 [1.0 0.0 35.0 50000.0]
 [0.0 1.0 27.0 20000.0]
 [1.0 0.0 31.0 15000.0]
 [0.0 1.0 36.0 144000.0]
 [1.0 0.0 18.0 68000.0]
 [0.0 1.0 47.0 43000.0]]


In [16]:
print(y_train[:10])

[0 1 0 1 0 0 1 1 1 1]


In [17]:
print(y_test[:10])

[0 0 0 0 0 0 0 1 0 0]


# Feature scaling 

Please note that not all models need Feature scaling to be applied. Apply feature scaling after splitting the dataset. Test set is supposed to be new data set. Feature scaling will get mean and median of the data so its better to apply this after splitting data so to avoid data leakage   

 Main two feature scaling techniques are standardisation and normalisation

 Xstan =(x - mean(x))/(standard deviation(x)) > The resulting output value will be between -3 and +3
 Xnorm =(x - min(x))/(max(x) - min(x)) > The resulting output value will be between 0 and 1

Normalisation is recommended to be used for normal distribution while standardisation works well all the time. So it is better to go for standardisation.
We do not need to apply feature scaling on dummy variables as they are already encoding to 0 or 1 or binary vector so nothing extra to be done by standardisation which will convert the value to between -3 & 3. Also it will make us loose the intepretation of the dummy variable(here country and purchased col) i.e which country correspond to which after standardisation. Many say that applying standardisation on dummy variable may increase model performance but I have never seen anything which proves it.    

In [18]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# take all columns from age. fit computes mean and median of each value. transform apply formula to tranform them to Xstand
x_train[:,3:] = scaler.fit_transform(x_train[:,3:]) 
#Need to apply the same scaler on both train and test set to get same transformation 
x_test[:,3:] = scaler.transform(x_test[:,3:])
print(x_train[:10,:])

[[0.0 1.0 26.0 -1.5849702974485314]
 [0.0 1.0 60.0 0.9309867236544418]
 [1.0 0.0 38.0 1.2201771858501858]
 [0.0 1.0 40.0 1.0755819547523138]
 [1.0 0.0 42.0 -0.486046541104704]
 [0.0 1.0 35.0 -0.3125322637872576]
 [0.0 1.0 48.0 -0.8330750957395968]
 [1.0 0.0 48.0 1.8563962026808227]
 [1.0 0.0 38.0 1.2490962320697603]
 [0.0 1.0 29.0 2.261262849754864]]


In [19]:
print(x_test[:10,:])

[[0.0 1.0 30.0 0.4972010303608257]
 [1.0 0.0 38.0 -0.5728036797634273]
 [0.0 1.0 35.0 0.15017247572593284]
 [1.0 0.0 30.0 0.26584866060423046]
 [1.0 0.0 35.0 -0.5728036797634273]
 [0.0 1.0 27.0 -1.4403750663506594]
 [1.0 0.0 31.0 -1.5849702974485314]
 [0.0 1.0 36.0 2.1455866648765665]
 [1.0 0.0 18.0 -0.05226084781108797]
 [0.0 1.0 47.0 -0.7752370033004481]]


# Build Model 

## Build Logistic Regression Model

In [20]:
from sklearn.linear_model import LogisticRegression
regressor = LogisticRegression(penalty = 'l2', C = 1000, random_state=0)
regressor.fit(x_train,y_train) 

LogisticRegression(C=1000, random_state=0)

In [21]:
#predicting the test set results
y_pred = regressor.predict(x_test)

In [22]:
## Evaluate Model 

In [23]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, confusion_matrix, classification_report
r2_score(y_test,y_pred)

0.5183698672429762

In [24]:
mean_absolute_error(y_test,y_pred)

0.10833333333333334

In [25]:
mean_squared_error(y_test,y_pred)

0.10833333333333334

In [26]:
cm = confusion_matrix(y_test,y_pred)
print(cm)

[[74  5]
 [ 8 33]]


In [27]:
cr = classification_report(y_test,y_pred)
print(cr)

              precision    recall  f1-score   support

           0       0.90      0.94      0.92        79
           1       0.87      0.80      0.84        41

    accuracy                           0.89       120
   macro avg       0.89      0.87      0.88       120
weighted avg       0.89      0.89      0.89       120

